/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 */

#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
33
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#include "internal.h"

#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)	(0)
#endif

#ifndef arch_rebalance_pgtables
#define arch_rebalance_pgtables(addr, len)		(addr)
#endif

static void unmap_region(struct mm_struct *mm,
		struct vm_area_struct *vma, struct vm_area_struct *prev,
		unsigned long start, unsigned long end);

/* description of the effects of mapping type and prot in the current
 * implementation.  This is due to the limited x86 page protection hardware.
 * The expected behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *		
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 */
pgprot_t protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
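
/*
 * Example (a sketch): the table above is indexed by the low four vm_flags
 * bits, VM_SHARED|VM_EXEC|VM_WRITE|VM_READ.  A MAP_PRIVATE
 * PROT_READ|PROT_WRITE mapping has VM_READ|VM_WRITE set and VM_SHARED
 * clear, so it resolves to protection_map[0x3] == __P011: readable but
 * write-protected, so that the first write faults and triggers
 * copy-on-write, matching the "w: (copy) copy" entry above.
 */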

pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
	return __pgprot(pgprot_val(protection_map[vm_flags &
				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
			pgprot_val(arch_vm_get_page_prot(vm_flags)));
}
EXPORT_SYMBOL(vm_get_page_prot);

int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50;	/* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
/*
 * Make sure vm_committed_as is in one cacheline and is not shared with
 * other variables; it can be updated frequently by several CPUs.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system is a metric that can
 * be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across the competing virtual machines it hosts.
 * Several metrics drive this policy engine, including the guest-reported
 * memory commitment.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_read_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	unsigned long free, allowed;

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		free = global_page_state(NR_FREE_PAGES);
		free += global_page_state(NR_FILE_PAGES);

		/*
		 * shmem pages shouldn't be counted as free in this
		 * case, they can't be purged, only swapped out, and
		 * that won't affect the overall amount of available
		 * memory in the system.
		 */
		free -= global_page_state(NR_SHMEM);

		free += nr_swap_pages;

		/*
		 * Any slabs which are created with the
		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
		 * which are reclaimable, under pressure.  The dentry
		 * cache and most inode caches should fall into this category.
		 */
		free += global_page_state(NR_SLAB_RECLAIMABLE);

		/*
		 * Leave out reserved pages: they are not available for
		 * anonymous mappings.
		 */
		if (free <= totalreserve_pages)
			goto error;
		else
			free -= totalreserve_pages;

		/*
		 * Leave the last 3% for root
		 */
		if (!cap_sys_admin)
			free -= free / 32;

		if (free > pages)
			return 0;

		goto error;
	}

	allowed = (totalram_pages - hugetlb_total_pages())
		* sysctl_overcommit_ratio / 100;
	/*
	 * Leave the last 3% for root
	 */
	if (!cap_sys_admin)
		allowed -= allowed / 32;
	allowed += total_swap_pages;

	/* Don't let a single process grow too big:
	   leave 3% of the size of this process for other processes */
	if (mm)
		allowed -= mm->total_vm / 32;

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	vm_unacct_memory(pages);

	return -ENOMEM;
}
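
/*
 * Worked example of the OVERCOMMIT_NEVER path above (a sketch with
 * made-up numbers): with totalram_pages = 1000000, no hugetlb pages, the
 * default sysctl_overcommit_ratio of 50 and total_swap_pages = 250000, a
 * non-root process gets
 *
 *	allowed  = 1000000 * 50 / 100		-> 500000
 *	allowed -= allowed / 32			-> 484375 (~3% root reserve)
 *	allowed += 250000			-> 734375 pages
 *
 * minus mm->total_vm / 32 for the calling process; an allocation succeeds
 * only while vm_committed_as stays below that figure.
 */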

/*
 * Requires inode->i_mapping->i_mmap_mutex
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
		struct file *file, struct address_space *mapping)
{
	if (vma->vm_flags & VM_DENYWRITE)
		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
	if (vma->vm_flags & VM_SHARED)
		mapping->i_mmap_writable--;

	flush_dcache_mmap_lock(mapping);
	if (unlikely(vma->vm_flags & VM_NONLINEAR))
		list_del_init(&vma->shared.nonlinear);
	else
		vma_interval_tree_remove(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
}

/*
 * Unlink a file-based vm structure from its interval tree, to hide
 * vma from rmap and vmtruncate before freeing its page tables.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;

	if (file) {
		struct address_space *mapping = file->f_mapping;
		mutex_lock(&mapping->i_mmap_mutex);
		__remove_shared_vm_struct(vma, file, mapping);
		mutex_unlock(&mapping->i_mmap_mutex);
	}
}

/*
 * Close a vm structure and free it, returning the next.
 */
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *next = vma->vm_next;

	might_sleep();
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file)
		fput(vma->vm_file);
	mpol_put(vma_policy(vma));
	kmem_cache_free(vm_area_cachep, vma);
	return next;
}

static unsigned long do_brk(unsigned long addr, unsigned long len);

SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	unsigned long rlim, retval;
	unsigned long newbrk, oldbrk;
	struct mm_struct *mm = current->mm;
	unsigned long min_brk;

	down_write(&mm->mmap_sem);

#ifdef CONFIG_COMPAT_BRK
	/*
	 * CONFIG_COMPAT_BRK can still be overridden by setting
	 * randomize_va_space to 2, which will still cause mm->start_brk
	 * to be arbitrarily shifted
	 */
	if (current->brk_randomized)
		min_brk = mm->start_brk;
	else
		min_brk = mm->end_data;
#else
	min_brk = mm->start_brk;
#endif
	if (brk < min_brk)
		goto out;

	/*
	 * Check against rlimit here. If this check is done later after the test
	 * of oldbrk with newbrk then it can escape the test and let the data
	 * segment grow beyond its set limit in the case where the limit is
	 * not page aligned. -Ram Gupta
	 */
	rlim = rlimit(RLIMIT_DATA);
	if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
			(mm->end_data - mm->start_data) > rlim)
		goto out;

	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk)
		goto set_brk;

	/* Always allow shrinking brk. */
	if (brk <= mm->brk) {
		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
			goto set_brk;
		goto out;
	}

	/* Check against existing mmap mappings. */
	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
		goto out;

	/* Ok, looks good - let it rip. */
	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
		goto out;
set_brk:
	mm->brk = brk;
out:
	retval = mm->brk;
	up_write(&mm->mmap_sem);
	return retval;
}
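
/*
 * Example of the alignment handling above (a sketch): with 4KB pages and
 * mm->brk = 0x804a123, a brk(0x804a456) call sees
 * PAGE_ALIGN(oldbrk) == PAGE_ALIGN(newbrk) == 0x804b000, so no mapping is
 * created or resized and only mm->brk records the new value.  Growth past
 * a page boundary goes through do_brk(), shrinking through do_munmap().
 */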

static long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
	unsigned long max, subtree_gap;
	max = vma->vm_start;
	if (vma->vm_prev)
		max -= vma->vm_prev->vm_end;
	if (vma->vm_rb.rb_left) {
		subtree_gap = rb_entry(vma->vm_rb.rb_left,
				struct vm_area_struct, vm_rb)->rb_subtree_gap;
		if (subtree_gap > max)
			max = subtree_gap;
	}
	if (vma->vm_rb.rb_right) {
		subtree_gap = rb_entry(vma->vm_rb.rb_right,
				struct vm_area_struct, vm_rb)->rb_subtree_gap;
		if (subtree_gap > max)
			max = subtree_gap;
	}
	return max;
}
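
/*
 * Example (a sketch): for a vma spanning [0x8000, 0x9000) whose
 * predecessor ends at 0x5000, the local gap is 0x8000 - 0x5000 = 0x3000.
 * The result is the maximum of that local gap and the cached
 * rb_subtree_gap of both rbtree children, i.e. the largest free gap
 * found anywhere in this vma's subtree.
 */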

#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct rb_root *root)
{
	int i = 0, j, bug = 0;
	struct rb_node *nd, *pn = NULL;
	unsigned long prev = 0, pend = 0;

	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
		struct vm_area_struct *vma;
		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
		if (vma->vm_start < prev) {
			printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
			bug = 1;
		}
		if (vma->vm_start < pend) {
			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
			bug = 1;
		}
		if (vma->vm_start > vma->vm_end) {
			printk("vm_end %lx < vm_start %lx\n",
				vma->vm_end, vma->vm_start);
			bug = 1;
		}
		if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
			printk("free gap %lx, correct %lx\n",
			       vma->rb_subtree_gap,
			       vma_compute_subtree_gap(vma));
			bug = 1;
		}
		i++;
		pn = nd;
		prev = vma->vm_start;
		pend = vma->vm_end;
	}
	j = 0;
	for (nd = pn; nd; nd = rb_prev(nd))
		j++;
	if (i != j) {
		printk("backwards %d, forwards %d\n", j, i);
		bug = 1;
	}
	return bug ? -1 : i;
}

static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
{
	struct rb_node *nd;

	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
		struct vm_area_struct *vma;
		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
		BUG_ON(vma != ignore &&
		       vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
	}
}

void validate_mm(struct mm_struct *mm)
{
	int bug = 0;
	int i = 0;
	unsigned long highest_address = 0;
	struct vm_area_struct *vma = mm->mmap;
	while (vma) {
		struct anon_vma_chain *avc;
		vma_lock_anon_vma(vma);
		list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
			anon_vma_interval_tree_verify(avc);
		vma_unlock_anon_vma(vma);
		highest_address = vma->vm_end;
		vma = vma->vm_next;
		i++;
	}
	if (i != mm->map_count) {
		printk("map_count %d vm_next %d\n", mm->map_count, i);
		bug = 1;
	}
	if (highest_address != mm->highest_vm_end) {
		printk("mm->highest_vm_end %lx, found %lx\n",
		       mm->highest_vm_end, highest_address);
		bug = 1;
	}
	i = browse_rb(&mm->mm_rb);
	if (i != mm->map_count) {
		printk("map_count %d rb %d\n", mm->map_count, i);
		bug = 1;
	}
	BUG_ON(bug);
}
#else
#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif

RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
		     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)

/*
 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
 * vma->vm_prev->vm_end values changed, without modifying the vma's position
 * in the rbtree.
 */
static void vma_gap_update(struct vm_area_struct *vma)
{
	/*
	 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
	 * function that does exactly what we want.
	 */
	vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}

static inline void vma_rb_insert(struct vm_area_struct *vma,
				 struct rb_root *root)
{
	/* All rb_subtree_gap values must be consistent prior to insertion */
	validate_mm_rb(root, NULL);

	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}

static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
{
	/*
	 * All rb_subtree_gap values must be consistent prior to erase,
	 * with the possible exception of the vma being erased.
	 */
	validate_mm_rb(root, vma);

	/*
	 * Note rb_erase_augmented is a fairly large inline function,
	 * so make sure we instantiate it only once with our desired
	 * augmented rbtree callbacks.
	 */
	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
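
/*
 * Usage note (a sketch of the conventions in this file): when a vma's
 * vm_start or vm_end changes without moving it within the rbtree, callers
 * such as vma_adjust() only need vma_gap_update(); the erase/insert pair
 * above is for a vma actually entering or leaving the tree, as in
 * __vma_link_rb() and __vma_unlink().
 */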

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_sem and by
 * the root anon_vma's mutex.
 */
static inline void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

static inline void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
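
/*
 * Typical usage of the two helpers above (a sketch mirroring what
 * vma_adjust() does below):
 *
 *	anon_vma_lock_write(anon_vma);
 *	anon_vma_interval_tree_pre_update_vma(vma);
 *	vma->vm_start = new_start;	(and/or vm_end, vm_pgoff)
 *	anon_vma_interval_tree_post_update_vma(vma);
 *	anon_vma_unlock(anon_vma);
 *
 * The remove/reinsert pair is needed because the interval tree is keyed
 * on exactly the fields being modified.
 */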

static int find_vma_links(struct mm_struct *mm, unsigned long addr,
		unsigned long end, struct vm_area_struct **pprev,
		struct rb_node ***rb_link, struct rb_node **rb_parent)
{
	struct rb_node **__rb_link, *__rb_parent, *rb_prev;

	__rb_link = &mm->mm_rb.rb_node;
	rb_prev = __rb_parent = NULL;

	while (*__rb_link) {
		struct vm_area_struct *vma_tmp;

		__rb_parent = *__rb_link;
		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);

		if (vma_tmp->vm_end > addr) {
			/* Fail if an existing vma overlaps the area */
			if (vma_tmp->vm_start < end)
				return -ENOMEM;
			__rb_link = &__rb_parent->rb_left;
		} else {
			rb_prev = __rb_parent;
			__rb_link = &__rb_parent->rb_right;
		}
	}

	*pprev = NULL;
	if (rb_prev)
		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
	*rb_link = __rb_link;
	*rb_parent = __rb_parent;
	return 0;
}
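
/*
 * Usage sketch for find_vma_links(): a caller mapping [addr, addr+len)
 * first looks up the insertion point, handling any overlap:
 *
 *	struct vm_area_struct *prev;
 *	struct rb_node **rb_link, *rb_parent;
 *
 *	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
 *		return -ENOMEM;		(range already partially mapped)
 *	vma_link(mm, vma, prev, rb_link, rb_parent);
 *
 * This is a sketch only; callers in this file such as mmap_region()
 * instead unmap the overlapping range and retry the lookup.
 */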

void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
		struct rb_node **rb_link, struct rb_node *rb_parent)
{
	/* Update tracking information for the gap following the new vma. */
	if (vma->vm_next)
		vma_gap_update(vma->vm_next);
	else
		mm->highest_vm_end = vma->vm_end;

	/*
	 * vma->vm_prev wasn't known when we followed the rbtree to find the
	 * correct insertion point for that vma. As a result, we could not
	 * update the rb_subtree_gap values of the vma's vm_rb parents on the
	 * way down.
	 * So, we first insert the vma with a zero rb_subtree_gap value
	 * (to be consistent with what we did on the way down), and then
	 * immediately update the gap to the correct value. Finally we
	 * rebalance the rbtree after all augmented values have been set.
	 */
	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
	vma->rb_subtree_gap = 0;
	vma_gap_update(vma);
	vma_rb_insert(vma, &mm->mm_rb);
}

static void __vma_link_file(struct vm_area_struct *vma)
{
	struct file *file;

	file = vma->vm_file;
	if (file) {
		struct address_space *mapping = file->f_mapping;

		if (vma->vm_flags & VM_DENYWRITE)
			atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
		if (vma->vm_flags & VM_SHARED)
			mapping->i_mmap_writable++;

		flush_dcache_mmap_lock(mapping);
		if (unlikely(vma->vm_flags & VM_NONLINEAR))
			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
		else
			vma_interval_tree_insert(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
	}
}

static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, struct rb_node **rb_link,
	struct rb_node *rb_parent)
{
	__vma_link_list(mm, vma, prev, rb_parent);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
}

static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
			struct vm_area_struct *prev, struct rb_node **rb_link,
			struct rb_node *rb_parent)
{
	struct address_space *mapping = NULL;

	if (vma->vm_file)
		mapping = vma->vm_file->f_mapping;

	if (mapping)
		mutex_lock(&mapping->i_mmap_mutex);

	__vma_link(mm, vma, prev, rb_link, rb_parent);
	__vma_link_file(vma);

	if (mapping)
		mutex_unlock(&mapping->i_mmap_mutex);

	mm->map_count++;
	validate_mm(mm);
}

/*
 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
 * mm's list and rbtree.  It has already been inserted into the interval tree.
 */
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *prev;
	struct rb_node **rb_link, *rb_parent;

	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
			   &prev, &rb_link, &rb_parent))
		BUG();
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	mm->map_count++;
}

static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
		struct vm_area_struct *prev)
{
	struct vm_area_struct *next;

	vma_rb_erase(vma, &mm->mm_rb);
	prev->vm_next = next = vma->vm_next;
	if (next)
		next->vm_prev = prev;
	if (mm->mmap_cache == vma)
		mm->mmap_cache = prev;
}

/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 */
int vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *next = vma->vm_next;
	struct vm_area_struct *importer = NULL;
	struct address_space *mapping = NULL;
	struct rb_root *root = NULL;
	struct anon_vma *anon_vma = NULL;
	struct file *file = vma->vm_file;
	bool start_changed = false, end_changed = false;
	long adjust_next = 0;
	int remove_next = 0;

	if (next && !insert) {
		struct vm_area_struct *exporter = NULL;

		if (end >= next->vm_end) {
			/*
			 * vma expands, overlapping all the next, and
			 * perhaps the one after too (mprotect case 6).
			 */
again:			remove_next = 1 + (end > next->vm_end);
			end = next->vm_end;
			exporter = next;
			importer = vma;
		} else if (end > next->vm_start) {
			/*
			 * vma expands, overlapping part of the next:
			 * mprotect case 5 shifting the boundary up.
			 */
			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
			exporter = next;
			importer = vma;
		} else if (end < vma->vm_end) {
			/*
			 * vma shrinks, and !insert tells it's not
			 * split_vma inserting another: so it must be
			 * mprotect case 4 shifting the boundary down.
			 */
			adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
			exporter = vma;
			importer = next;
		}

		/*
		 * Easily overlooked: when mprotect shifts the boundary,
		 * make sure the expanding vma has anon_vma set if the
		 * shrinking vma had, to cover any anon pages imported.
		 */
		if (exporter && exporter->anon_vma && !importer->anon_vma) {
			if (anon_vma_clone(importer, exporter))
				return -ENOMEM;
			importer->anon_vma = exporter->anon_vma;
		}
	}

	if (file) {
		mapping = file->f_mapping;
		if (!(vma->vm_flags & VM_NONLINEAR)) {
			root = &mapping->i_mmap;
			uprobe_munmap(vma, vma->vm_start, vma->vm_end);

			if (adjust_next)
				uprobe_munmap(next, next->vm_start,
							next->vm_end);
		}

		mutex_lock(&mapping->i_mmap_mutex);
		if (insert) {
			/*
			 * Put into interval tree now, so instantiated pages
			 * are visible to arm/parisc __flush_dcache_page
			 * throughout; but we cannot insert into address
			 * space until vma start or end is updated.
			 */
			__vma_link_file(insert);
		}
	}

	vma_adjust_trans_huge(vma, start, end, adjust_next);

	anon_vma = vma->anon_vma;
	if (!anon_vma && adjust_next)
		anon_vma = next->anon_vma;
	if (anon_vma) {
		VM_BUG_ON(adjust_next && next->anon_vma &&
			  anon_vma != next->anon_vma);
		anon_vma_lock_write(anon_vma);
		anon_vma_interval_tree_pre_update_vma(vma);
		if (adjust_next)
			anon_vma_interval_tree_pre_update_vma(next);
	}

	if (root) {
		flush_dcache_mmap_lock(mapping);
		vma_interval_tree_remove(vma, root);
		if (adjust_next)
			vma_interval_tree_remove(next, root);
	}

	if (start != vma->vm_start) {
		vma->vm_start = start;
		start_changed = true;
	}
	if (end != vma->vm_end) {
		vma->vm_end = end;
		end_changed = true;
	}
	vma->vm_pgoff = pgoff;
	if (adjust_next) {
		next->vm_start += adjust_next << PAGE_SHIFT;
		next->vm_pgoff += adjust_next;
	}

	if (root) {
		if (adjust_next)
			vma_interval_tree_insert(next, root);
		vma_interval_tree_insert(vma, root);
		flush_dcache_mmap_unlock(mapping);
	}

	if (remove_next) {
		/*
		 * vma_merge has merged next into vma, and needs
		 * us to remove next before dropping the locks.
		 */
		__vma_unlink(mm, next, vma);
		if (file)
			__remove_shared_vm_struct(next, file, mapping);
	} else if (insert) {
		/*
		 * split_vma has split insert from vma, and needs
		 * us to insert it before dropping the locks
		 * (it may either follow vma or precede it).
		 */
		__insert_vm_struct(mm, insert);
	} else {
		if (start_changed)
			vma_gap_update(vma);
		if (end_changed) {
			if (!next)
				mm->highest_vm_end = end;
			else if (!adjust_next)
				vma_gap_update(next);
		}
	}

	if (anon_vma) {
		anon_vma_interval_tree_post_update_vma(vma);
		if (adjust_next)
			anon_vma_interval_tree_post_update_vma(next);
		anon_vma_unlock(anon_vma);
	}
	if (mapping)
		mutex_unlock(&mapping->i_mmap_mutex);

	if (root) {
		uprobe_mmap(vma);

		if (adjust_next)
			uprobe_mmap(next);
	}

	if (remove_next) {
		if (file) {
			uprobe_munmap(next, next->vm_start, next->vm_end);
			fput(file);
		}
		if (next->anon_vma)
			anon_vma_merge(vma, next);
		mm->map_count--;
		mpol_put(vma_policy(next));
		kmem_cache_free(vm_area_cachep, next);
		/*
		 * In mprotect's case 6 (see comments on vma_merge),
		 * we must remove another next too. It would clutter
		 * up the code too much to do both in one go.
		 */
		next = vma->vm_next;
		if (remove_next == 2)
			goto again;
		else if (next)
			vma_gap_update(next);
		else
			mm->highest_vm_end = end;
	}
	if (insert && file)
		uprobe_mmap(insert);

	validate_mm(mm);

	return 0;
}

/*
 * If the vma has a ->close operation then the driver probably needs to release
 * per-vma resources, so we don't attempt to merge those.
 */
static inline int is_mergeable_vma(struct vm_area_struct *vma,
			struct file *file, unsigned long vm_flags)
{
	if (vma->vm_flags ^ vm_flags)
		return 0;
	if (vma->vm_file != file)
		return 0;
	if (vma->vm_ops && vma->vm_ops->close)
		return 0;
	return 1;
}

static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
					struct anon_vma *anon_vma2,
					struct vm_area_struct *vma)
{
	/*
	 * The list_is_singular() test is to avoid merging VMAs cloned from
	 * parents; this improves scalability by reducing anon_vma lock contention.
	 */
	if ((!anon_vma1 || !anon_vma2) && (!vma ||
		list_is_singular(&vma->anon_vma_chain)))
		return 1;
	return anon_vma1 == anon_vma2;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We don't check here for the merged mmap wrapping around the end of pagecache
 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
 * wrap, nor mmaps which cover the final page at index -1UL.
 */
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
	if (is_mergeable_vma(vma, file, vm_flags) &&
	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
		if (vma->vm_pgoff == vm_pgoff)
			return 1;
	}
	return 0;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 */
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
	if (is_mergeable_vma(vma, file, vm_flags) &&
	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
		pgoff_t vm_pglen;
		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
			return 1;
	}
	return 0;
}
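
/*
 * Worked example for the pgoff check above (a sketch): a file-backed vma
 * covering [0x10000, 0x13000) with vm_pgoff 4 spans three pages, so
 * vm_pglen is 3 and the vma ends at file page 7.  A mapping starting at
 * virtual address 0x13000 can merge after it only if its vm_pgoff is
 * exactly 7, i.e. the file offsets are contiguous as well as the virtual
 * addresses.
 */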

/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
 * whether that can be merged with its predecessor or its successor.
 * Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where AAAA is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 *
 *     AAAA             AAAA                AAAA          AAAA
 *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
 *    cannot merge    might become    might become    might become
 *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
 *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
 *    mremap move:                                    PPPPNNNNNNNN 8
 *        AAAA
 *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
 *    might become    case 1 below    case 2 below    case 3 below
 *
 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
 */
struct vm_area_struct *vma_merge(struct mm_struct *mm,
			struct vm_area_struct *prev, unsigned long addr,
			unsigned long end, unsigned long vm_flags,
		     	struct anon_vma *anon_vma, struct file *file,
			pgoff_t pgoff, struct mempolicy *policy)
{
	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
	struct vm_area_struct *area, *next;
	int err;

	/*
	 * We later require that vma->vm_flags == vm_flags,
	 * so this tests vma->vm_flags & VM_SPECIAL, too.
	 */
	if (vm_flags & VM_SPECIAL)
		return NULL;

	if (prev)
		next = prev->vm_next;
	else
		next = mm->mmap;
	area = next;
	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
		next = next->vm_next;

	/*
	 * Can it merge with the predecessor?
	 */
	if (prev && prev->vm_end == addr &&
  			mpol_equal(vma_policy(prev), policy) &&
			can_vma_merge_after(prev, vm_flags,
						anon_vma, file, pgoff)) {
		/*
		 * OK, it can.  Can we now merge in the successor as well?
		 */
		if (next && end == next->vm_start &&
				mpol_equal(policy, vma_policy(next)) &&
				can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen) &&
				is_mergeable_anon_vma(prev->anon_vma,
						      next->anon_vma, NULL)) {
							/* cases 1, 6 */
			err = vma_adjust(prev, prev->vm_start,
				next->vm_end, prev->vm_pgoff, NULL);
		} else					/* cases 2, 5, 7 */
			err = vma_adjust(prev, prev->vm_start,
				end, prev->vm_pgoff, NULL);
		if (err)
			return NULL;
		khugepaged_enter_vma_merge(prev);
		return prev;
	}