/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include <linux/swapops.h>
#include <linux/elf.h>

#include "internal.h"

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

unsigned long num_physpages;
/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, then end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;

EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

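/*
 * Passing "norandmaps" on the kernel command line runs disable_randmaps()
 * at boot and forces randomize_va_space to 0, disabling the randomization
 * above entirely.
 */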
static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);


/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token);
	tlb->mm->nr_ptes--;
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */
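	/*
	 * Quick illustration of the "- 1" comparisons: ceiling == 0 means
	 * "top of the address space", so ceiling - 1 == ~0UL and a test such
	 * as "end - 1 > ceiling - 1" below can never fire, i.e. end is never
	 * clipped.  A naive "end > ceiling" would wrongly trigger for every
	 * nonzero end whenever ceiling is 0.
	 */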

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and vmtruncate before freeing pgtables
		 */
		anon_vma_unlink(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				anon_vma_unlink(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	spin_lock(&mm->page_table_lock);
	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
	if (file_rss)
		add_mm_counter(mm, file_rss, file_rss);
	if (anon_rss)
		add_mm_counter(mm, anon_rss, anon_rss);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
{
	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
			"vm_flags = %lx, vaddr = %lx\n",
		(long long)pte_val(pte),
		(vma->vm_mm == current->mm ? current->comm : "???"),
		vma->vm_flags, vaddr);
	dump_stack();
}

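/*
 * A mapping is "COW" when it may be written but is not shared: the test
 * below is true exactly when VM_MAYWRITE is set and VM_SHARED is clear.
 */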
static inline int is_cow_mapping(unsigned int flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
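/*
 * Quick illustration of the linearity rule above: if remap_pfn_range()
 * covered the whole vma with pfns starting at P, then vm_pgoff is P and the
 * pte for vm_start + N*PAGE_SIZE carries pfn P + N, so the identity holds
 * for every page and none of them is treated as a normal page.
 */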
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn;

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte))) {
			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
			return pte_page(pte);
		}
		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	pfn = pte_pfn(pte);

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	VM_BUG_ON(!pfn_valid(pfn));

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 *
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			swap_duplicate(entry);
			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both parent
				 * and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, vma, addr);
		rss[!!PageAnon(page)]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[2];

again:
	rss[1] = rss[0] = 0;
	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map_nested(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap_nested(src_pte - 1);
	add_mm_rss(dst_mm, rss[0], rss[1]);
	pte_unmap_unlock(dst_pte - 1, dst_ptl);
	cond_resched();
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
	return 0;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	pte_t *pte;
	spinlock_t *ptl;
	int file_rss = 0;
	int anon_rss = 0;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			(*zap_work)--;
			continue;
		}

		(*zap_work) -= PAGE_SIZE;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * Each page->index must be checked when
				 * invalidating or truncating nonlinear.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				anon_rss--;
			else {
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent))
					SetPageReferenced(page);
				file_rss--;
			}
			page_remove_rmap(page, vma);
			tlb_remove_page(tlb, page);
			continue;
		}
		/*
		 * If details->check_mapping, we leave swap entries;
		 * if details->nonlinear_vma, we leave file entries.
		 */
		if (unlikely(details))
			continue;
		if (!pte_file(ptent))
			free_swap_and_cache(pte_to_swp_entry(ptent));
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

	add_mm_rss(mm, file_rss, anon_rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pte_range(tlb, vma, pmd, addr, next,
						zap_work, details);
	} while (pmd++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pmd_range(tlb, vma, pud, addr, next,
						zap_work, details);
	} while (pud++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}

static unsigned long unmap_page_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping && !details->nonlinear_vma)
		details = NULL;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pud_range(tlb, vma, pgd, addr, next,
						zap_work, details);
	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
	tlb_end_vma(tlb, vma);

	return addr;
}

#ifdef CONFIG_PREEMPT
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#else
/* No preempt: go for improved straight-line efficiency */
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#endif
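/*
 * For scale (illustration only): with 4K pages ZAP_BLOCK_SIZE is 32KB per
 * batch under CONFIG_PREEMPT and 4MB per batch otherwise; unmap_vmas()
 * below flushes the mmu_gather and may reschedule between batches.
 */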

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlbp: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the end address of the unmapping (restart addr if interrupted).
 *
 * Unmap all pages in the vma list.
 *
 * We aim to not hold locks for too long (for scheduling latency reasons).
 * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
 * return the ending mmu_gather to the caller.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
unsigned long unmap_vmas(struct mmu_gather **tlbp,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long *nr_accounted,
		struct zap_details *details)
{
	long zap_work = ZAP_BLOCK_SIZE;
	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
	int tlb_start_valid = 0;
	unsigned long start = start_addr;
	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
	int fullmm = (*tlbp)->fullmm;

	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
		unsigned long end;

		start = max(vma->vm_start, start_addr);
		if (start >= vma->vm_end)
			continue;
		end = min(vma->vm_end, end_addr);
		if (end <= vma->vm_start)
			continue;

		if (vma->vm_flags & VM_ACCOUNT)
			*nr_accounted += (end - start) >> PAGE_SHIFT;

		while (start != end) {
			if (!tlb_start_valid) {
				tlb_start = start;
				tlb_start_valid = 1;
			}

			if (unlikely(is_vm_hugetlb_page(vma))) {
				/*
				 * It is undesirable to test vma->vm_file as it
				 * should be non-null for valid hugetlb area.
				 * However, vm_file will be NULL in the error
				 * cleanup path of do_mmap_pgoff. When
				 * hugetlbfs ->mmap method fails,
				 * do_mmap_pgoff() nullifies vma->vm_file
				 * before calling this function to clean up.
				 * Since no pte has actually been setup, it is
				 * safe to do nothing in this case.
				 */
				if (vma->vm_file) {
					unmap_hugepage_range(vma, start, end, NULL);
					zap_work -= (end - start) /
					pages_per_huge_page(hstate_vma(vma));
				}

				start = end;
			} else
				start = unmap_page_range(*tlbp, vma,
						start, end, &zap_work, details);

			if (zap_work > 0) {
				BUG_ON(start != end);
				break;
			}

			tlb_finish_mmu(*tlbp, tlb_start, start);

			if (need_resched() ||
				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
				if (i_mmap_lock) {
					*tlbp = NULL;
					goto out;
				}
				cond_resched();
			}

			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
			tlb_start_valid = 0;
			zap_work = ZAP_BLOCK_SIZE;
		}
	}
out:
	return start;	/* which is now the end (or restart) address */
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather *tlb;
	unsigned long end = address + size;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	update_hiwater_rss(mm);
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	if (tlb)
		tlb_finish_mmu(tlb, address, end);
	return end;
}

/*
 * Do a quick page-table lookup for a single page.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto no_page_table;
	
	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;

	if (pmd_huge(*pmd)) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}

	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;
	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page))
		goto bad_page;

	if (flags & FOLL_GET)
		get_page(page);
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		mark_page_accessed(page);
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return page;
	/* Fall through to ZERO_PAGE handling */
no_page_table:
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate page tables.
	 */
	if (flags & FOLL_ANON) {
		page = ZERO_PAGE(0);
		if (flags & FOLL_GET)
			get_page(page);
		BUG_ON(flags & FOLL_WRITE);
	}
	return page;
}

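/*
 * Background for the helper below: when get_user_pages() sets FOLL_ANON,
 * follow_page() may satisfy an untouched anonymous address with ZERO_PAGE(0)
 * instead of faulting real pages in (see the no_page_table handling above).
 */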
/* Can we do the FOLL_ANON optimization? */
static inline int use_zero_page(struct vm_area_struct *vma)
{
	/*
	 * We don't want to optimize FOLL_ANON for make_pages_present()
	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
	 * we want to get the page from the page tables to make sure
	 * that we serialize and update with any other user of that
	 * mapping.
	 */
	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
		return 0;
	/*
	 * And if we have a fault routine, it's not an anonymous region.
	 */
	return !vma->vm_ops || !vma->vm_ops->fault;
}

int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, int len, int write, int force,
		struct page **pages, struct vm_area_struct **vmas)
{
	int i;
	unsigned int vm_flags;

	if (len <= 0)
		return 0;
	/* 
	 * Require read or write permissions.
	 * If 'force' is set, we only require the "MAY" flags.
	 */
	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *vma;
		unsigned int foll_flags;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(tsk, start)) {
			unsigned long pg = start & PAGE_MASK;
			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;
			if (write) /* user gate pages are read-only */
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			if (pages) {
				struct page *page = vm_normal_page(gate_vma, start, *pte);
				pages[i] = page;
				if (page)
					get_page(page);
			}
			pte_unmap(pte);
			if (vmas)
				vmas[i] = gate_vma;
			i++;
			start += PAGE_SIZE;
			len--;
			continue;
		}

		if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
				|| !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &len, i, write);
			continue;
		}

		foll_flags = FOLL_TOUCH;
		if (pages)
			foll_flags |= FOLL_GET;
		if (!write && use_zero_page(vma))
			foll_flags |= FOLL_ANON;

		do {
			struct page *page;

			/*
			 * If tsk is ooming, cut off its access to large memory
			 * allocations. It has a pending SIGKILL, but it can't
			 * be processed until returning to user space.
			 */
			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
				return i ? i : -ENOMEM;

			if (write)
				foll_flags |= FOLL_WRITE;

			cond_resched();
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				ret = handle_mm_fault(mm, vma, start,
						foll_flags & FOLL_WRITE);
				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					else if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					BUG();
				}
				if (ret & VM_FAULT_MAJOR)
					tsk->maj_flt++;
				else
					tsk->min_flt++;

				/*
				 * The VM_FAULT_WRITE bit tells us that
				 * do_wp_page has broken COW when necessary,
				 * even if maybe_mkwrite decided not to set
				 * pte_write. We can thus safely do subsequent
				 * page lookups as if they were reads.
				 */
				if (ret & VM_FAULT_WRITE)
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			len--;
		} while (len && start < vma->vm_end);
	} while (len);
	return i;
}
EXPORT_SYMBOL(get_user_pages);
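/*
 * Minimal usage sketch (hypothetical caller, not part of this file): pinning
 * one page of the current task's memory for reading, then releasing it.
 * Callers must hold mmap_sem; the names here are made up for illustration.
 *
 *	struct page *page;
 *	int ret;
 *
 *	down_read(&current->mm->mmap_sem);
 *	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1,
 *			     0, 0, &page, NULL);
 *	up_read(&current->mm->mmap_sem);
 *	if (ret == 1) {
 *		... use the page, then drop the reference taken by FOLL_GET ...
 *		put_page(page);
 *	}
 */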

pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t * pgd = pgd_offset(mm, addr);
	pud_t * pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		pmd_t * pmd = pmd_alloc(mm, pud, addr);
		if (pmd)
			return pte_alloc_map_lock(mm, pmd, addr, ptl);
	}
	return NULL;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
	if (retval)
		goto out;

	retval = -EINVAL;
	if (PageAnon(page))
		goto out_uncharge;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out_uncharge;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter(mm, file_rss);
	page_add_file_rmap(page);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
	pte_unmap_unlock(pte, ptl);
	return retval;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out_uncharge:
	mem_cgroup_uncharge_page(page);
out:
	return retval;
}

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	vma->vm_flags |= VM_INSERTPAGE;
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
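/*
 * Minimal usage sketch (hypothetical driver code, not part of this file):
 * a driver's ->mmap() handler could hand a single page it allocated with
 * alloc_page() to user space roughly like this; mydrv_mmap and mydrv_page
 * are made-up names.
 *
 *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_insert_page(vma, vma->vm_start, mydrv_page);
 *	}
 */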
