// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

/*
 * Return the compound head page with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
	struct page *head = compound_head(page);

	if (WARN_ON_ONCE(page_ref_count(head) < 0))
		return NULL;
	if (unlikely(!page_cache_add_speculative(head, refs)))
		return NULL;
	return head;
}

/*
 * try_grab_compound_head() - attempt to elevate a page's refcount, by a
 * flags-dependent amount.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: page's refcount will be incremented by 1.
 *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
 *
 * Return: head page (with refcount appropriately incremented) for success, or
 * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
 * considered failure, and furthermore, a likely bug in the caller, so a warning
 * is also emitted.
 */
static __maybe_unused struct page *try_grab_compound_head(struct page *page,
							  int refs,
							  unsigned int flags)
{
	if (flags & FOLL_GET)
		return try_get_compound_head(page, refs);
	else if (flags & FOLL_PIN) {
		refs *= GUP_PIN_COUNTING_BIAS;
		return try_get_compound_head(page, refs);
	}

	WARN_ON_ONCE(1);
	return NULL;
}
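
/*
 * Worked example (illustrative only, not part of the API): with
 * GUP_PIN_COUNTING_BIAS defined as 1024 at the time of writing, a call such as
 *
 *	try_grab_compound_head(page, 3, FOLL_PIN);
 *
 * adds 3 * 1024 = 3072 to the head page's refcount, while the same call with
 * FOLL_GET adds just 3. Each later unpin_user_page() call on a pinned page
 * subtracts one GUP_PIN_COUNTING_BIAS again.
 */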

/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
 *
 * @page:    pointer to page to be grabbed
 * @flags:   gup flags: these are the FOLL_* flag values.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time. Cases:
 *
 *    FOLL_GET: page's refcount will be incremented by 1.
 *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
 *
 * Return: true for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
 * FOLL_PIN was set, but the page could not be grabbed.
 */
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
	WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));

	if (flags & FOLL_GET)
		return try_get_page(page);
	else if (flags & FOLL_PIN) {
		page = compound_head(page);

		if (WARN_ON_ONCE(page_ref_count(page) <= 0))
			return false;

		page_ref_add(page, GUP_PIN_COUNTING_BIAS);
	}

	return true;
}
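
/*
 * Illustrative sketch (not kernel code): how a caller might pair
 * try_grab_page() with the matching release. The helper name
 * my_grab_use_release() is hypothetical.
 *
 *	static bool my_grab_use_release(struct page *page, unsigned int flags)
 *	{
 *		if (!try_grab_page(page, flags))
 *			return false;	// refcount could not be elevated
 *
 *		// ... use the page ...
 *
 *		if (flags & FOLL_PIN)
 *			unpin_user_page(page);	// undoes GUP_PIN_COUNTING_BIAS
 *		else if (flags & FOLL_GET)
 *			put_page(page);		// drops the single reference
 *		return true;
 *	}
 */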

#ifdef CONFIG_DEV_PAGEMAP_OPS
static bool __unpin_devmap_managed_user_page(struct page *page)
{
	int count;

	if (!page_is_devmap_managed(page))
		return false;

	count = page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);

	/*
	 * devmap page refcounts are 1-based, rather than 0-based: if
	 * refcount is 1, then the page is free and the refcount is
	 * stable because nobody holds a reference on the page.
	 */
	if (count == 1)
		free_devmap_managed_page(page);
	else if (!count)
		__put_page(page);

	return true;
}
#else
static bool __unpin_devmap_managed_user_page(struct page *page)
{
	return false;
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	page = compound_head(page);

	/*
	 * For devmap managed pages we need to catch the refcount transition
	 * from GUP_PIN_COUNTING_BIAS to 1; when the refcount reaches one it
	 * means the page is free and we need to inform the device driver
	 * through a callback. See include/linux/memremap.h and HMM for details.
	 */
	if (__unpin_devmap_managed_user_page(page))
		return;

	if (page_ref_sub_and_test(page, GUP_PIN_COUNTING_BIAS))
		__put_page(page);
}
EXPORT_SYMBOL(unpin_user_page);
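
/*
 * Illustrative sketch (not kernel code): the intended pairing of
 * pin_user_pages() with unpin_user_page(). The helper my_pin_for_dma() and
 * its parameters are hypothetical.
 *
 *	static long my_pin_for_dma(unsigned long uaddr, unsigned long nr,
 *				   struct page **pages)
 *	{
 *		long i, pinned;
 *
 *		down_read(&current->mm->mmap_sem);
 *		pinned = pin_user_pages(uaddr, nr, FOLL_WRITE, pages, NULL);
 *		up_read(&current->mm->mmap_sem);
 *		if (pinned <= 0)
 *			return pinned;
 *
 *		// ... program the DMA engine with these pages ...
 *
 *		for (i = 0; i < pinned; i++)
 *			unpin_user_page(pages[i]);
 *		return pinned;
 *	}
 */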

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	for (index = 0; index < npages; index++) {
		struct page *page = compound_head(pages[index]);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * page_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!PageDirty(page))
			set_page_dirty_lock(page);
		unpin_user_page(page);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
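
/*
 * Illustrative sketch (not kernel code): a driver whose device has DMA'd data
 * into pinned user pages would typically release them with @make_dirty set:
 *
 *	// after the DMA that filled the pinned pages has completed:
 *	unpin_user_pages_dirty_lock(pages, npages, true);
 */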

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */
	for (index = 0; index < npages; index++)
		unpin_user_page(pages[index]);
}
EXPORT_SYMBOL(unpin_user_pages);

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	/* No page to get reference */
	if (flags & FOLL_GET)
		return -EFAULT;

	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/*
 * FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
	return pte_write(pte) ||
		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}
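
/*
 * Illustrative sketch of the retry sequence that makes FOLL_FORCE writes work
 * (see __get_user_pages() and faultin_page() below); not literal code:
 *
 *	page = follow_page_pte(vma, addr, pmd, FOLL_WRITE | FOLL_FORCE, ...);
 *	// NULL: the pte is not writable and has not been COWed yet
 *	faultin_page(...);	// breaks COW; sets FOLL_COW in the flags
 *	page = follow_page_pte(vma, addr, pmd,
 *			       FOLL_WRITE | FOLL_FORCE | FOLL_COW, ...);
 *	// succeeds now, because the freshly COWed pte is dirty
 */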

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return ERR_PTR(-EINVAL);
retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
		/*
		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
		 * case since they are only valid while holding the pgmap
		 * reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			int ret;

			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		int ret;
		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
	if (unlikely(!try_grab_page(page, flags))) {
		page = ERR_PTR(-ENOMEM);
		goto out;
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();  /* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	/*
	 * The READ_ONCE() will stabilize the pmdval in a register or
	 * on the stack so that it will stop changing under the code.
	 */
	pmdval = READ_ONCE(*pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pmd(mm, address, pmd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pmd_val(pmdval)), flags,
				      PMD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
retry:
	if (!pmd_present(pmdval)) {
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(pmdval));
		if (is_pmd_migration_entry(pmdval))
			pmd_migration_entry_wait(mm, pmd);
		pmdval = READ_ONCE(*pmd);
		/*
		 * MADV_DONTNEED may convert the pmd to null because
		 * mmap_sem is held in read mode
		 */
		if (pmd_none(pmdval))
			return no_page_table(vma, flags);
		goto retry;
	}
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
		return no_page_table(vma, flags);

retry_locked:
	ptl = pmd_lock(mm, pmd);
	if (unlikely(pmd_none(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		pmd_migration_entry_wait(mm, pmd);
		goto retry_locked;
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
		int ret;
		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else if (flags & FOLL_SPLIT) {
			if (unlikely(!try_get_page(page))) {
				spin_unlock(ptl);
				return ERR_PTR(-ENOMEM);
			}
			spin_unlock(ptl);
			lock_page(page);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (pmd_none(*pmd))
				return no_page_table(vma, flags);
		} else {  /* flags & FOLL_SPLIT_PMD */
			spin_unlock(ptl);
			split_huge_pmd(vma, pmd, address);
			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pud(mm, address, pud, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pud_val(*pud)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pud_val(*pud)), flags,
				      PUD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;
	struct page *page;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(p4d_val(*p4d)), flags,
				      P4D_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/* make this handle hugepd */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
		return page;
	}

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	if (pgd_huge(*pgd)) {
		page = follow_huge_pgd(mm, address, pgd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pgd_val(*pgd)), flags,
				      PGDIR_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}
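
/*
 * Illustrative sketch (not kernel code): a caller that already holds mmap_sem
 * can use follow_page() to ask whether an address is currently mapped,
 * without faulting anything in. my_addr_is_mapped() is a hypothetical name.
 *
 *	static bool my_addr_is_mapped(struct vm_area_struct *vma,
 *				      unsigned long addr)
 *	{
 *		struct page *page;
 *
 *		// FOLL_GET keeps the page from being freed while we look.
 *		page = follow_page(vma, addr, FOLL_GET);
 *		if (IS_ERR_OR_NULL(page))
 *			return false;
 *		put_page(page);
 *		return true;
 *	}
 */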

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	pte = pte_offset_map(pmd, address);
	if (pte_none(*pte))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, *pte);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
			goto unmap;
		*page = pte_page(*pte);
	}
	if (unlikely(!try_get_page(*page))) {
		ret = -ENOMEM;
		goto unmap;
	}
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, int *nonblocking)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	/* mlock all present pages, but do not fault in new pages */
	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (nonblocking)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
		fault_flags |= FAULT_FLAG_TRIED;
	}

	ret = handle_mm_fault(vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (tsk) {
		if (ret & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}

	if (ret & VM_FAULT_RETRY) {
		if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*nonblocking = 0;
		return -EBUSY;
	}

	/*
	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
	 * can thus safely do subsequent page lookups as if they were reads.
	 * But only do so when looping for pte_write is futile: in some cases
	 * userspace may also be wanting to write to the gotten user page,
	 * which a read fault here might prevent (a readonly page might get
	 * reCOWed by userspace write).
	 */
	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
		*flags |= FOLL_COW;
	return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
		return -EFAULT;

	if (write) {
		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *nonblocking)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr(start);

	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	/*
	 * If FOLL_FORCE is set then do not force a full fault as the hinting
	 * fault information is unrelated to the reference behaviour of a task
	 * using the address space
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma || check_vma_flags(vma, gup_flags)) {
				ret = -EFAULT;
				goto out;
			}
			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, nonblocking);
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, foll_flags, &ctx);
		if (!page) {
			ret = faultin_page(tsk, vma, start, &foll_flags,
					nonblocking);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
				ret = 0;
				/* FALLTHRU */
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			case -ENOENT:
				goto next_page;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page.
			 */
			goto next_page;
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			ctx.page_mask = 0;
		}
next_page:
		if (vmas) {
			vmas[i] = vma;
			ctx.page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}
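
/*
 * Illustrative sketch (not kernel code): as noted above, most callers should
 * use get_user_pages() or get_user_pages_fast() rather than
 * __get_user_pages(). A hypothetical caller that grabs and then releases a
 * user buffer might look roughly like this (my_hold_user_buf() is made up):
 *
 *	static int my_hold_user_buf(unsigned long uaddr, struct page **pages,
 *				    int nr_pages)
 *	{
 *		int i, got;
 *
 *		got = get_user_pages_fast(uaddr, nr_pages, 0, pages);
 *		if (got <= 0)
 *			return got ? got : -EFAULT;
 *
 *		// ... access the pages, e.g. via kmap()/kunmap() ...
 *
 *		for (i = 0; i < got; i++)
 *			put_page(pages[i]);
 *		return got;
 *	}
 */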

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_sem while retrying, maybe NULL if caller
 *		does not allow retry
 *
 * This is meant to be called in the specific scenario where, for locking
 * reasons, we try to access user memory in atomic context (within a
 * pagefault_disable() section), that access returns -EFAULT, and we want to
 * resolve the user fault before trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_sem. So it does not have
 * the same semantics wrt the @mm->mmap_sem as filemap_fault() does.
 */
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags,
		     bool *unlocked)
{
	struct vm_area_struct *vma;
	vm_fault_t ret, major = 0;

	address = untagged_addr(address);

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;

retry:
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;

	if (!vma_permits_fault(vma, fault_flags))
		return -EFAULT;

	ret = handle_mm_fault(vma, address, fault_flags);
	major |= ret & VM_FAULT_MAJOR;
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, 0);

		if (err)
			return err;
		BUG();
	}