// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

static void hpage_pincount_add(struct page *page, int refs)
{
	VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
	VM_BUG_ON_PAGE(page != compound_head(page), page);

	atomic_add(refs, compound_pincount_ptr(page));
}

static void hpage_pincount_sub(struct page *page, int refs)
{
	VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
	VM_BUG_ON_PAGE(page != compound_head(page), page);

	atomic_sub(refs, compound_pincount_ptr(page));
}

/*
 * Return the compound head page with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
	struct page *head = compound_head(page);

	if (WARN_ON_ONCE(page_ref_count(head) < 0))
		return NULL;
	if (unlikely(!page_cache_add_speculative(head, refs)))
		return NULL;
	return head;
}

/*
 * try_grab_compound_head() - attempt to elevate a page's refcount, by a
 * flags-dependent amount.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount."
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: page's refcount will be incremented by 1.
 *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
 *
 * Return: head page (with refcount appropriately incremented) for success, or
 * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
 * considered failure, and furthermore, a likely bug in the caller, so a warning
 * is also emitted.
 */
static __maybe_unused struct page *try_grab_compound_head(struct page *page,
							  int refs,
							  unsigned int flags)
{
	if (flags & FOLL_GET)
		return try_get_compound_head(page, refs);
	else if (flags & FOLL_PIN) {
		/*
		 * When pinning a compound page of order > 1 (which is what
		 * hpage_pincount_available() checks for), use an exact count to
		 * track it, via hpage_pincount_add/_sub().
		 *
		 * However, be sure to *also* increment the normal page refcount
		 * field at least once, so that the page really is pinned.
		 */
		if (!hpage_pincount_available(page))
			refs *= GUP_PIN_COUNTING_BIAS;

		page = try_get_compound_head(page, refs);
		if (!page)
			return NULL;

		if (hpage_pincount_available(page))
			hpage_pincount_add(page, refs);

		return page;
	}

	WARN_ON_ONCE(1);
	return NULL;
}

/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount."
 *
 * @page:    pointer to page to be grabbed
 * @flags:   gup flags: these are the FOLL_* flag values.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time. Cases:
 *
 *    FOLL_GET: page's refcount will be incremented by 1.
 *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
 *
 * Return: true for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
 * FOLL_PIN was set, but the page could not be grabbed.
 */
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
	WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));

	if (flags & FOLL_GET)
		return try_get_page(page);
	else if (flags & FOLL_PIN) {
		int refs = 1;

		page = compound_head(page);

		if (WARN_ON_ONCE(page_ref_count(page) <= 0))
			return false;

		if (hpage_pincount_available(page))
			hpage_pincount_add(page, 1);
		else
			refs = GUP_PIN_COUNTING_BIAS;

		/*
		 * Similar to try_grab_compound_head(): even if using the
		 * hpage_pincount_add/_sub() routines, be sure to
		 * *also* increment the normal page refcount field at least
		 * once, so that the page really is pinned.
		 */
		page_ref_add(page, refs);
	}

	return true;
}
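
/*
 * Illustrative sketch, not used by gup.c itself: one way other code could
 * observe the FOLL_PIN accounting performed by try_grab_page() and
 * try_grab_compound_head() above. The function name is hypothetical; the
 * helpers it calls are the same ones used elsewhere in this file.
 */
static __maybe_unused bool example_page_looks_dma_pinned(struct page *page)
{
	page = compound_head(page);

	/* Huge pages (order > 1) carry an exact pin count of their own. */
	if (hpage_pincount_available(page))
		return atomic_read(compound_pincount_ptr(page)) > 0;

	/*
	 * Small pages fold pins into the refcount: each pin adds
	 * GUP_PIN_COUNTING_BIAS, so a refcount at or above the bias
	 * strongly suggests an outstanding pin (false positives are
	 * possible if very many plain references exist).
	 */
	return page_ref_count(page) >= GUP_PIN_COUNTING_BIAS;
}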

#ifdef CONFIG_DEV_PAGEMAP_OPS
static bool __unpin_devmap_managed_user_page(struct page *page)
{
	int count, refs = 1;

	if (!page_is_devmap_managed(page))
		return false;

	if (hpage_pincount_available(page))
		hpage_pincount_sub(page, 1);
	else
		refs = GUP_PIN_COUNTING_BIAS;

	count = page_ref_sub_return(page, refs);

	/*
	 * devmap page refcounts are 1-based, rather than 0-based: if
	 * refcount is 1, then the page is free and the refcount is
	 * stable because nobody holds a reference on the page.
	 */
	if (count == 1)
		free_devmap_managed_page(page);
	else if (!count)
		__put_page(page);

	return true;
}
#else
static bool __unpin_devmap_managed_user_page(struct page *page)
{
	return false;
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	int refs = 1;

	page = compound_head(page);

	/*
	 * For devmap managed pages we need to catch refcount transition from
	 * GUP_PIN_COUNTING_BIAS to 1: when the refcount reaches one, it means the
	 * page is free and we need to inform the device driver through
	 * callback. See include/linux/memremap.h and HMM for details.
	 */
	if (__unpin_devmap_managed_user_page(page))
		return;

	if (hpage_pincount_available(page))
		hpage_pincount_sub(page, 1);
	else
		refs = GUP_PIN_COUNTING_BIAS;

	if (page_ref_sub_and_test(page, refs))
		__put_page(page);
}
EXPORT_SYMBOL(unpin_user_page);
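
/*
 * Illustrative sketch of typical usage, not part of the API itself: pin a
 * single user page for a short read-only access, then balance the pin with
 * unpin_user_page(). The function name is hypothetical; pin_user_pages_fast()
 * is assumed to be available (it is implemented later in this file).
 */
static __maybe_unused int example_peek_user_page(unsigned long uaddr)
{
	struct page *page;
	int ret;

	/* Pin exactly one page; gup_flags == 0 means read-only access. */
	ret = pin_user_pages_fast(uaddr & PAGE_MASK, 1, 0, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... read the page contents, e.g. via kmap()/kunmap() ... */

	/* Every page obtained via pin_user_pages*() needs exactly this. */
	unpin_user_page(page);
	return 0;
}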

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	for (index = 0; index < npages; index++) {
		struct page *page = compound_head(pages[index]);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * page_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!PageDirty(page))
			set_page_dirty_lock(page);
		unpin_user_page(page);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
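
/*
 * Illustrative sketch of the pattern described above: pin a user buffer that
 * a device will write into, then dirty and unpin the pages once the DMA has
 * completed. The function and variable names are hypothetical; pin_user_pages()
 * is assumed to be called with mmap_sem held for reading.
 */
static __maybe_unused long example_pin_for_dma_write(unsigned long uaddr,
						     unsigned long nr_pages,
						     struct page **pages)
{
	long pinned;

	down_read(&current->mm->mmap_sem);
	pinned = pin_user_pages(uaddr, nr_pages, FOLL_WRITE, pages, NULL);
	up_read(&current->mm->mmap_sem);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... program the device and wait for the DMA to complete ... */

	/* The device wrote to the pages, so dirty them while unpinning. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return pinned;
}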

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */
	for (index = 0; index < npages; index++)
		unpin_user_page(pages[index]);
}
EXPORT_SYMBOL(unpin_user_pages);

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	/* No page to get reference */
	if (flags & FOLL_GET)
		return -EFAULT;

	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/*
 * FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
	return pte_write(pte) ||
		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return ERR_PTR(-EINVAL);
retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
		/*
		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
		 * case since they are only valid while holding the pgmap
		 * reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			int ret;

			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		int ret;
		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
	if (unlikely(!try_grab_page(page, flags))) {
		page = ERR_PTR(-ENOMEM);
		goto out;
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();  /* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	/*
	 * The READ_ONCE() will stabilize the pmdval in a register or
	 * on the stack so that it will stop changing under the code.
	 */
	pmdval = READ_ONCE(*pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pmd(mm, address, pmd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pmd_val(pmdval)), flags,
				      PMD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
retry:
	if (!pmd_present(pmdval)) {
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(pmdval));
		if (is_pmd_migration_entry(pmdval))
			pmd_migration_entry_wait(mm, pmd);
		pmdval = READ_ONCE(*pmd);
		/*
		 * MADV_DONTNEED may convert the pmd to null because
		 * mmap_sem is held in read mode
		 */
		if (pmd_none(pmdval))
			return no_page_table(vma, flags);
		goto retry;
	}
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
		return no_page_table(vma, flags);

retry_locked:
	ptl = pmd_lock(mm, pmd);
	if (unlikely(pmd_none(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		pmd_migration_entry_wait(mm, pmd);
		goto retry_locked;
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
		int ret;
		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else if (flags & FOLL_SPLIT) {
			if (unlikely(!try_get_page(page))) {
				spin_unlock(ptl);
				return ERR_PTR(-ENOMEM);
			}
			spin_unlock(ptl);
			lock_page(page);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (pmd_none(*pmd))
				return no_page_table(vma, flags);
		} else {  /* flags & FOLL_SPLIT_PMD */
			spin_unlock(ptl);
			split_huge_pmd(vma, pmd, address);
			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pud(mm, address, pud, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pud_val(*pud)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pud_val(*pud)), flags,
				      PUD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;
	struct page *page;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(p4d_val(*p4d)), flags,
				      P4D_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/* make this handle hugepd */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
		return page;
	}

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	if (pgd_huge(*pgd)) {
		page = follow_huge_pgd(mm, address, pgd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pgd_val(*pgd)), flags,
				      PGDIR_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}
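
/*
 * Illustrative sketch of how follow_page() is commonly used: look up the page
 * backing a user address while holding mmap_sem, taking a reference via
 * FOLL_GET. The function name is hypothetical.
 */
static __maybe_unused struct page *example_lookup_page(struct mm_struct *mm,
						       unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page = NULL;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);
	if (vma && addr >= vma->vm_start)
		page = follow_page(vma, addr, FOLL_GET);
	up_read(&mm->mmap_sem);

	/* follow_page() may return NULL or an ERR_PTR(); treat both as "no page". */
	if (IS_ERR_OR_NULL(page))
		return NULL;

	/* The caller now owns one reference and must put_page() it later. */
	return page;
}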

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	pte = pte_offset_map(pmd, address);
	if (pte_none(*pte))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, *pte);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
			goto unmap;
		*page = pte_page(*pte);
	}
	if (unlikely(!try_get_page(*page))) {
		ret = -ENOMEM;
		goto unmap;
	}
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, int *nonblocking)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	/* mlock all present pages, but do not fault in new pages */
	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (nonblocking)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
		fault_flags |= FAULT_FLAG_TRIED;
	}

	ret = handle_mm_fault(vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (tsk) {
		if (ret & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}

	if (ret & VM_FAULT_RETRY) {
		if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*nonblocking = 0;
		return -EBUSY;
	}

	/*
	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
	 * can thus safely do subsequent page lookups as if they were reads.
	 * But only do so when looping for pte_write is futile: in some cases
	 * userspace may also be wanting to write to the gotten user page,
	 * which a read fault here might prevent (a readonly page might get
	 * reCOWed by userspace write).
	 */
	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
		*flags |= FOLL_COW;
	return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
		return -EFAULT;

	if (write) {
		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However, it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *nonblocking)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr(start);

	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	/*
	 * If FOLL_FORCE is set then do not force a full fault as the hinting
	 * fault information is unrelated to the reference behaviour of a task
	 * using the address space
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma || check_vma_flags(vma, gup_flags)) {
				ret = -EFAULT;
				goto out;
			}
			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, nonblocking);
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, foll_flags, &ctx);
		if (!page) {
			ret = faultin_page(tsk, vma, start, &foll_flags,
					nonblocking);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
				ret = 0;
				/* FALLTHRU */
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			case -ENOENT:
				goto next_page;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page.
			 */
			goto next_page;
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			ctx.page_mask = 0;
		}
next_page:
		if (vmas) {
			vmas[i] = vma;
			ctx.page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}
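
/*
 * Illustrative sketch of the mmap_sem convention documented above, seen from
 * a caller: take mmap_sem for reading, let gup drop it while faulting if it
 * needs to, and release it only if it is still held on return. The function
 * name is hypothetical; get_user_pages_remote() is defined later in this file
 * and is assumed to take the (tsk, mm, ..., locked) arguments shown here.
 */
static __maybe_unused long example_gup_remote(struct task_struct *tsk,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long nr_pages,
					      struct page **pages)
{
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_remote(tsk, mm, start, nr_pages, FOLL_WRITE,
				    pages, NULL, &locked);
	/* On return, @locked says whether mmap_sem is still held. */
	if (locked)
		up_read(&mm->mmap_sem);

	/* Each returned page holds a reference; drop it with put_page(). */
	return ret;
}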

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_sem while retrying, maybe NULL if caller
 *		does not allow retry
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), the access returns -EFAULT, and we want to resolve the user fault
 * before trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.