// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool */
	if (free) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;

	return spool;
}

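/*
 * Drop a reference on the subpool.  When the last reference is dropped
 * and no pages are in use, unlock_or_release_subpool() returns any
 * minimum-size reservation and frees the subpool.
 */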
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy
 * the request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				      long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock(&spool->lock);
	return ret;
}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return delta;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	 /* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
	 * quota reference, free it now.
	 */
	unlock_or_release_subpool(spool);

	return ret;
}

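/* Look up the hugetlbfs subpool backing an inode or a VMA's file. */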
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region. This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
struct file_region {
	struct list_head link;
	long from;
	long to;
};

/* Must be called with resv->lock held. Calling this with count_only == true
 * will count the number of pages to be added but will not modify the linked
 * list.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
				     bool count_only)
{
	long chg = 0;
	struct list_head *head = &resv->regions;
	struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation.
		 */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;

		if (!count_only && rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}

	if (!count_only) {
		nrg->from = f;
		nrg->to = t;
	}

	return chg;
}

/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Existing regions will be expanded to accommodate the specified
 * range, or a region will be taken from the cache.  Sufficient regions
 * must exist in the cache due to the previous call to region_chg with
 * the same range.
 *
 * Return the number of new huge pages added to the map.  This
 * number is greater than or equal to zero.
 */
static long region_add(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg;
	long add = 0;

	spin_lock(&resv->lock);
	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/*
	 * If no region exists which can be expanded to include the
	 * specified range, pull a region descriptor from the cache
	 * and use it for this range.
	 */
	if (&rg->link == head || t < rg->from) {
		VM_BUG_ON(resv->region_cache_count <= 0);

		resv->region_cache_count--;
		nrg = list_first_entry(&resv->region_cache, struct file_region,
					link);
		list_del(&nrg->link);

		nrg->from = f;
		nrg->to = t;
		list_add(&nrg->link, rg->link.prev);

		add += t - f;
		goto out_locked;
	}

	add = add_reservation_in_range(resv, f, t, false);

out_locked:
	resv->adds_in_progress--;
	spin_unlock(&resv->lock);
	VM_BUG_ON(add < 0);
	return add;
}

/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  A new file_region structure is added to the cache
 * as a placeholder, so that the subsequent region_add
 * call will have all the regions it needs and will not fail.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater than or
 * equal to zero.  -ENOMEM is returned if a new file_region structure or
 * cache entry is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t)
{
	long chg = 0;

	spin_lock(&resv->lock);
retry_locked:
	resv->adds_in_progress++;

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations.
	 */
	if (resv->adds_in_progress > resv->region_cache_count) {
		struct file_region *trg;

		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
		/* Must drop lock to allocate a new descriptor. */
		resv->adds_in_progress--;
		spin_unlock(&resv->lock);

		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
		if (!trg)
			return -ENOMEM;

		spin_lock(&resv->lock);
		list_add(&trg->link, &resv->region_cache);
		resv->region_cache_count++;
		goto retry_locked;
	}

	chg = add_reservation_in_range(resv, f, t, true);

	spin_unlock(&resv->lock);
	return chg;
}

/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress--;
	spin_unlock(&resv->lock);
}

/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	struct file_region *nrg = NULL;
	long del = 0;

retry:
	spin_lock(&resv->lock);
	list_for_each_entry_safe(rg, trg, head, link) {
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
			continue;

		if (rg->from >= t)
			break;

		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
							struct file_region,
							link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}

			if (!nrg) {
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;
			INIT_LIST_HEAD(&nrg->link);

			/* Original entry is trimmed */
			rg->to = f;

			list_add(&nrg->link, &rg->link);
			nrg = NULL;
			break;
		}

		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
			del += rg->to - rg->from;
			list_del(&rg->link);
			kfree(rg);
			continue;
		}

		if (f <= rg->from) {	/* Trim beginning of region */
			del += t - rg->from;
			rg->from = t;
		} else {		/* Trim end of region */
			del += rg->to - f;
			rg->to = f;
		}
	}

	spin_unlock(&resv->lock);
	kfree(nrg);
	return del;
}

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was freed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;

	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
	if (rsv_adjust) {
		struct hstate *h = hstate_inode(inode);

		hugetlb_acct_memory(h, 1);
	}
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}
	spin_unlock(&resv->lock);

	return chg;
}

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}

pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);

/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as used by the page table entries.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->pagesize)
		return vma->vm_ops->pagesize(vma);
	return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have its future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}

struct resv_map *resv_map_alloc(void)
{
	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);

	if (!resv_map || !rg) {
		kfree(resv_map);
		kfree(rg);
		return NULL;
	}

	kref_init(&resv_map->refs);
	spin_lock_init(&resv_map->lock);
	INIT_LIST_HEAD(&resv_map->regions);

	resv_map->adds_in_progress = 0;

	INIT_LIST_HEAD(&resv_map->region_cache);
	list_add(&rg->link, &resv_map->region_cache);
	resv_map->region_cache_count = 1;

	return resv_map;
}

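/*
 * kref release callback for a resv_map: delete all regions, free any
 * cached region descriptors, then free the map itself.
 */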
void resv_map_release(struct kref *ref)
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
	struct list_head *head = &resv_map->region_cache;
	struct file_region *rg, *trg;

	/* Clear out any active regions before we release the map. */
	region_del(resv_map, 0, LONG_MAX);

	/* ... and any entries left in the cache */
	list_for_each_entry_safe(rg, trg, head, link) {
		list_del(&rg->link);
		kfree(rg);
	}

	VM_BUG_ON(resv_map->adds_in_progress);

	kfree(resv_map);
}

static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	/*
	 * At inode evict time, i_mapping may not point to the original
	 * address space within the inode.  This original address space
	 * contains the pointer to the resv_map.  So, always use the
	 * address space embedded within the inode.
	 * The VERY common case is inode->mapping == &inode->i_data, but
	 * this may not be true for device special inodes.
	 */
	return (struct resv_map *)(&inode->i_data)->private_data;
}

static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (vma->vm_flags & VM_MAYSHARE) {
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct inode *inode = mapping->host;

		return inode_resv_map(inode);

	} else {
		return (struct resv_map *)(get_vma_private_data(vma) &
							~HPAGE_RESV_MASK);
	}
}

static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, (get_vma_private_data(vma) &
				HPAGE_RESV_MASK) | (unsigned long)map);
}

static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}

static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

	return (get_vma_private_data(vma) & flag) != 0;
}

/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (!(vma->vm_flags & VM_MAYSHARE))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
	if (vma->vm_flags & VM_NORESERVE) {
		/*
		 * This address is already reserved by another process (chg == 0),
		 * so we should decrement the reserved count. Without decrementing,
		 * the reserve count remains after releasing the inode, because this
		 * allocated page will go into the page cache and is regarded as
		 * coming from the reserved pool in the releasing step.  Currently, we
		 * don't have any other solution to deal with this situation
		 * properly, so add a work-around here.
		 */
		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
			return true;
		else
			return false;
	}

	/* Shared mappings always use reserves */
	if (vma->vm_flags & VM_MAYSHARE) {
		/*
		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
		 * be a region map for all pages.  The only situation where
		 * there is no region map is if a hole was punched via
		 * fallocate.  In this case, there really are no reserves to
		 * use.  This situation is indicated if chg != 0.
		 */
		if (chg)
			return false;
		else
			return true;
	}

	/*
	 * Only the process that called mmap() has reserves for
	 * private mappings.
	 */
	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/*
		 * Like the shared case above, a hole punch or truncate
		 * could have been performed on the private mapping.
		 * Examine the value of chg to determine if reserves
		 * actually exist or were previously consumed.
		 * Very Subtle - The value of chg comes from a previous
		 * call to vma_needs_reserves().  The reserve map for
		 * private mappings has different (opposite) semantics
		 * than that of shared mappings.  vma_needs_reserves()
		 * has already taken this difference in semantics into
		 * account.  Therefore, the meaning of chg is the same
		 * as in the shared case above.  Code could easily be
		 * combined, but keeping it separate draws attention to
		 * subtle differences.
		 */
		if (chg)
			return false;
		else
			return true;
	}

	return false;
}

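/*
 * Place a free huge page on its node's free list and update the free
 * page counters.  Caller must hold hugetlb_lock.
 */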
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
	int nid = page_to_nid(page);
	list_move(&page->lru, &h->hugepage_freelists[nid]);
	h->free_huge_pages++;
	h->free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
	struct page *page;

	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
		if (!PageHWPoison(page))
			break;
	/*
	 * if a 'non-isolated free hugepage' is not found on the list,
	 * the allocation fails.
	 */
	if (&h->hugepage_freelists[nid] == &page->lru)
		return NULL;
	list_move(&page->lru, &h->hugepage_activelist);
	set_page_refcounted(page);
	h->free_huge_pages--;
	h->free_huge_pages_node[nid]--;
	return page;
}

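/*
 * Walk the zonelist for the requested node/nodemask and return the first
 * free huge page found on an allowed node, retrying if the cpuset's
 * mems_allowed changed during the walk.
 */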
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
		nodemask_t *nmask)
{
	unsigned int cpuset_mems_cookie;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;
	int node = NUMA_NO_NODE;

	zonelist = node_zonelist(nid, gfp_mask);

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
		struct page *page;

		if (!cpuset_zone_allowed(zone, gfp_mask))
			continue;
		/*
		 * no need to ask again on the same node. Pool is node rather than
		 * zone aware
		 */
		if (zone_to_nid(zone) == node)
			continue;
		node = zone_to_nid(zone);

		page = dequeue_huge_page_node_exact(h, node);
		if (page)
			return page;
	}
	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;

	return NULL;
}

/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
	if (hugepage_movable_supported(h))
		return GFP_HIGHUSER_MOVABLE;
	else
		return GFP_HIGHUSER;
}

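/*
 * Dequeue a free huge page for a fault on the given VMA and address,
 * respecting the VMA's memory policy and the reservation rules checked
 * by vma_has_reserves().
 */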
static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve,
				long chg)
{
	struct page *page;
	struct mempolicy *mpol;
	gfp_t gfp_mask;
	nodemask_t *nodemask;
	int nid;

	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed
	 */
	if (!vma_has_reserves(vma, chg) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	gfp_mask = htlb_alloc_mask(h);
	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
		SetPagePrivate(page);
		h->resv_huge_pages--;
	}

	mpol_cond_put(mpol);
	return page;

err:
	return NULL;
}

/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node_in(nid, *nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}

/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
static int hstate_next_node_to_alloc(struct hstate *h,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);

	return nid;
}

/*
 * helper for free_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);

	return nid;
}

#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
		nr_nodes--)

#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
		nr_nodes--)

#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
					unsigned int order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	atomic_set(compound_mapcount_ptr(page), 0);
	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);

	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		clear_compound_head(p);
		set_page_refcounted(p);
	}

	set_compound_order(page, 0);
	__ClearPageHead(page);
}

static void free_gigantic_page(struct page *page, unsigned int order)
{
	free_contig_range(page_to_pfn(page), 1 << order);
}

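/*
 * Runtime allocation of gigantic pages depends on the contiguous page
 * allocator (CONFIG_CONTIG_ALLOC); without it, alloc_gigantic_page()
 * simply fails.
 */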
#ifdef CONFIG_CONTIG_ALLOC
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
		int nid, nodemask_t *nodemask)
{
	unsigned long nr_pages = 1UL << huge_page_order(h);

	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
					int nid, nodemask_t *nodemask)
{
	return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */

#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
					int nid, nodemask_t *nodemask)
{
	return NULL;
}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned int order) { }
#endif

static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
		return;

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0;