// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION	(8)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
int _node_numa_mem_[MAX_NUMNODES];
#endif

/* work_structs for global per-cpu drains */
struct pcpu_drain {
	struct zone *zone;
	struct work_struct work;
};
DEFINE_MUTEX(pcpu_drain_mutex);
DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

atomic_long_t _totalram_pages __read_mostly;
EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
DEFINE_STATIC_KEY_TRUE(init_on_alloc);
#else
DEFINE_STATIC_KEY_FALSE(init_on_alloc);
#endif
EXPORT_SYMBOL(init_on_alloc);

#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
DEFINE_STATIC_KEY_TRUE(init_on_free);
#else
DEFINE_STATIC_KEY_FALSE(init_on_free);
#endif
EXPORT_SYMBOL(init_on_free);

static int __init early_init_on_alloc(char *buf)
{
	int ret;
	bool bool_result;

	if (!buf)
		return -EINVAL;
	ret = kstrtobool(buf, &bool_result);
	if (ret)
		return ret;
	if (bool_result && page_poisoning_enabled())
		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
	if (bool_result)
		static_branch_enable(&init_on_alloc);
	else
		static_branch_disable(&init_on_alloc);
	return ret;
}
early_param("init_on_alloc", early_init_on_alloc);

static int __init early_init_on_free(char *buf)
{
	int ret;
	bool bool_result;

	if (!buf)
		return -EINVAL;
	ret = kstrtobool(buf, &bool_result);
	if (ret)
		return ret;
	if (bool_result && page_poisoning_enabled())
		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
	if (bool_result)
		static_branch_enable(&init_on_free);
	else
		static_branch_disable(&init_on_free);
	return ret;
}
early_param("init_on_free", early_init_on_free);

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}
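/*
 * Rough usage sketch (paraphrasing the free path elsewhere in this file,
 * not a new code path): the migratetype is cached when a page enters a
 * pcplist and read back when the pcplist is drained to the buddy lists,
 * roughly:
 *
 *	set_pcppage_migratetype(page, get_pfnblock_migratetype(page, pfn));
 *	...
 *	mt = get_pcppage_migratetype(page);
 *	__free_one_page(page, page_to_pfn(page), zone, 0, mt);
 */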

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended.  To avoid races with the suspend/hibernate code,
 * they should always be called with system_transition_mutex held
 * (gfp_allowed_mask also should only be modified with system_transition_mutex
 * held, unless the suspend/hibernate code is guaranteed not to run in parallel
 * with that modification).
 */

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}
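/*
 * Illustrative pairing only (the callers live in the suspend/hibernate
 * core, not here; the exact call sites are an assumption): with
 * system_transition_mutex held, the sleep code brackets the low-power
 * transition roughly as
 *
 *	pm_restrict_gfp_mask();
 *	... suspend devices and enter the sleep state ...
 *	pm_restore_gfp_mask();
 */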

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		return false;
	return true;
}
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};
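/*
 * Worked example of the ratios above, using the numbers from the comment
 * block (no new policy): on the 1G machine with a 16M DMA zone and 784M of
 * ZONE_NORMAL, a NORMAL allocation keeps 784M/256 ~= 3M of ZONE_DMA in
 * reserve, while a HIGHMEM allocation keeps 224M/32 = 7M of ZONE_NORMAL
 * and (224M+784M)/256 ~= 4M of ZONE_DMA in reserve.
 */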

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

compound_page_dtor * const compound_page_dtors[] = {
	NULL,
	free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	free_transhuge_page,
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
#ifdef CONFIG_DISCONTIGMEM
/*
 * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
 * are not on separate NUMA nodes. Functionally this works but with
 * watermark_boost_factor, it can reclaim prematurely as the ranges can be
 * quite small. By default, do not boost watermarks on discontigmem as in
 * many cases very high-order allocations like THP are likely to be
 * unsupported and the premature reclaim offsets the advantage of long-term
 * fragmentation avoidance.
 */
int watermark_boost_factor __read_mostly;
#else
int watermark_boost_factor __read_mostly = 15000;
#endif
int watermark_scale_factor = 10;

static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;

#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
static DEFINE_STATIC_KEY_TRUE(deferred_pages);

/*
 * Call kasan_free_pages() only after deferred memory initialization
 * has completed. Poisoning pages during deferred memory init will greatly
 * lengthen the process and cause problems on large memory systems as the
 * deferred pages initialization is done with interrupts disabled.
 *
 * Assuming that there will be no reference to those newly initialized
 * pages before they are ever allocated, this should have no effect on
 * KASAN memory tracking as the poison will be properly inserted at page
 * allocation time. The only corner case is when pages are allocated by
 * on-demand allocation and then freed again before the deferred pages
 * initialization is done, but this is not likely to happen.
 */
static inline void kasan_free_nondeferred_pages(struct page *page, int order)
{
	if (!static_branch_unlikely(&deferred_pages))
		kasan_free_pages(page, order);
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns true when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	/*
	 * prev_end_pfn is a static that caches the end of the previous zone.
	 * No need to protect it: this is called very early in boot, before smp_init.
	 */
	if (prev_end_pfn != end_pfn) {
		prev_end_pfn = end_pfn;
		nr_initialised = 0;
	}

	/* Always populate low zones for address-constrained allocations */
	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
		return false;

	/*
	 * We start only with one section of pages, more pages are added as
	 * needed until the rest of deferred pages are initialized.
	 */
	nr_initialised++;
	if ((nr_initialised > PAGES_PER_SECTION) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		NODE_DATA(nid)->first_deferred_pfn = pfn;
		return true;
	}
	return false;
}
#else
#define kasan_free_nondeferred_pages(p, o)	kasan_free_pages(p, o)

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return section_to_usemap(__pfn_to_section(pfn));
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest to retrieve
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	bitidx += end_bitidx;
	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
}

static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
}

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long old_word, word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	bitidx += end_bitidx;
	mask <<= (BITS_PER_LONG - bitidx - 1);
	flags <<= (BITS_PER_LONG - bitidx - 1);

	word = READ_ONCE(bitmap[word_bitidx]);
	for (;;) {
		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
		if (word == old_word)
			break;
		word = old_word;
	}
}
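/*
 * Worked example of the index math above (illustrative only): with
 * NR_PAGEBLOCK_BITS == 4, the pageblock whose index within its section
 * (SPARSEMEM) or zone is 3 gets bitidx = 3 * 4 = 12, which selects
 * word_bitidx = 12 / BITS_PER_LONG = 0 and an in-word bitidx of 12; the
 * migratetype bits and the compaction skip bit share that 4-bit group.
 */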

void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason,
		unsigned long bad_flags)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	__dump_page(page, reason);
	bad_flags &= page->flags;
	if (bad_flags)
		pr_alert("bad because of flags: %#lx(%pGp)\n",
						bad_flags, &bad_flags);
	dump_page_owner(page);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits point to the head page.
 *
 * The first tail page's ->compound_dtor holds the offset in the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;
		set_page_count(p, 0);
		p->mapping = TAIL_MAPPING;
		set_compound_head(p, page);
	}
	atomic_set(compound_mapcount_ptr(page), -1);
}
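/*
 * Worked example (illustrative only): prep_compound_page(page, 2) marks the
 * first of the four pages as the head (PG_head set, order 2 recorded via
 * set_compound_order()) and points the three tail pages back at it with
 * set_compound_head(), which stores the head pointer with bit 0 set in each
 * tail's page->compound_head, exactly as described in the comment above.
 */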

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;

#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
#else
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
#endif
EXPORT_SYMBOL(_debug_pagealloc_enabled);

DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

static int __init early_debug_pagealloc(char *buf)
{
	bool enable = false;

	if (kstrtobool(buf, &enable))
		return -EINVAL;

	if (enable)
		static_branch_enable(&_debug_pagealloc_enabled);

	return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);

static void init_debug_guardpage(void)
{
	if (!debug_pagealloc_enabled())
		return;

	if (!debug_guardpage_minorder())
		return;

	static_branch_enable(&_debug_guardpage_enabled);
}

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
		pr_err("Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);

static inline bool set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return false;

	if (order >= debug_guardpage_minorder())
		return false;

	__SetPageGuard(page);
	INIT_LIST_HEAD(&page->lru);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	__mod_zone_freepage_state(zone, -(1 << order), migratetype);

	return true;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return;

	__ClearPageGuard(page);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif

static inline void set_page_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set PageBuddy.
 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 *
 * For recording a page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (page_is_guard(buddy) && page_order(buddy) == order) {
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		/*
		 * zone check is done late to avoid uselessly
		 * calculating zone/node ids for pages that could
		 * never merge.
		 */
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}
	return 0;
}

#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
	struct capture_control *capc = current->capture_control;

	return capc &&
		!(current->flags & PF_KTHREAD) &&
		!capc->page &&
		capc->cc->zone == zone &&
		capc->cc->direct_compaction ? capc : NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	if (!capc || order != capc->cc->order)
		return false;

	/* Do not accidentally pollute CMA or isolated regions */
	if (is_migrate_cma(migratetype) ||
	    is_migrate_isolate(migratetype))
		return false;

	/*
	 * Do not let lower order allocations pollute a movable pageblock.
	 * This might let an unmovable request use a reclaimable pageblock
	 * and vice-versa but no more than normal fallback logic which can
	 * have trouble finding a high-order free page.
	 */
	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
		return false;

	capc->page = page;
	return true;
}

#else
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PageBuddy.
 * Page's order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
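/*
 * Worked example (illustrative only, following __find_buddy_pfn() and the
 * pfn arithmetic in __free_one_page() below): freeing an order-0 page at
 * pfn 0x10 looks for its buddy at pfn 0x10 ^ 1 = 0x11; if that buddy is
 * free, the pair merges into the order-1 block at
 * combined_pfn = 0x10 & 0x11 = 0x10, and the search repeats at order 1
 * with buddy pfn 0x10 ^ 2 = 0x12, until a buddy is not free or the
 * maximum order is reached.
 */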

Nick Piggin's avatar
Nick Piggin committed
903
static inline void __free_one_page(struct page *page,
904
		unsigned long pfn,
905 906
		struct zone *zone, unsigned int order,
		int migratetype)
Linus Torvalds's avatar
Linus Torvalds committed
907
{
908 909
	unsigned long combined_pfn;
	unsigned long uninitialized_var(buddy_pfn);
910
	struct page *buddy;
911
	unsigned int max_order;
912
	struct capture_control *capc = task_capc(zone);
913 914

	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (likely(!is_migrate_isolate(migratetype)))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);

	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

continue_merging:
	while (order < max_order - 1) {
		if (compaction_capture(capc, page, order, migratetype)) {
			__mod_zone_freepage_state(zone, -(1 << order),
								migratetype);
			return;
		}
		buddy_pfn = __find_buddy_pfn(pfn, order);
		buddy = page + (buddy_pfn - pfn);

		if (!pfn_valid_within(buddy_pfn))
			goto done_merging;
		if (!page_is_buddy(page, buddy, order))
			goto done_merging;
		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order, migratetype);
		else
			del_page_from_free_area(buddy, &zone->free_area[order]);
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}
	if (max_order < MAX_ORDER) {
		/* If we are here, it means order is >= pageblock_order.
		 * We want to prevent merge between freepages on isolate
		 * pageblock and normal pageblock. Without this, pageblock
		 * isolation could cause incorrect freepage or CMA accounting.
		 *
		 * We don't want to hit this code for the more frequent
		 * low-order merging.
		 */
		if (unlikely(has_isolate_pageblock(zone))) {
			int buddy_mt;

			buddy_pfn = __find_buddy_pfn(pfn, order);
			buddy = page + (buddy_pfn - pfn);