/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/errno.h>

#ifdef __KERNEL__

#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page_ref.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>

struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct file_ra_state;
struct user_struct;
struct writeback_control;
struct bdi_writeback;

void init_mm_internals(void);

#ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;

static inline void set_max_mapnr(unsigned long limit)
{
	max_mapnr = limit;
}
#else
static inline void set_max_mapnr(unsigned long limit) { }
#endif

extern atomic_long_t _totalram_pages;
static inline unsigned long totalram_pages(void)
{
	return (unsigned long)atomic_long_read(&_totalram_pages);
}

static inline void totalram_pages_inc(void)
{
	atomic_long_inc(&_totalram_pages);
}

static inline void totalram_pages_dec(void)
{
	atomic_long_dec(&_totalram_pages);
}

static inline void totalram_pages_add(long count)
{
	atomic_long_add(count, &_totalram_pages);
}

static inline void totalram_pages_set(long val)
{
	atomic_long_set(&_totalram_pages, val);
}

extern void * high_memory;
extern int page_cluster;

#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern const int mmap_rnd_bits_max;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
extern const int mmap_rnd_compat_bits_min;
extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>

/*
 * Architectures that support memory tagging (assigning tags to memory regions,
 * embedding these tags into addresses that point to these memory regions, and
 * checking that the memory and the pointer tags match on memory accesses)
 * redefine this macro to strip tags from pointers.
 * It is defined as a no-op for architectures that don't support memory tagging.
 */
#ifndef untagged_addr
#define untagged_addr(addr) (addr)
#endif
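
/*
 * Illustrative sketch only (not part of this header): on an architecture
 * with memory tagging, a user pointer may carry a tag in its top bits, and
 * callers strip it before comparing the address against VMA ranges, e.g.:
 *
 *	unsigned long addr = untagged_addr(uaddr);
 *	vma = find_vma(mm, addr);
 *
 * Here "uaddr", "mm" and "vma" are assumed locals of the caller.
 */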

#ifndef __pa_symbol
#define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif

#ifndef page_to_virt
#define page_to_virt(x)	__va(PFN_PHYS(page_to_pfn(x)))
#endif

#ifndef lm_alias
#define lm_alias(x)	__va(__pa_symbol(x))
#endif

/*
 * Used to prevent common memory management code from establishing
 * a zero page mapping on a read fault.
 * This macro should be defined within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)	(0)
#endif

/*
 * On some architectures it is expensive to call memset() for small sizes.
 * If an architecture decides to implement its own version of
 * mm_zero_struct_page it should wrap the defines below in a #ifndef and
 * define its own version of this macro in <asm/pgtable.h>
 */
#if BITS_PER_LONG == 64
/* This function must be updated when the size of struct page grows above 80
 * or shrinks below 56. The idea is that the compiler optimizes out the
 * switch() statement and only leaves move/store instructions. The compiler
 * can also combine write statements if they are both assignments and can be
 * reordered, which can result in several of the writes here being dropped.
 */
#define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
static inline void __mm_zero_struct_page(struct page *page)
{
	unsigned long *_pp = (void *)page;

	 /* Check that struct page is either 56, 64, 72, or 80 bytes */
	BUILD_BUG_ON(sizeof(struct page) & 7);
	BUILD_BUG_ON(sizeof(struct page) < 56);
	BUILD_BUG_ON(sizeof(struct page) > 80);

	switch (sizeof(struct page)) {
	case 80:
		_pp[9] = 0;	/* fallthrough */
	case 72:
		_pp[8] = 0;	/* fallthrough */
	case 64:
		_pp[7] = 0;	/* fallthrough */
	case 56:
		_pp[6] = 0;
		_pp[5] = 0;
		_pp[4] = 0;
		_pp[3] = 0;
		_pp[2] = 0;
		_pp[1] = 0;
		_pp[0] = 0;
	}
}
#else
#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
#endif

/*
 * Default maximum number of active map areas; this limits the number of vmas
 * per mm struct. Users can override this number via sysctl, but there is a
 * caveat.
 *
 * When a program's coredump is generated in ELF format, one section is
 * created per vma. In ELF, the number of sections is stored as an unsigned
 * short, so it must stay below 65535 when dumping core. Because the kernel
 * adds a few informative sections to the program image while generating the
 * coredump, we need some margin. The number of extra sections is currently
 * 1-3 and depends on the arch; we use 5 as a safe margin here.
 *
 * ELF extended numbering allows more than 65535 sections, so the 16-bit bound
 * is no longer a hard limit, although some userspace tools may be surprised
 * by it.
 */
#define MAPCOUNT_ELF_CORE_MARGIN	(5)
#define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
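
/*
 * With USHRT_MAX == 65535 and a margin of 5, the default above works out to
 * 65530 map areas per mm unless overridden via sysctl_max_map_count.
 */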

extern int sysctl_max_map_count;

extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;

extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *,
				    size_t *, loff_t *);
extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
				    size_t *, loff_t *);

#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))

/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
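
/*
 * Illustrative only, assuming 4KiB pages: PAGE_ALIGN(0x1234) == 0x2000,
 * PAGE_ALIGNED(0x2000) is true and PAGE_ALIGNED(0x1234) is false.
 */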

#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))

/*
 * Linux kernel virtual memory manager primitives.
 * The idea is to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);

#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 */
#define VM_NONE		0x00000000

#define VM_READ		0x00000001	/* currently active flags */
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_SHARED	0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
#define VM_MAYWRITE	0x00000020
#define VM_MAYEXEC	0x00000040
#define VM_MAYSHARE	0x00000080

#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
#define VM_UFFD_MISSING	0x00000200	/* missing pages tracking */
#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
#define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */

#define VM_LOCKED	0x00002000
#define VM_IO           0x00004000	/* Memory mapped I/O or similar */

					/* Used by sys_madvise() */
#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */

#define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
#define VM_LOCKONFAULT	0x00080000	/* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
#define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
#define VM_SYNC		0x00800000	/* Synchronous page faults */
#define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
#define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
#define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */

#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY	0
#endif

#define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */

#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

#ifdef CONFIG_ARCH_HAS_PKEYS
# define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
# define VM_PKEY_BIT1	VM_HIGH_ARCH_1	/* on x86 and 5-bit value on ppc64   */
# define VM_PKEY_BIT2	VM_HIGH_ARCH_2
# define VM_PKEY_BIT3	VM_HIGH_ARCH_3
#ifdef CONFIG_PPC
# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4  0
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */

#if defined(CONFIG_X86)
# define VM_PAT		VM_ARCH_1	/* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
# define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP	VM_ARCH_1
#elif defined(CONFIG_IA64)
# define VM_GROWSUP	VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI	VM_ARCH_1	/* Uses ADI tag for access control */
# define VM_ARCH_CLEAR	VM_SPARC_ADI
#elif !defined(CONFIG_MMU)
# define VM_MAPPED_COPY	VM_ARCH_1	/* T if mapped copy of data (nommu mmap) */
#endif

#if defined(CONFIG_X86_INTEL_MPX)
/* MPX specific bounds table or bounds directory */
# define VM_MPX		VM_HIGH_ARCH_4
#else
# define VM_MPX		VM_NONE
#endif

#ifndef VM_GROWSUP
# define VM_GROWSUP	VM_NONE
#endif

/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)

#ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK	VM_GROWSUP
#else
#define VM_STACK	VM_GROWSDOWN
#endif

#define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)

/*
 * Special vmas that are non-mergeable, non-mlock()able.
 * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
 */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK	VM_NOHUGEPAGE

/* This mask is used to clear all the VMA flags used by mlock */
#define VM_LOCKED_CLEAR_MASK	(~(VM_LOCKED | VM_LOCKONFAULT))

/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR	VM_NONE
#endif
#define VM_FLAGS_CLEAR	(ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

/*
 * mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask.
 */
extern pgprot_t protection_map[16];

#define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
#define FAULT_FLAG_MKWRITE	0x02	/* Fault was mkwrite of existing pte */
#define FAULT_FLAG_ALLOW_RETRY	0x04	/* Retry fault if blocking */
#define FAULT_FLAG_RETRY_NOWAIT	0x08	/* Don't drop mmap_sem and wait when retrying */
#define FAULT_FLAG_KILLABLE	0x10	/* The fault task is in SIGKILL killable region */
#define FAULT_FLAG_TRIED	0x20	/* Second try */
#define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
#define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
#define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */

#define FAULT_FLAG_TRACE \
	{ FAULT_FLAG_WRITE,		"WRITE" }, \
	{ FAULT_FLAG_MKWRITE,		"MKWRITE" }, \
	{ FAULT_FLAG_ALLOW_RETRY,	"ALLOW_RETRY" }, \
	{ FAULT_FLAG_RETRY_NOWAIT,	"RETRY_NOWAIT" }, \
	{ FAULT_FLAG_KILLABLE,		"KILLABLE" }, \
	{ FAULT_FLAG_TRIED,		"TRIED" }, \
	{ FAULT_FLAG_USER,		"USER" }, \
	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }

/*
 * vm_fault is filled in by the page fault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * MM layer fills up gfp_mask for page allocations but fault handler might
 * alter it if its implementation requires a different allocation context.
 *
 * pgoff should be used in favour of virtual_address, if possible.
 */
struct vm_fault {
	struct vm_area_struct *vma;	/* Target VMA */
	unsigned int flags;		/* FAULT_FLAG_xxx flags */
	gfp_t gfp_mask;			/* gfp mask to be used for allocations */
	pgoff_t pgoff;			/* Logical page offset based on vma */
	unsigned long address;		/* Faulting virtual address */
	pmd_t *pmd;			/* Pointer to pmd entry matching
					 * the 'address' */
	pud_t *pud;			/* Pointer to pud entry matching
					 * the 'address'
					 */
	pte_t orig_pte;			/* Value of PTE at the time of fault */

	struct page *cow_page;		/* Page handler may use for COW fault */
	struct mem_cgroup *memcg;	/* Cgroup cow_page belongs to */
	struct page *page;		/* ->fault handlers should return a
					 * page here, unless VM_FAULT_NOPAGE
					 * is set (which is also implied by
					 * VM_FAULT_ERROR).
					 */
	/* These three entries are valid only while holding ptl lock */
	pte_t *pte;			/* Pointer to pte entry matching
					 * the 'address'. NULL if the page
					 * table hasn't been allocated.
					 */
	spinlock_t *ptl;		/* Page table lock.
					 * Protects pte page table if 'pte'
					 * is not NULL, otherwise pmd.
					 */
	pgtable_t prealloc_pte;		/* Pre-allocated pte page table.
					 * vm_ops->map_pages() calls
					 * alloc_set_pte() from atomic context.
					 * do_fault_around() pre-allocates
					 * page table to avoid allocation from
					 * atomic context.
					 */
};

/* page entry size for vm->huge_fault() */
enum page_entry_size {
	PE_SIZE_PTE = 0,
	PE_SIZE_PMD,
	PE_SIZE_PUD,
};

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
struct vm_operations_struct {
	void (*open)(struct vm_area_struct * area);
	void (*close)(struct vm_area_struct * area);
	int (*split)(struct vm_area_struct * area, unsigned long addr);
	int (*mremap)(struct vm_area_struct * area);
	vm_fault_t (*fault)(struct vm_fault *vmf);
	vm_fault_t (*huge_fault)(struct vm_fault *vmf,
			enum page_entry_size pe_size);
	void (*map_pages)(struct vm_fault *vmf,
			pgoff_t start_pgoff, pgoff_t end_pgoff);
	unsigned long (*pagesize)(struct vm_area_struct * area);

	/* notification that a previously read-only page is about to become
	 * writable; if an error is returned it will cause a SIGBUS */
	vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
	vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

	/* called by access_process_vm when get_user_pages() fails, typically
	 * for use by special VMAs that can switch between memory and hardware
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);

	/* Called by the /proc/PID/maps code to ask the vma whether it
	 * has a special name.  Returning non-NULL will also cause this
	 * vma to be dumped unconditionally. */
	const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
	/*
	 * set_policy() op must add a reference to any non-NULL @new mempolicy
	 * to hold the policy upon return.  Caller should pass NULL @new to
	 * remove a policy and fall back to surrounding context--i.e. do not
	 * install a MPOL_DEFAULT policy, nor the task or system default
	 * mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() op must add reference [mpol_get()] to any policy at
	 * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
	 * in mm/mempolicy.c will do this automatically.
	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
	 * marked as MPOL_SHARED. vma policies are protected by the mmap_sem.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);
#endif
	/*
	 * Called by vm_normal_page() for special PTEs to find the
	 * page for @addr.  This is useful if the default behavior
	 * (using pte_page()) would not find the correct page.
	 */
	struct page *(*find_special_page)(struct vm_area_struct *vma,
					  unsigned long addr);
};
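
/*
 * Illustrative sketch only (not part of this header): a simple driver
 * usually supplies just ->fault and installs the ops from its ->mmap()
 * file operation. The names below are hypothetical:
 *
 *	static const struct vm_operations_struct my_vm_ops = {
 *		.fault = my_fault,
 *	};
 *
 *	vma->vm_ops = &my_vm_ops;
 */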

static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
	static const struct vm_operations_struct dummy_vm_ops = {};

	memset(vma, 0, sizeof(*vma));
	vma->vm_mm = mm;
	vma->vm_ops = &dummy_vm_ops;
	INIT_LIST_HEAD(&vma->anon_vma_chain);
}

static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
	vma->vm_ops = NULL;
}

static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
	return !vma->vm_ops;
}

#ifdef CONFIG_SHMEM
/*
 * vma_is_shmem() is not inline because it is only used by the slow
 * paths in userfault.
 */
bool vma_is_shmem(struct vm_area_struct *vma);
#else
static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
#endif

int vma_is_stack_for_current(struct vm_area_struct *vma);

/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }

struct mmu_gather;
struct inode;

#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline int pmd_devmap(pmd_t pmd)
{
	return 0;
}
static inline int pud_devmap(pud_t pud)
{
	return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
	return 0;
}
#endif

/*
 * FIXME: take this include out, include page-flags.h in
 * files which need it (119 of them)
 */
#include <linux/page-flags.h>
#include <linux/huge_mm.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users)
 */
static inline int put_page_testzero(struct page *page)
{
	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
	return page_ref_dec_and_test(page);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 */
static inline int get_page_unless_zero(struct page *page)
{
	return page_ref_add_unless(page, 1, 0);
}

extern int page_is_ram(unsigned long pfn);

enum {
	REGION_INTERSECTS,
	REGION_DISJOINT,
	REGION_MIXED,
};

int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
		      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
 */
static inline bool is_vmalloc_addr(const void *x)
{
#ifdef CONFIG_MMU
	unsigned long addr = (unsigned long)x;

	return addr >= VMALLOC_START && addr < VMALLOC_END;
#else
	return false;
#endif
}

#ifndef is_ioremap_addr
#define is_ioremap_addr(x) is_vmalloc_addr(x)
#endif

#ifdef CONFIG_MMU
extern int is_vmalloc_or_module_addr(const void *x);
#else
static inline int is_vmalloc_or_module_addr(const void *x)
{
	return 0;
}
#endif

extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
static inline void *kvmalloc(size_t size, gfp_t flags)
{
	return kvmalloc_node(size, flags, NUMA_NO_NODE);
}
static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
{
	return kvmalloc_node(size, flags | __GFP_ZERO, node);
}
static inline void *kvzalloc(size_t size, gfp_t flags)
{
	return kvmalloc(size, flags | __GFP_ZERO);
}

static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
{
	size_t bytes;

	if (unlikely(check_mul_overflow(n, size, &bytes)))
		return NULL;

	return kvmalloc(bytes, flags);
}

static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
{
	return kvmalloc_array(n, size, flags | __GFP_ZERO);
}

extern void kvfree(const void *addr);
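
/*
 * Illustrative only: callers that may need a large, possibly vmalloc-backed
 * allocation pair the kv* helpers above with kvfree(); "entries" and "nr"
 * are assumed locals of the caller:
 *
 *	entries = kvcalloc(nr, sizeof(*entries), GFP_KERNEL);
 *	if (!entries)
 *		return -ENOMEM;
 *	...
 *	kvfree(entries);
 *
 * kvfree() handles both kmalloc()ed and vmalloc()ed memory.
 */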

static inline atomic_t *compound_mapcount_ptr(struct page *page)
{
	return &page[1].compound_mapcount;
}

static inline int compound_mapcount(struct page *page)
{
	VM_BUG_ON_PAGE(!PageCompound(page), page);
	page = compound_head(page);
	return atomic_read(compound_mapcount_ptr(page)) + 1;
}

/*
 * The atomic page->_mapcount starts from -1: so that transitions
 * both from it and to it can be tracked, using atomic_inc_and_test
 * and atomic_add_negative(-1).
 */
static inline void page_mapcount_reset(struct page *page)
{
	atomic_set(&(page)->_mapcount, -1);
}

int __page_mapcount(struct page *page);

static inline int page_mapcount(struct page *page)
{
	VM_BUG_ON_PAGE(PageSlab(page), page);

	if (unlikely(PageCompound(page)))
		return __page_mapcount(page);
	return atomic_read(&page->_mapcount) + 1;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int total_mapcount(struct page *page);
int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
#else
static inline int total_mapcount(struct page *page)
{
	return page_mapcount(page);
}
static inline int page_trans_huge_mapcount(struct page *page,
					   int *total_mapcount)
{
	int mapcount = page_mapcount(page);
	if (total_mapcount)
		*total_mapcount = mapcount;
	return mapcount;
}
#endif

static inline struct page *virt_to_head_page(const void *x)
{
	struct page *page = virt_to_page(x);

	return compound_head(page);
}

void __put_page(struct page *page);

void put_pages_list(struct list_head *pages);

void split_page(struct page *page, unsigned int order);

/*
 * Compound pages have a destructor function.  Provide a
 * prototype for that function and accessor functions.
 * These are _only_ valid on the head of a compound page.
 */
typedef void compound_page_dtor(struct page *);

/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
enum compound_dtor_id {
	NULL_COMPOUND_DTOR,
	COMPOUND_PAGE_DTOR,
#ifdef CONFIG_HUGETLB_PAGE
	HUGETLB_PAGE_DTOR,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	TRANSHUGE_PAGE_DTOR,
#endif
	NR_COMPOUND_DTORS,
};
extern compound_page_dtor * const compound_page_dtors[];

static inline void set_compound_page_dtor(struct page *page,
		enum compound_dtor_id compound_dtor)
{
	VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
	page[1].compound_dtor = compound_dtor;
}

static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
{
	VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
	return compound_page_dtors[page[1].compound_dtor];
}

static inline unsigned int compound_order(struct page *page)
{
	if (!PageHead(page))
		return 0;
	return page[1].compound_order;
}

static inline void set_compound_order(struct page *page, unsigned int order)
{
	page[1].compound_order = order;
}

/* Returns the number of pages in this potentially compound page. */
static inline unsigned long compound_nr(struct page *page)
{
	return 1UL << compound_order(page);
}

/* Returns the number of bytes in this potentially compound page. */
static inline unsigned long page_size(struct page *page)
{
	return PAGE_SIZE << compound_order(page);
}

/* Returns the number of bits needed for the number of bytes in a page */
static inline unsigned int page_shift(struct page *page)
{
	return PAGE_SHIFT + compound_order(page);
}
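
/*
 * Illustrative only: for a 2MiB compound page on x86-64 (PAGE_SHIFT == 12),
 * compound_order() is 9, compound_nr() is 512, page_size() is 2MiB and
 * page_shift() is 21; for a non-compound page they are 0, 1, PAGE_SIZE and
 * PAGE_SHIFT respectively.
 */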

void free_compound_page(struct page *page);

#ifdef CONFIG_MMU
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte);
	return pte;
}
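
/*
 * Illustrative only: fault handlers typically build a new PTE as
 *
 *	entry = mk_pte(page, vma->vm_page_prot);
 *	if (write)
 *		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 *
 * ("write" is an assumed local flag), so that a write fault taken on behalf
 * of get_user_pages() against a non-writable mapping does not create a
 * writable PTE.
 */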

vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
		struct page *page);
vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif

/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() > 0  means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A page may belong to an inode's memory mapping. In this case, page->mapping
 * is the pointer to the inode, and page->index is the file offset of the page,
 * in units of PAGE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also has a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->i_pages, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
 */

/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */

/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)

/*
 * Define the bit shifts to access each section.  For non-existent
 * sections we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 */
#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT	(LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT	(KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))

/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF)? \
						SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF)? \
						NODES_PGOFF : ZONES_PGOFF)
#endif

#define ZONEID_PGSHIFT		(ZONEID_PGOFF * (ZONEID_SHIFT != 0))

#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#endif

#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
#define LAST_CPUPID_MASK	((1UL << LAST_CPUPID_SHIFT) - 1)
#define KASAN_TAG_MASK		((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)

static inline enum zone_type page_zonenum(const struct page *page)
{
	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}

#ifdef CONFIG_ZONE_DEVICE
static inline bool is_zone_device_page(const struct page *page)
{
	return page_zonenum(page) == ZONE_DEVICE;
}
extern void memmap_init_zone_device(struct zone *, unsigned long,
				    unsigned long, struct dev_pagemap *);
#else
static inline bool is_zone_device_page(const struct page *page)
{
	return false;
}
#endif

#ifdef CONFIG_DEV_PAGEMAP_OPS
void __put_devmap_managed_page(struct page *page);
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
static inline bool put_devmap_managed_page(struct page *page)
{
	if (!static_branch_unlikely(&devmap_managed_key))
		return false;
	if (!is_zone_device_page(page))
		return false;
	switch (page->pgmap->type) {
	case MEMORY_DEVICE_PRIVATE:
	case MEMORY_DEVICE_FS_DAX:
		__put_devmap_managed_page(page);
		return true;
	default:
		break;
	}
	return false;
}

#else /* CONFIG_DEV_PAGEMAP_OPS */
static inline bool put_devmap_managed_page(struct page *page)
{
	return false;
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */

static inline bool is_device_private_page(const struct page *page)
{
	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
		IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
		is_zone_device_page(page) &&
		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

static inline bool is_pci_p2pdma_page(const struct page *page)
{
	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
		IS_ENABLED(CONFIG_PCI_P2PDMA) &&
		is_zone_device_page(page) &&
		page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}

/* 127: arbitrary random number, small enough to assemble well */
#define page_ref_zero_or_close_to_overflow(page) \
	((unsigned int) page_ref_count(page) + 127u <= 127u)

static inline void get_page(struct page *page)
{
	page = compound_head(page);
	/*
	 * Getting a normal page or the head of a compound page
	 * requires to already have an elevated page->_refcount.
	 */
	VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
	page_ref_inc(page);
}

static inline __must_check bool try_get_page(struct page *page)
{
	page = compound_head(page);
	if (WARN_ON_ONCE(page_ref_count(page) <= 0))
		return false;
	page_ref_inc(page);
	return true;
}

static inline void put_page(struct page *page)
{
	page = compound_head(page);

	/*
	 * For devmap managed pages we need to catch the refcount transition
	 * from 2 to 1; when the refcount reaches one it means the page is free
	 * and we need to inform the device driver through a callback. See
	 * include/linux/memremap.h and HMM for details.
	 */
	if (put_devmap_managed_page(page))
		return;

	if (put_page_testzero(page))
		__put_page(page);
}
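
/*
 * Illustrative only: a temporary reference taken with get_page() must be
 * balanced by put_page() once the caller is done with the page:
 *
 *	get_page(page);
 *	...use the page...
 *	put_page(page);
 */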

/**
 * put_user_page() - release a gup-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via get_user_pages*() must be released via
 * either put_user_page(), or one of the put_user_pages*() routines
 * below. This is so that eventually, pages that are pinned via
 * get_user_pages*() can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special
 * handling.
 *
 * put_user_page() and put_page() are not interchangeable, despite this early
 * implementation that makes them look the same. put_user_page() calls must
 * be perfectly matched up with get_user_page() calls.
 */
static inline void put_user_page(struct page *page)
{
	put_page(page);
}
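
/*
 * Illustrative only: pages pinned via get_user_pages*() are released with
 * put_user_page()/put_user_pages() rather than plain put_page():
 *
 *	ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
 *	if (ret > 0) {
 *		...access the pinned pages...
 *		put_user_pages(pages, ret);
 *	}
 *
 * "start", "nr_pages" and "pages" are assumed locals of the caller.
 */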

void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
			       bool make_dirty);

void put_user_pages(struct page **pages, unsigned long npages);

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

/*
 * The identification function is mainly used by the buddy allocator for
 * determining if two pages could be buddies. We are not really identifying
 * the zone since we could be using the section number id if we do not have
 * node id available in page flags.
 * We only guarantee that it will return the same value for two combinable
 * pages in a zone.
 */
static inline int page_zone_id(struct page *page)
{
	return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
	struct page *p = (struct page *)page;

	return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif

#ifdef CONFIG_NUMA_BALANCING
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
	return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
}

static inline int cpupid_to_pid(int cpupid)
{
	return cpupid & LAST__PID_MASK;
}

static inline int cpupid_to_cpu(int cpupid)
{
	return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
}

static inline int cpupid_to_nid(int cpupid)
{
	return cpu_to_node(cpupid_to_cpu(cpupid));
}

static inline bool cpupid_pid_unset(int cpupid)
{
	return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
}

static inline bool cpupid_cpu_unset(int cpupid)
{
	return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
}
