// SPDX-License-Identifier: GPL-2.0-only
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

/**
 * DOC: Overview
 *
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The stable tree node includes information required for reverse
 * mapping from a KSM page to virtual addresses that map this page.
 *
 * In order to avoid large latencies of the rmap walks on KSM pages,
 * KSM maintains two types of nodes in the stable tree:
 *
 * * the regular nodes that keep the reverse mapping structures in a
 *   linked list
 * * the "chains" that link nodes ("dups") that represent the same
 *   write protected memory content, but each "dup" corresponds to a
 *   different KSM page copy of that content
 *
 * Internally, the regular nodes, "dups" and "chains" are represented
 * using the same :c:type:`struct stable_node` structure.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a red-black tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */
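
/*
 * Illustrative userspace usage (an editor's sketch, not part of the kernel
 * build): an application opts an area into KSM with madvise(MADV_MERGEABLE),
 * and ksmd is started by writing 1 to /sys/kernel/mm/ksm/run:
 *
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(buf, len, MADV_MERGEABLE);
 *
 * after which the vmas covering buf become VM_MERGEABLE and ksmd will scan
 * them, merging identical pages into write-protected ksm pages.
 */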

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */
#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
				/* to mask all the flags */
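
/*
 * The seqnr and flag bits above can share rmap_item->address because the
 * tracked address is always page aligned: its low bits (cleared with
 * PAGE_MASK whenever the address itself is needed) are free to hold state.
 */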

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;
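
/*
 * The tunables and counters above are exposed through sysfs under
 * /sys/kernel/mm/ksm/ (pages_to_scan, sleep_millisecs, max_page_sharing,
 * stable_node_chains_prune_millisecs, merge_across_nodes, use_zero_pages,
 * plus the read-only pages_shared, pages_sharing and pages_unshared);
 * the attribute handlers appear later in this file.
 */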

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);
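
/*
 * These run values mirror what userspace writes to /sys/kernel/mm/ksm/run:
 * 0 stops ksmd, 1 runs it, 2 stops it and unmerges all merged pages.
 * KSM_RUN_OFFLINE is or'ed in internally around memory hot-remove (see
 * wait_while_offlining()) and is not accepted from sysfs.
 */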

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

static inline void stable_node_chain_add_dup(struct stable_node *dup,
					     struct stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is under
	 * pressure, which may lead to hung task warnings.  Adding __GFP_HIGH
	 * grants access to memory reserves, helping to avoid this problem.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 *
 * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.  We also do not want to enforce
 * protection keys here anyway.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	vm_fault_t ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
						   struct rb_root *root)
{
	struct stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = NUMA_NO_NODE; /* debug */
#endif
		ksm_stable_node_chains++;

		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree and at the same time remove the old
		 * stable node.
		 */
		rb_replace_node(&dup->node, &chain->node, root);

		/*
		 * Move the old stable node to the second dimension
		 * queued in the hlist_dup. The invariant is that all
		 * dup stable_nodes in the chain->hlist point to pages
		 * that are wrprotected and have the exact same
		 * content.
		 */
		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}

static inline void free_stable_node_chain(struct stable_node *chain,
					  struct rb_root *root)
{
	rb_erase(&chain->node, root);
	free_stable_node(chain);
	ksm_stable_node_chains--;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	/* check it's not STABLE_NODE_CHAIN or negative */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	/*
	 * We need the second aligned pointer of the migrate_nodes
	 * list_head to stay clear from the rb_parent_color union
	 * (aligned and different than any node) and also different
	 * from &migrate_nodes. This will verify that future list.h changes
	 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
	 */
#if defined(GCC_VERSION) && GCC_VERSION >= 40903
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}

enum get_ksm_page_flags {
	GET_KSM_PAGE_NOLOCK,
	GET_KSM_PAGE_LOCK,
	GET_KSM_PAGE_TRYLOCK
};

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive.  So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing back to this stable node.  This relies on freeing a PageAnon
 * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping
 * while it is on its way to being freed; but that is an anomaly to bear
 * in mind.
 */
static struct page *get_ksm_page(struct stable_node *stable_node,
				 enum get_ksm_page_flags flags)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_ref_freeze().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * the same is true in reuse_ksm_page(); but if the page is swapcache
	 * in migrate_page_move_mapping(), it might still be our page,
	 * in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * in the ref_freeze section of __remove_mapping(); but Anon
		 * page->mapping is reset to NULL later, in free_pages_prepare().
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (flags == GET_KSM_PAGE_TRYLOCK) {
		if (!trylock_page(page)) {
			put_page(page);
			return ERR_PTR(-EBUSY);
		}
	} else if (flags == GET_KSM_PAGE_LOCK)
		lock_page(page);

	if (flags != GET_KSM_PAGE_NOLOCK) {
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
	return PageKsm(page) ? page_rmapping(page) : NULL;
}

static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * This should not happen: but if it does, just refuse to let
		 * merge_across_nodes be switched - there is no need to panic.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page to be recognized
		 * right up until it is freed; but the node is safe to remove.
		 * This page might be in a pagevec waiting to be freed,
		 * or it might be PageSwapCache (perhaps under writeback),
		 * or it might have been removed from swapcache a moment ago.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

static int remove_stable_node_chain(struct stable_node *stable_node,
				    struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
		up_read(&mm->mmap_sem);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = xxhash(addr, PAGE_SIZE, 0);
	kunmap_atomic(addr);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
	};
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
						mm_tlb_flush_pending(mm)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Ok this is tricky: when get_user_pages_fast() runs it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racy and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check;
		 * this assures us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only not changing it to point to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);

		if (pte_protnone(entry))
			entry = pte_mkclean(pte_clear_savedwrite(entry));
		else
			entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	*orig_pte = *pvmw.pte;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, false);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
					       vma->vm_page_prot));
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_sem */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

static __always_inline
bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
{
	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
	/*
	 * Check that at least one mapping still exists, otherwise
	 * there's not much point to merge and share with this
	 * stable_node, as the underlying tree_page of the other
	 * sharer is going to be freed soon.
	 */
	return stable_node->rmap_hlist_len &&
		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

static __always_inline
bool is_page_sharing_candidate(struct stable_node *stable_node)
{
	return __is_page_sharing_candidate(stable_node, 0);
}

static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
				    struct stable_node **_stable_node,
				    struct rb_root *root,
				    bool prune_stale_stable_nodes)
{
	struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
	struct hlist_node *hlist_safe;
	struct page *_tree_page, *tree_page = NULL;
	int nr = 0;
	int found_rmap_hlist_len;

	if (!prune_stale_stable_nodes ||
	    time_before(jiffies, stable_node->chain_prune_time +
			msecs_to_jiffies(
				ksm_stable_node_chains_prune_millisecs)))
		prune_stale_stable_nodes = false;
	else
		stable_node->chain_prune_time = jiffies;

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		cond_resched();
		/*
		 * We must walk all stable_node_dup to prune the stale
		 * stable nodes during lookup.
		 *
		 * get_ksm_page can drop the nodes from the
		 * stable_node->hlist if they point to freed pages
		 * (that's why we do a _safe walk). The "dup"
		 * stable_node parameter itself will be freed from
		 * under us if it returns NULL.
		 */
		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
		if (!_tree_page)
			continue;
		nr += 1;
		if (is_page_sharing_candidate(dup)) {
			if (!found ||
			    dup->rmap_hlist_len > found_rmap_hlist_len) {
				if (found)
					put_page(tree_page);
				found = dup;
				found_rmap_hlist_len = found->rmap_hlist_len;
				tree_page = _tree_page;

				/* skip put_page for found dup */
				if (!prune_stale_stable_nodes)
					break;
				continue;
			}
		}
		put_page(_tree_page);
	}

	if (found) {
		/*
		 * nr is counting all dups in the chain only if
		 * prune_stale_stable_nodes is true, otherwise we may
		 * break the loop at nr == 1 even if there are
		 * multiple entries.
		 */
		if (prune_stale_stable_nodes && nr == 1) {
1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450
			/*
			 * If there's not just one entry it would
			 * corrupt memory, better BUG_ON. In KSM
			 * context with no lock held it's not even
			 * fatal.
			 */
			BUG_ON(stable_node->hlist.first->next);

			/*
			 * There's just one entry and it is below the
			 * deduplication limit so drop the chain.
			 */
			rb_replace_node(&stable_node->node, &found->node,
					root);
			free_stable_node(stable_node);
			ksm_stable_node_chains--;
			ksm_stable_node_dups--;
			/*
			 * NOTE: the caller depends on the stable_node
			 * to be equal to stable_node_dup if the chain
			 * was collapsed.
			 */
			*_stable_node = found;
			/*
			 * Just for robustness as stable_node is
			 * otherwise left as a stable pointer, the
			 * compiler shall optimize it away at build
			 * time.
			 */
			stable_node = NULL;
		} else if (stable_node->hlist.first != &found->hlist_dup &&
			   __is_page_sharing_candidate(found, 1)) {
			/*
			 * If the found stable_node dup can accept one
			 * more future merge (in addition to the one
			 * that is underway) and is not at the head of
			 * the chain, put it there so next search will
			 * be quicker in the !prune_stale_stable_nodes
			 * case.
			 *
			 * NOTE: it would be inaccurate to use nr > 1
			 * instead of checking the hlist.first pointer
			 * directly, because in the
			 * prune_stale_stable_nodes case "nr" isn't
			 * the position of the found dup in the chain,
			 * but the total number of dups in the chain.