// SPDX-License-Identifier: GPL-2.0-only
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

/**
 * DOC: Overview
 *
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The stable tree node includes information required for reverse
 * mapping from a KSM page to virtual addresses that map this page.
 *
 * In order to avoid large latencies of the rmap walks on KSM pages,
 * KSM maintains two types of nodes in the stable tree:
 *
 * * the regular nodes that keep the reverse mapping structures in a
 *   linked list
 * * the "chains" that link nodes ("dups") that represent the same
 *   write protected memory content, but each "dup" corresponds to a
 *   different KSM page copy of that content
 *
 * Internally, the regular nodes, "dups" and "chains" are represented
 * using the same :c:type:`struct stable_node` structure.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a red-black tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */
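
/*
 * Illustrative userspace sketch of how an area is opted into the scanning
 * described above (the mapping length "len" is just a placeholder): the
 * application calls madvise(2) with MADV_MERGEABLE, and ksmd is started
 * with "echo 1 > /sys/kernel/mm/ksm/run".
 *
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (buf != MAP_FAILED && madvise(buf, len, MADV_MERGEABLE))
 *		perror("madvise(MADV_MERGEABLE)");
 */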

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */
#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
				/* to mask all the flags */
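
/*
 * Illustrative note: because the addresses tracked here are page aligned,
 * their low bits are free to carry the flags and seqnr defined above; the
 * code below recovers the two halves with, for example:
 *
 *	unsigned long va = rmap_item->address & PAGE_MASK;
 *	unsigned char age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
 *
 * as done when an rmap_item is removed from the unstable tree.
 */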

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

static inline void stable_node_chain_add_dup(struct stable_node *dup,
					     struct stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is under
	 * pressure, which may lead to hung task warnings.  Adding __GFP_HIGH
	 * grants access to memory reserves, helping to avoid this problem.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 *
 * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.  We also do not want to enforce
 * protection keys here anyway.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	vm_fault_t ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry break_cow() on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
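
/*
 * Illustrative note: the tree walks further down pick their root via this
 * helper, for example:
 *
 *	struct rb_root *root = root_stable_tree + get_kpfn_nid(kpfn);
 *
 * which always resolves to root_stable_tree[0] while merge_across_nodes
 * is set to 1.
 */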

static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
						   struct rb_root *root)
{
	struct stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = NUMA_NO_NODE; /* debug */
#endif
		ksm_stable_node_chains++;

		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree and at the same time remove the old
		 * stable node.
		 */
		rb_replace_node(&dup->node, &chain->node, root);

		/*
		 * Move the old stable node to the second dimension
		 * queued in the hlist_dup. The invariant is that all
		 * dup stable_nodes in the chain->hlist point to pages
		 * that are wrprotected and have the exact same
		 * content.
		 */
		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}

static inline void free_stable_node_chain(struct stable_node *chain,
					  struct rb_root *root)
{
	rb_erase(&chain->node, root);
	free_stable_node(chain);
	ksm_stable_node_chains--;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	/* check it's not STABLE_NODE_CHAIN or negative */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	/*
	 * We need the second aligned pointer of the migrate_nodes
	 * list_head to stay clear from the rb_parent_color union
	 * (aligned and different than any node) and also different
	 * from &migrate_nodes. This will verify that future list.h changes
	 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
	 */
#if defined(GCC_VERSION) && GCC_VERSION >= 40903
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}

enum get_ksm_page_flags {
	GET_KSM_PAGE_NOLOCK,
	GET_KSM_PAGE_LOCK,
	GET_KSM_PAGE_TRYLOCK
};

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive.  So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing back to this stable node.  This relies on freeing a PageAnon
 * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping.
 * is on its way to being freed; but it is an anomaly to bear in mind.
 */
static struct page *get_ksm_page(struct stable_node *stable_node,
				 enum get_ksm_page_flags flags)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_ref_freeze().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * the same applies in the reuse_ksm_page() case; but if page is swapcache
	 * in migrate_page_move_mapping(), it might still be our page,
	 * in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * in the ref_freeze section of __remove_mapping(); but Anon
		 * page->mapping reset to NULL later, in free_pages_prepare().
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (flags == GET_KSM_PAGE_TRYLOCK) {
		if (!trylock_page(page)) {
			put_page(page);
			return ERR_PTR(-EBUSY);
		}
	} else if (flags == GET_KSM_PAGE_LOCK)
		lock_page(page);

	if (flags != GET_KSM_PAGE_NOLOCK) {
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}
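
/*
 * Illustrative calling pattern (mirroring the callers below): the returned
 * page, if any, carries one extra reference, and is locked when
 * GET_KSM_PAGE_LOCK was requested, so a caller typically does:
 *
 *	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
 *	if (!page)
 *		goto out;	(the stale node has already been removed)
 *	...
 *	unlock_page(page);
 *	put_page(page);
 */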

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
	return PageKsm(page) ? page_rmapping(page) : NULL;
}

static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * This should not happen: but if it does, just refuse to let
		 * merge_across_nodes be switched - there is no need to panic.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page to be recognized
		 * right up until it is freed; but the node is safe to remove.
		 * This page might be in a pagevec waiting to be freed,
		 * or it might be PageSwapCache (perhaps under writeback),
		 * or it might have been removed from swapcache a moment ago.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

static int remove_stable_node_chain(struct stable_node *stable_node,
				    struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
		up_read(&mm->mmap_sem);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = xxhash(addr, PAGE_SIZE, 0);
	kunmap_atomic(addr);
	return checksum;
}
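
/*
 * Illustrative note: this checksum implements rule 2) of the overview at
 * the top of the file - the scanner remembers it in rmap_item->oldchecksum
 * and only considers a page for the unstable tree once the value has been
 * seen to repeat on a later pass, roughly:
 *
 *	checksum = calc_checksum(page);
 *	if (rmap_item->oldchecksum != checksum) {
 *		rmap_item->oldchecksum = checksum;
 *		return;		(try again on a later full scan)
 *	}
 */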

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
	};
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
						mm_tlb_flush_pending(mm)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racy and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check;
		 * this assures us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only, not changing it to point to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);

		if (pte_protnone(entry))
			entry = pte_mkclean(pte_clear_savedwrite(entry));
		else
			entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	*orig_pte = *pvmw.pte;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, false);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
					       vma->vm_page_prot));
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);