// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
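/*
 * For example, with 4KB pages (PAGE_SHIFT == 12, so SWAP_RA_WIN_SHIFT == 6)
 * the per-VMA readahead state packed by SWAP_RA_VAL() looks like:
 *
 *	bits 0-5	readahead hits		(SWAP_RA_HITS_MASK == 0x3f)
 *	bits 6-11	readahead window	(SWAP_RA_WIN_MASK  == 0xfc0)
 *	bits 12 and up	last faulting address, page aligned
 *
 * so SWAP_RA_VAL(addr, 8, 3) == (addr & PAGE_MASK) | (8 << 6) | 3.
 */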

#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
	unsigned int i, j, nr;
	unsigned long ret = 0;
	struct address_space *spaces;
	struct swap_info_struct *si;

	for (i = 0; i < MAX_SWAPFILES; i++) {
		swp_entry_t entry = swp_entry(i, 1);

		/* Avoid get_swap_device() to warn for bad swap entry */
		if (!swp_swap_info(entry))
			continue;
		/* Prevent swapoff to free swapper_spaces */
		si = get_swap_device(entry);
		if (!si)
			continue;
		nr = nr_swapper_spaces[i];
		spaces = swapper_spaces[i];
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
		put_swap_device(si);
	}
	return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
	unsigned long i, nr = compound_nr(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);
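	/*
	 * Insert all nr subpages into the XArray under the lock.  On -ENOMEM,
	 * xas_nomem() preallocates the missing nodes outside the lock and
	 * asks us to retry, so the loop below may run more than once before
	 * it either succeeds or gives up with an error.
	 */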

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
			set_page_private(page + i, entry.val + i);
			xas_store(&xas, page + i);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (!xas_error(&xas))
		return 0;

	ClearPageSwapCache(page);
	page_ref_sub(page, nr);
	return xas_error(&xas);
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	int i, nr = hpage_nr_pages(page);
	pgoff_t idx = swp_offset(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, NULL);
		VM_BUG_ON_PAGE(entry != page + i, entry);
		set_page_private(page + i, 0);
		xas_next(&xas);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 *
 * Return: 1 on success, 0 on failure.
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the page will be dirtied in unmap because its pte should be
	 * dirty. A special case is MADV_FREE page. The page's pte could have
	 * the dirty bit cleared while its SwapBacked bit is still set, because
	 * clearing the dirty bit and the SwapBacked bit is not protected by a
	 * lock. For such a page, unmap will not set the dirty bit, so page
	 * reclaim will not write the page out. This can cause data corruption
	 * when the page is swapped in later. Always setting the dirty bit for
	 * the page solves the problem.
	 */
	set_page_dirty(page);

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(page, entry);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_page(page, entry);
	page_ref_sub(page, hpage_nr_pages(page));
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;
	struct swap_info_struct *si;

	si = get_swap_device(entry);
	if (!si)
		return NULL;
	page = find_get_page(swap_address_space(entry), swp_offset(entry));
	put_swap_device(si);

	INC_CACHE_INFO(find_total);
	if (page) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		INC_CACHE_INFO(find_success);
		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(PageTransCompound(page)))
			return page;

		readahead = TestClearPageReadahead(page);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	}

	return page;
}

struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct page *found_page = NULL, *new_page = NULL;
	struct swap_info_struct *si;
	int err;
	*new_page_allocated = false;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		si = get_swap_device(entry);
		if (!si)
			break;
		found_page = find_get_page(swap_address_space(entry),
					   swp_offset(entry));
		put_swap_device(si);
		if (found_page)
			break;

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in a later part of this
		 * function, or else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			break;
		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet.
			 */
			cond_resched();
			continue;
		} else if (err)		/* swp entry is obsolete ? */
			break;

		/* May fail (-ENOMEM) if XArray node allocation failed. */
		__SetPageLocked(new_page);
		__SetPageSwapBacked(new_page);
		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
		if (likely(!err)) {
			/* Initiate read into locked page */
			SetPageWorkingset(new_page);
			lru_cache_add_anon(new_page);
			*new_page_allocated = true;
			return new_page;
		}
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		put_swap_page(new_page, entry);
	} while (err != -ENOMEM);

	if (new_page)
		put_page(new_page);
	return found_page;
}
/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
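	/*
	 * Example: with hits == 5 the code below starts from 5 + 2 = 7 and
	 * rounds that up to the next power of two (>= 4), giving a window of
	 * 8 pages; the result is then capped at max_pages and never drops
	 * below half of the previous window, so the window shrinks gradually.
	 */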
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		prev_offset = offset;
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold read mmap_sem if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	/* Test swap type to make sure the dereference is safe */
	if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
		struct inode *inode = si->swap_file->f_mapping->host;
		if (inode_read_congested(inode))
			goto skip;
	}

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;
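	/*
	 * Example: with a window of 8 pages (mask == 7) and a faulting swap
	 * offset of 21, start_offset/end_offset are 16 and 23, so the loop
	 * below reads the whole aligned cluster 16..23.  Offset 0 is skipped
	 * because it holds the swap header, and end_offset is clamped to the
	 * last valid offset of the device.
	 */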
	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	kvfree(swapper_spaces[type]);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

static void swap_ra_info(struct vm_fault *vmf,
			struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	swp_entry_t entry;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
	entry = pte_to_swp_entry(*pte);
	if ((unlikely(non_swap_entry(entry)))) {
		pte_unmap(orig_pte);
		return;
	}

	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1) {
		pte_unmap(orig_pte);
		return;
	}

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
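	/*
	 * start and end now bound the virtual pages to read around the fault:
	 * ahead of the fault for an apparently forward sequential access,
	 * behind it for a backward one, and roughly centred on the fault
	 * address otherwise, clamped to the VMA and to the PMD holding the
	 * fault by swap_ra_clamp_pfn() above.
	 */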
	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @fentry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_sem if vmf->vma is not NULL.
 */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				       struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {0,};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (pte_none(pentry))
			continue;
		if (pte_present(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (i != ra_info.offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	lru_add_drain();
skip:
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     ra_info.win == 1);
}
/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * It's the main entry function for swap readahead. Depending on the
 * configuration, it reads ahead blocks using either cluster-based
 * (i.e. physical disk based) or vma-based (i.e. virtual address based
 * on the faulting address) readahead.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	return swap_use_vma_readahead() ?
			swap_vma_readahead(entry, gfp_mask, vmf) :
			swap_cluster_readahead(entry, gfp_mask, vmf);
}

#ifdef CONFIG_SYSFS
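/*
 * VMA based readahead can be toggled at run time via the sysfs knob
 * created below, e.g.:
 *
 *	echo false > /sys/kernel/mm/swap/vma_ra_enabled
 *
 * which makes swapin_readahead() fall back to cluster based readahead.
 */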
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
		enable_vma_readahead = true;
	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
		enable_vma_readahead = false;
	else
		return -EINVAL;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr =
	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
	       vma_ra_enabled_store);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif