/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
#include <linux/frontswap.h>

#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

#include <linux/uaccess.h>
#include <asm/pgtable.h>

#include "internal.h"

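/*
 * i_blocks is accounted in 512-byte units, hence PAGE_SIZE/512;
 * VM_ACCT() rounds a byte count up to whole pages for vm accounting.
 */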
#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

#ifdef CONFIG_TMPFS
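/*
 * Default limits when the mount does not specify size=, nr_blocks= or
 * nr_inodes=: block usage is capped at half of RAM, and inodes at half
 * of the RAM pages, further bounded by available lowmem.
 */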
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
}
#endif

static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
			     struct page **pagep, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp,
		gfp_t gfp, struct vm_area_struct *vma,
		struct vm_fault *vmf, vm_fault_t *fault_type);

int shmem_getpage(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
		mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (shmem_acct_block(info->flags, pages))
		return false;

	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks - pages) > 0)
			goto unacct;
		percpu_counter_add(&sbinfo->used_blocks, pages);
	}

	return true;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return false;
}

static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
		shmem_inode_unacct_blocks(inode, freed);
	}
}

bool shmem_charge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	if (!shmem_inode_acct_block(inode, pages))
		return false;

	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
	inode->i_mapping->nrpages += pages;

	spin_lock_irqsave(&info->lock, flags);
	info->alloced += pages;
	inode->i_blocks += pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	/* nrpages adjustment done by __delete_from_page_cache() or caller */

	spin_lock_irqsave(&info->lock, flags);
	info->alloced -= pages;
	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	shmem_inode_unacct_blocks(inode, pages);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly;

#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}
#endif

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct page *page;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int removed = 0, split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			removed++;
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			removed++;
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto leave;

		page = find_get_page(inode->i_mapping,
				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
		if (!page)
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!PageTransHuge(page)) {
			put_page(page);
			goto drop;
		}

		/*
		 * Leave the inode on the list if we failed to lock
		 * the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!trylock_page(page)) {
			put_page(page);
			goto leave;
		}

		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);

		/* If split failed leave the inode on the list */
		if (ret)
			goto leave;

		split++;
drop:
		list_del_init(&info->shrinklist);
		removed++;
leave:
		iput(inode);
	}

	spin_lock(&sbinfo->shrinklist_lock);
	list_splice_tail(&list, &sbinfo->shrinklist);
	sbinfo->shrinklist_len -= removed;
	spin_unlock(&sbinfo->shrinklist_lock);

	return split;
}

static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */

#define shmem_huge SHMEM_HUGE_DENY

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */

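/*
 * True if this mount may use huge pages: either forced globally or
 * enabled per-mount, and not globally denied.
 */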
static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
{
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
	    (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
	    shmem_huge != SHMEM_HUGE_DENY)
		return true;
	return false;
}

/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
	unsigned long i = 0;
	unsigned long nr = 1UL << compound_order(page);

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
	VM_BUG_ON(expected && PageTransHuge(page));

	page_ref_add(page, nr);
	page->mapping = mapping;
	page->index = index;

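	/*
	 * Insert all nr subpage slots under xa_lock; if the XArray needs
	 * new nodes, xas_nomem() allocates them and we retry from scratch.
	 */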
	do {
		void *entry;
		xas_lock_irq(&xas);
		entry = xas_find_conflict(&xas);
		if (entry != expected)
			xas_set_err(&xas, -EEXIST);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
next:
		xas_store(&xas, page + i);
		if (++i < nr) {
			xas_next(&xas);
			goto next;
		}
		if (PageTransHuge(page)) {
			count_vm_event(THP_FILE_ALLOC);
			__inc_node_page_state(page, NR_SHMEM_THPS);
		}
		mapping->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		page->mapping = NULL;
		page_ref_sub(page, nr);
		return xas_error(&xas);
	}

	return 0;
}

/*
 * Like delete_from_page_cache, but substitutes swap for page.
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
	struct address_space *mapping = page->mapping;
	int error;

	VM_BUG_ON_PAGE(PageCompound(page), page);

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, page->index, page, radswap);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_node_page_state(page, NR_FILE_PAGES);
	__dec_node_page_state(page, NR_SHMEM);
	xa_unlock_irq(&mapping->i_pages);
	put_page(page);
	BUG_ON(error);
}

/*
 * Remove swap entry from page cache, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned long swapped = 0;

	rcu_read_lock();
	xas_for_each(&xas, page, end - 1) {
		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			swapped++;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma is swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping,
			linear_page_index(vma, vma->vm_start),
			linear_page_index(vma, vma->vm_end));
}

/*
 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;

	pagevec_init(&pvec);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping)) {
		/*
		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
		 */
		pvec.nr = find_get_entries(mapping, index,
					   PAGEVEC_SIZE, pvec.pages, indices);
		if (!pvec.nr)
			break;
		index = indices[pvec.nr - 1] + 1;
		pagevec_remove_exceptionals(&pvec);
		check_move_unevictable_pages(&pvec);
		pagevec_release(&pvec);
		cond_resched();
	}
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	pagevec_init(&pvec);
	index = start;
	while (index < end) {
		pvec.nr = find_get_entries(mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			pvec.pages, indices);
		if (!pvec.nr)
			break;
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (xa_is_value(page)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);

			if (!trylock_page(page))
				continue;

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

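	/*
	 * Truncation above removed only whole pages: zero out by hand any
	 * partial page at either end of the hole.
	 */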
	if (partial_start) {
		struct page *page = NULL;
		shmem_getpage(inode, start - 1, &page, SGP_READ);
		if (page) {
			unsigned int top = PAGE_SIZE;
			if (start > end) {
				top = partial_end;
				partial_end = 0;
			}
			zero_user_segment(page, partial_start, top);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	if (partial_end) {
		struct page *page = NULL;
		shmem_getpage(inode, end, &page, SGP_READ);
		if (page) {
			zero_user_segment(page, 0, partial_end);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	if (start >= end)
		return;

	index = start;
	while (index < end) {
		cond_resched();

		pvec.nr = find_get_entries(mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				pvec.pages, indices);
		if (!pvec.nr) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (xa_is_value(page)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, index, page)) {
					/* Swap was replaced by page: retry */
					index--;
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			lock_page(page);

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				/*
				 * Partial thp truncate due 'start' in middle
				 * of THP: don't need to look on these pages
				 * again on !pvec.nr restart.
				 */
				if (index != round_down(end, HPAGE_PMD_NR))
					start++;
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				} else {
					/* Page was replaced by swap: retry */
					unlock_page(page);
					index--;
					break;
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}

	spin_lock_irq(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
}

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode->i_ctime = inode->i_mtime = current_time(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_getattr(const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);

	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
	}
	generic_fillattr(inode, stat);

	if (is_huge_enabled(sb_info))
		stat->blksize = HPAGE_PMD_SIZE;

	return 0;
}

static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int error;

	error = setattr_prepare(dentry, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		/* protected by i_mutex */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;

		if (newsize != oldsize) {
			error = shmem_reacct_size(SHMEM_I(inode)->flags,
					oldsize, newsize);
			if (error)
				return error;
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = current_time(inode);
		}
		if (newsize <= oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
			if (info->alloced)
				shmem_truncate_range(inode,
							newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);

			/*
			 * Part of the huge page can be beyond i_size: subject
			 * to shrink under memory pressure.
			 */
			if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
				spin_lock(&sbinfo->shrinklist_lock);
				/*
				 * _careful to defend against unlocked access to
				 * ->shrink_list in shmem_unused_huge_shrink()
				 */
				if (list_empty_careful(&info->shrinklist)) {
					list_add_tail(&info->shrinklist,
							&sbinfo->shrinklist);
					sbinfo->shrinklist_len++;
				}
				spin_unlock(&sbinfo->shrinklist_lock);
			}
		}
	}

	setattr_copy(inode, attr);
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(inode, inode->i_mode);
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (inode->i_mapping->a_ops == &shmem_aops) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->shrinklist)) {
			spin_lock(&sbinfo->shrinklist_lock);
			if (!list_empty(&info->shrinklist)) {
				list_del_init(&info->shrinklist);
				sbinfo->shrinklist_len--;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}
		while (!list_empty(&info->swaplist)) {
			/* Wait while shmem_unuse() is scanning this inode... */
			wait_var_event(&info->stop_eviction,
				       !atomic_read(&info->stop_eviction));
			mutex_lock(&shmem_swaplist_mutex);
			/* ...but beware of the race if we peeked too early */
			if (!atomic_read(&info->stop_eviction))
				list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	simple_xattrs_free(&info->xattrs);
	WARN_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	clear_inode(inode);
}

extern struct swap_info_struct *swap_info[];

static int shmem_find_swap_entries(struct address_space *mapping,
				   pgoff_t start, unsigned int nr_entries,
				   struct page **entries, pgoff_t *indices,
				   unsigned int type, bool frontswap)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	swp_entry_t entry;
	unsigned int ret = 0;

	if (!nr_entries)
		return 0;

	rcu_read_lock();
	xas_for_each(&xas, page, ULONG_MAX) {
		if (xas_retry(&xas, page))
			continue;

		if (!xa_is_value(page))
			continue;

		entry = radix_to_swp_entry(page);
		if (swp_type(entry) != type)
			continue;
		if (frontswap &&
		    !frontswap_test(swap_info[type], swp_offset(entry)))
			continue;

		indices[ret] = xas.xa_index;
		entries[ret] = page;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
		if (++ret == nr_entries)
			break;
	}
	rcu_read_unlock();

	return ret;
}

/*
 * Move the swapped pages for an inode to page cache. Returns the count
 * of pages swapped in, or the error in case of failure.
 */
static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
				    pgoff_t *indices)
{
	int i = 0;
	int ret = 0;
	int error = 0;
	struct address_space *mapping = inode->i_mapping;

	for (i = 0; i < pvec.nr; i++) {
		struct page *page = pvec.pages[i];

		if (!xa_is_value(page))
			continue;
		error = shmem_swapin_page(inode, indices[i],
					  &page, SGP_CACHE,
					  mapping_gfp_mask(mapping),
					  NULL, NULL);
		if (error == 0) {
			unlock_page(page);
			put_page(page);
			ret++;
		}
		if (error == -ENOMEM)
			break;
		error = 0;
1178
	}
1179 1180
	return error ? error : ret;
}
1181

1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203
/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 */
static int shmem_unuse_inode(struct inode *inode, unsigned int type,
			     bool frontswap, unsigned long *fs_pages_to_unuse)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start = 0;
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
	int ret = 0;

	pagevec_init(&pvec);
	do {
		unsigned int nr_entries = PAGEVEC_SIZE;

		if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
			nr_entries = *fs_pages_to_unuse;

		pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
						  pvec.pages, indices,
1204
						  type, frontswap);
1205 1206 1207
		if (pvec.nr == 0) {
			ret = 0;
			break;
1208
		}
1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225

		ret = shmem_unuse_swap_entries(inode, pvec, indices);
		if (ret < 0)
			break;

		if (frontswap_partial) {
			*fs_pages_to_unuse -= ret;
			if (*fs_pages_to_unuse == 0) {
				ret = FRONTSWAP_PAGES_UNUSED;
				break;
			}
		}

		start = indices[pvec.nr - 1];
	} while (true);

	return ret;
}

/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * unused.
 */
int shmem_unuse(unsigned int type, bool frontswap,
		unsigned long *fs_pages_to_unuse)
{
	struct shmem_inode_info *info, *next;
	int error = 0;

	if (list_empty(&shmem_swaplist))
		return 0;

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
		if (!info->swapped) {
			list_del_init(&info->swaplist);
			continue;
		}
		/*
		 * Drop the swaplist mutex while searching the inode for swap;
		 * but before doing so, make sure shmem_evict_inode() will not
		 * remove placeholder inode from swaplist, nor let it be freed
		 * (igrab() would protect from unlink, but not from unmount).
		 */
		atomic_inc(&info->stop_eviction);
		mutex_unlock(&shmem_swaplist_mutex);

		error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
					  fs_pages_to_unuse);
		cond_resched();

		mutex_lock(&shmem_swaplist_mutex);
		next = list_next_entry(info, swaplist);
		if (!info->swapped)
			list_del_init(&info->swaplist);
		if (atomic_dec_and_test(&info->stop_eviction))
			wake_up_var(&info->stop_eviction);
		if (error)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);

	return error;
}

/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	struct address_space *mapping;
	struct inode *inode;
	swp_entry_t swap;
	pgoff_t index;

	VM_BUG_ON_PAGE(PageCompound(page), page);
	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * Our capabilities prevent regular writeback or sync from ever calling
	 * shmem_writepage; but a stacking filesystem might use ->writepage of
	 * its underlying filesystem, in which case tmpfs should write out to
	 * swap only in response to memory pressure, and not for the writeback
	 * threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated page arriving here is now to initialize it and write it.
	 *
	 * That's okay for a page already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this page in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap.  So
	 * reactivate the page, and let shmem_fallocate() quit when too many.
	 */
	if (!PageUptodate(page)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			spin_lock(&inode->i_lock);
			shmem_falloc = inode->i_private;
			if (shmem_falloc &&
			    !shmem_falloc->waitq &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped++;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}

	swap = get_swap_page(page);
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now before the page is
	 * moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've incremented swapped, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under this mutex.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add(&info->swaplist, &shmem_swaplist);

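	/*
	 * Only once the page is in the swap cache do we update the swap
	 * accounting and replace the page-cache entry with the swap entry.
	 */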
	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		info->swapped++;
		spin_unlock_irq(&info->lock);

		swap_shmem_alloc(swap);
		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}