/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* This context's GFP mask */
	gfp_t gfp_mask;

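	/* Can pages be written out to disk as part of reclaim? */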
	int may_writepage;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;

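	/* Swappiness, 0..100; higher biases reclaim towards swapping */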
	int swappiness;

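	/* Left set by shrink_zones() when every zone had to be skipped */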
	int all_unreclaimable;
};

/*
 * The list of shrinker callbacks used to apply pressure to
 * ageable caches.
 */
struct shrinker {
	shrinker_t		shrinker;
	struct list_head	list;
	int			seeks;	/* seeks to recreate an obj */
	long			nr;	/* objs pending delete */
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

/*
 * Add a shrinker callback to be called from the vm
 */
struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
{
        struct shrinker *shrinker;

        shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
        if (shrinker) {
	        shrinker->shrinker = theshrinker;
	        shrinker->seeks = seeks;
	        shrinker->nr = 0;
	        down_write(&shrinker_rwsem);
	        list_add_tail(&shrinker->list, &shrinker_list);
	        up_write(&shrinker_rwsem);
	}
	return shrinker;
}
EXPORT_SYMBOL(set_shrinker);
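
/*
 * Usage sketch (illustrative only; my_cache_shrink and my_shrinker are
 * hypothetical names).  A shrinker_t callback reports the number of
 * freeable objects when called with nr_to_scan == 0; otherwise it scans
 * up to nr_to_scan objects and returns how many remain, or -1 to abort:
 *
 *	static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask);
 *	...
 *	my_shrinker = set_shrinker(DEFAULT_SEEKS, my_cache_shrink);
 */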

/*
 * Remove one
 */
void remove_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker);
}
EXPORT_SYMBOL(remove_shrinker);

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
			unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (scanned == 0)
		scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem))
		return 1;	/* Assume we'll be able to shrink next time */

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		unsigned long total_scan;
		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);

		delta = (4 * scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		shrinker->nr += delta;
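
		/*
		 * Worked example (illustrative numbers): with seeks == 2,
		 * scanned == 128, max_pass == 1000 and lru_pages == 4000,
		 * delta = (4 * 128 / 2) * 1000 / 4001, i.e. about 63
		 * objects are queued for scanning.
		 */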
		if (shrinker->nr < 0) {
			printk(KERN_ERR "%s: nr=%ld\n",
					__FUNCTION__, shrinker->nr);
			shrinker->nr = max_pass;
		}

		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimate number of
		 * freeable entries.
		 */
		if (shrinker->nr > max_pass * 2)
			shrinker->nr = max_pass * 2;

		total_scan = shrinker->nr;
		shrinker->nr = 0;

		while (total_scan >= SHRINK_BATCH) {
			long this_scan = SHRINK_BATCH;
			int shrink_ret;
			int nr_before;

			nr_before = (*shrinker->shrinker)(0, gfp_mask);
			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, this_scan);
			total_scan -= this_scan;

			cond_resched();
		}

		shrinker->nr += total_scan;
	}
	up_read(&shrinker_rwsem);
	return ret;
}

/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
	struct address_space *mapping;

	/* Page is in somebody's page tables. */
	if (page_mapped(page))
		return 1;

	/* Be more reluctant to reclaim swapcache than pagecache */
	if (PageSwapCache(page))
		return 1;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	/* File is mmap'd by somebody? */
	return mapping_mapped(mapping);
}

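/*
 * A freeable pagecache page carries exactly two references: the page
 * cache itself and the caller that isolated it; PagePrivate accounts
 * for one extra reference held via attached buffer heads.
 */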
static inline int is_page_cache_freeable(struct page *page)
{
	return page_count(page) - !!PagePrivate(page) == 2;
}

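/*
 * Writeback from reclaim is throttled: only PF_SWAPWRITE tasks (kswapd
 * and friends), uncongested queues, and the queue the caller is itself
 * writing to may issue I/O here, so reclaim does not stall behind a
 * congested device.
 */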
static int may_write_to_queue(struct backing_dev_info *bdi)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping) {
		if (error == -ENOSPC)
			set_bit(AS_ENOSPC, &mapping->flags);
		else
			set_bit(AS_EIO, &mapping->flags);
	}
	unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in generic_file_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 * See swapfile.c:page_queue_congested().
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (PagePrivate(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __FUNCTION__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}
		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

int remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	write_lock_irq(&mapping->tree_lock);

	/*
	 * The non-racy check for busy page.  It is critical to check
	 * PageDirty _after_ making sure that the page is freeable and
	 * not in use by anybody.  (pagecache + us == 2)
	 */
	if (unlikely(page_count(page) != 2))
		goto cannot_free;
	smp_rmb();
	if (unlikely(PageDirty(page)))
		goto cannot_free;

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		write_unlock_irq(&mapping->tree_lock);
		swap_free(swap);
		__put_page(page);	/* The pagecache ref */
		return 1;
	}

	__remove_from_page_cache(page);
	write_unlock_irq(&mapping->tree_lock);
	__put_page(page);
	return 1;

cannot_free:
	write_unlock_irq(&mapping->tree_lock);
	return 0;
}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
					struct scan_control *sc)
{
	LIST_HEAD(ret_pages);
	struct pagevec freed_pvec;
	int pgactivate = 0;
	unsigned long nr_reclaimed = 0;

	cond_resched();

	pagevec_init(&freed_pvec, 1);
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		int referenced;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		if (TestSetPageLocked(page))
			goto keep;

		VM_BUG_ON(PageActive(page));

		sc->nr_scanned++;

		if (!sc->may_swap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		if (PageWriteback(page))
			goto keep_locked;

		referenced = page_referenced(page, 1);
		/* In active use or really unfreeable?  Activate it. */
		if (referenced && page_mapping_inuse(page))
			goto activate_locked;

#ifdef CONFIG_SWAP
		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page))
			if (!add_to_swap(page, GFP_ATOMIC))
				goto activate_locked;
#endif /* CONFIG_SWAP */

		mapping = page_mapping(page);
		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, 0)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			if (referenced)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch(pageout(page, mapping)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page) || PageDirty(page))
					goto keep;
				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (TestSetPageLocked(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping. 
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (PagePrivate(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1)
				goto free_it;
		}

		if (!mapping || !remove_mapping(mapping, page))
			goto keep_locked;

free_it:
		unlock_page(page);
		nr_reclaimed++;
		if (!pagevec_add(&freed_pvec, page))
			__pagevec_release_nonlru(&freed_pvec);
		continue;

activate_locked:
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page));
	}
	list_splice(&ret_pages, page_list);
	if (pagevec_count(&freed_pvec))
		__pagevec_release_nonlru(&freed_pvec);
	count_vm_events(PGACTIVATE, pgactivate);
	return nr_reclaimed;
}

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @src:	The LRU list to pull pages off.
 * @dst:	The temp list to put pages on to.
 * @scanned:	The number of pages that were scanned.
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct list_head *src, struct list_head *dst,
		unsigned long *scanned)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct list_head *target;
		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		list_del(&page->lru);
		target = src;
		if (likely(get_page_unless_zero(page))) {
			/*
			 * Be careful not to clear PageLRU until after we're
			 * sure the page is not being freed elsewhere -- the
			 * page release code relies on it.
			 */
			ClearPageLRU(page);
			target = dst;
			nr_taken++;
		} /* else it is being freed elsewhere */

		list_add(&page->lru, target);
	}

	*scanned = scan;
	return nr_taken;
}

/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
static unsigned long shrink_inactive_list(unsigned long max_scan,
				struct zone *zone, struct scan_control *sc)
{
	LIST_HEAD(page_list);
	struct pagevec pvec;
	unsigned long nr_scanned = 0;
	unsigned long nr_reclaimed = 0;

	pagevec_init(&pvec, 1);

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	do {
		struct page *page;
		unsigned long nr_taken;
		unsigned long nr_scan;
		unsigned long nr_freed;

		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
					     &zone->inactive_list,
					     &page_list, &nr_scan);
		zone->nr_inactive -= nr_taken;
		zone->pages_scanned += nr_scan;
		spin_unlock_irq(&zone->lru_lock);

		nr_scanned += nr_scan;
		nr_freed = shrink_page_list(&page_list, sc);
		nr_reclaimed += nr_freed;
		local_irq_disable();
		if (current_is_kswapd()) {
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
			__count_vm_events(KSWAPD_STEAL, nr_freed);
		} else
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

		if (nr_taken == 0)
			goto done;

		spin_lock(&zone->lru_lock);
		/*
		 * Put back any unfreeable pages.
		 */
		while (!list_empty(&page_list)) {
			page = lru_to_page(&page_list);
			VM_BUG_ON(PageLRU(page));
			SetPageLRU(page);
			list_del(&page->lru);
			if (PageActive(page))
				add_page_to_active_list(zone, page);
			else
				add_page_to_inactive_list(zone, page);
			if (!pagevec_add(&pvec, page)) {
				spin_unlock_irq(&zone->lru_lock);
				__pagevec_release(&pvec);
				spin_lock_irq(&zone->lru_lock);
			}
		}
	} while (nr_scanned < max_scan);
	spin_unlock(&zone->lru_lock);
done:
	local_irq_enable();
	pagevec_release(&pvec);
	return nr_reclaimed;
}

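/*
 * A zone is considered close to OOM once it has been scanned three
 * times over (active + inactive) without sufficient progress; see the
 * force_reclaim_mapped handling in shrink_active_list() below.
 */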
static inline int zone_is_near_oom(struct zone *zone)
{
	return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
				struct scan_control *sc)
{
	unsigned long pgmoved;
	int pgdeactivate = 0;
	unsigned long pgscanned;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
	struct page *page;
	struct pagevec pvec;
	int reclaim_mapped = 0;

	if (sc->may_swap) {
		long mapped_ratio;
		long distress;
		long swap_tendency;

		if (zone_is_near_oom(zone))
			goto force_reclaim_mapped;

		/*
		 * `distress' is a measure of how much trouble we're having
		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
		 */
		distress = 100 >> zone->prev_priority;

		/*
		 * The point of this algorithm is to decide when to start
		 * reclaiming mapped memory instead of just pagecache.  Work
		 * out how much memory is mapped.
		 */
		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
				global_page_state(NR_ANON_PAGES)) * 100) /
					vm_total_pages;

		/*
		 * Now decide how much we really want to unmap some pages.  The
		 * mapped ratio is downgraded - just because there's a lot of
		 * mapped memory doesn't necessarily mean that page reclaim
		 * isn't succeeding.
		 *
		 * The distress ratio is important - we don't want to start
		 * going oom.
		 *
		 * A 100% value of vm_swappiness overrides this algorithm
		 * altogether.
		 */
		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
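
		/*
		 * Illustrative numbers: with 60% of memory mapped, a
		 * prev_priority of 2 (distress == 25) and the default
		 * swappiness of 60, swap_tendency is 30 + 25 + 60 == 115,
		 * crossing the threshold of 100 checked below.
		 */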

		/*
		 * Now use this metric to decide whether to start moving mapped
		 * memory onto the inactive list.
		 */
		if (swap_tendency >= 100)
force_reclaim_mapped:
			reclaim_mapped = 1;
	}

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
				    &l_hold, &pgscanned);
	zone->pages_scanned += pgscanned;
	zone->nr_active -= pgmoved;
	spin_unlock_irq(&zone->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);
		if (page_mapped(page)) {
			if (!reclaim_mapped ||
			    (total_swap_pages == 0 && PageAnon(page)) ||
			    page_referenced(page, 0)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}
		list_add(&page->lru, &l_inactive);
	}

	pagevec_init(&pvec, 1);
	pgmoved = 0;
	spin_lock_irq(&zone->lru_lock);
	while (!list_empty(&l_inactive)) {
		page = lru_to_page(&l_inactive);
		prefetchw_prev_lru_page(page, &l_inactive, flags);
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(!PageActive(page));
		ClearPageActive(page);

		list_move(&page->lru, &zone->inactive_list);
		pgmoved++;
		if (!pagevec_add(&pvec, page)) {
			zone->nr_inactive += pgmoved;
			spin_unlock_irq(&zone->lru_lock);
			pgdeactivate += pgmoved;
			pgmoved = 0;
			if (buffer_heads_over_limit)
				pagevec_strip(&pvec);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	zone->nr_inactive += pgmoved;
	pgdeactivate += pgmoved;
	if (buffer_heads_over_limit) {
		spin_unlock_irq(&zone->lru_lock);
		pagevec_strip(&pvec);
		spin_lock_irq(&zone->lru_lock);
	}

	pgmoved = 0;
	while (!list_empty(&l_active)) {
		page = lru_to_page(&l_active);
		prefetchw_prev_lru_page(page, &l_active, flags);
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(!PageActive(page));
		list_move(&page->lru, &zone->active_list);
		pgmoved++;
		if (!pagevec_add(&pvec, page)) {
			zone->nr_active += pgmoved;
			pgmoved = 0;
			spin_unlock_irq(&zone->lru_lock);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	zone->nr_active += pgmoved;

	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	__count_vm_events(PGDEACTIVATE, pgdeactivate);
	spin_unlock_irq(&zone->lru_lock);

	pagevec_release(&pvec);
}

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static unsigned long shrink_zone(int priority, struct zone *zone,
				struct scan_control *sc)
{
	unsigned long nr_active;
	unsigned long nr_inactive;
	unsigned long nr_to_scan;
	unsigned long nr_reclaimed = 0;

	atomic_inc(&zone->reclaim_in_progress);

	/*
	 * Add one to the scan target just to make sure that the kernel will
	 * slowly sift through the active list.
	 */
	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
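	/*
	 * Illustration: at DEF_PRIORITY (12) a zone with 8192 active
	 * pages adds (8192 >> 12) + 1 == 3 to nr_scan_active per call;
	 * the list is only scanned once the accumulated count reaches
	 * sc->swap_cluster_max, which batches the work.
	 */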
	nr_active = zone->nr_scan_active;
	if (nr_active >= sc->swap_cluster_max)
		zone->nr_scan_active = 0;
	else
		nr_active = 0;

	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
	nr_inactive = zone->nr_scan_inactive;
	if (nr_inactive >= sc->swap_cluster_max)
		zone->nr_scan_inactive = 0;
	else
		nr_inactive = 0;

	while (nr_active || nr_inactive) {
		if (nr_active) {
			nr_to_scan = min(nr_active,
					(unsigned long)sc->swap_cluster_max);
			nr_active -= nr_to_scan;
			shrink_active_list(nr_to_scan, zone, sc);
		}

		if (nr_inactive) {
			nr_to_scan = min(nr_inactive,
					(unsigned long)sc->swap_cluster_max);
			nr_inactive -= nr_to_scan;
			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
								sc);
		}
	}

	throttle_vm_writeout();

	atomic_dec(&zone->reclaim_in_progress);
	return nr_reclaimed;
}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over pages_high.  Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The zones may be over pages_high but they must go *over* pages_high to
 *    satisfy the `incremental min' zone defense algorithm.
 *
 * Returns the number of reclaimed pages.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
static unsigned long shrink_zones(int priority, struct zone **zones,
					struct scan_control *sc)
{
	unsigned long nr_reclaimed = 0;
	int i;

	sc->all_unreclaimable = 1;
	for (i = 0; zones[i] != NULL; i++) {
		struct zone *zone = zones[i];

		if (!populated_zone(zone))
			continue;

		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
			continue;

		zone->temp_priority = priority;
		if (zone->prev_priority > priority)
			zone->prev_priority = priority;

		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
			continue;	/* Let kswapd poll it */

		sc->all_unreclaimable = 0;

		nr_reclaimed += shrink_zone(priority, zone, sc);
	}
	return nr_reclaimed;
}
 
/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick pdflush and take explicit naps in the
 * hope that some of these pages can be written.  But if the allocating task
 * holds filesystem locks which prevent writeout this might not work, and the
 * allocation attempt will fail.
 */
unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
	int priority;
	int ret = 0;
	unsigned long total_scanned = 0;
	unsigned long nr_reclaimed = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long lru_pages = 0;
	int i;
	struct scan_control sc = {
		.gfp_mask = gfp_mask,
		.may_writepage = !laptop_mode,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.may_swap = 1,
		.swappiness = vm_swappiness,
	};

	count_vm_event(ALLOCSTALL);

	for (i = 0; zones[i] != NULL; i++) {
		struct zone *zone = zones[i];

		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
			continue;

		zone->temp_priority = DEF_PRIORITY;
		lru_pages += zone->nr_active + zone->nr_inactive;
	}

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		sc.nr_scanned = 0;
		if (!priority)
			disable_swap_token();
		nr_reclaimed += shrink_zones(priority, zones, &sc);
		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
		if (reclaim_state) {
			nr_reclaimed += reclaim_state->reclaimed_slab;
			reclaim_state->reclaimed_slab = 0;
		}
		total_scanned += sc.nr_scanned;
		if (nr_reclaimed >= sc.swap_cluster_max) {
			ret = 1;
			goto out;
		}

		/*
		 * Try to write back as many pages as we just scanned.  This
		 * tends to cause slow streaming writers to write data to the
		 * disk smoothly, at the dirtying rate, which is nice.   But
		 * that's undesirable in laptop mode, where we *want* lumpy
		 * writeout.  So in laptop mode, write out the whole world.
		 */
		if (total_scanned > sc.swap_cluster_max +
					sc.swap_cluster_max / 2) {
			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
			sc.may_writepage = 1;
		}

		/* Take a nap, wait for some writeback to complete */
		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
			blk_congestion_wait(WRITE, HZ/10);
	}
	/* top priority shrink_zones still had more to do? don't OOM, then */
	if (!sc.all_unreclaimable)
		ret = 1;
out:
	for (i = 0; zones[i] != NULL; i++) {
		struct zone *zone = zones[i];

		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
			continue;

		zone->prev_priority = zone->temp_priority;
	}
	return ret;
}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at pages_high.
 *
 * Returns the number of pages which were actually freed.
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > pages_high, but once a zone is found to have
 * free_pages <= pages_high, we scan that zone and the lower zones regardless
 * of the number of free pages in the lower zones.  This interoperates with
 * the page allocator fallback scheme to ensure that aging of pages is balanced
 * across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
{
	int all_zones_ok;
	int priority;
	int i;
	unsigned long total_scanned;
	unsigned long nr_reclaimed;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_swap = 1,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.swappiness = vm_swappiness,
	};

loop_again:
	total_scanned = 0;
	nr_reclaimed = 0;
	sc.may_writepage = !laptop_mode;
	count_vm_event(PAGEOUTRUN);

	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		zone->temp_priority = DEF_PRIORITY;
	}

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
		unsigned long lru_pages = 0;

		/* The swap token gets in the way of swapout... */
		if (!priority)
			disable_swap_token();

		all_zones_ok = 1;

		/*
		 * Scan in the highmem->dma direction for the highest
		 * zone which needs scanning
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			if (!zone_watermark_ok(zone, order, zone->pages_high,
					       0, 0)) {
				end_zone = i;
				goto scan;
			}
		}
		goto out;
scan:
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			lru_pages += zone->nr_active + zone->nr_inactive;
		}

		/*
		 * Now scan the zone in the dma->highmem direction, stopping
		 * at the last zone which needs scanning.
		 *
		 * We do this because the page allocator works in the opposite
		 * direction.  This prevents the page allocator from allocating
		 * pages behind kswapd's direction of progress, which would
		 * cause too much scanning of the lower zones.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;
			int nr_slab;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			if (!zone_watermark_ok(zone, order, zone->pages_high,
					       end_zone, 0))
				all_zones_ok = 0;
			zone->temp_priority = priority;
			if (zone->prev_priority > priority)
				zone->prev_priority = priority;
			sc.nr_scanned = 0;
			nr_reclaimed += shrink_zone(priority, zone, &sc);
			reclaim_state->reclaimed_slab = 0;
			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
						lru_pages);
			nr_reclaimed += reclaim_state->reclaimed_slab;
			total_scanned += sc.nr_scanned;
			if (zone->all_unreclaimable)
				continue;
			if (nr_slab == 0 && zone->pages_scanned >=
				    (zone->nr_active + zone->nr_inactive) * 6)
				zone->all_unreclaimable = 1;
			/*
			 * If we've done a decent amount of scanning and
			 * the reclaim ratio is low, start doing writepage
			 * even in laptop mode
			 */
			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
				sc.may_writepage = 1;
		}
		if (all_zones_ok)
			break;		/* kswapd: all done */
		/*
		 * OK, kswapd is getting into trouble.  Take a nap, then take
		 * another pass across the zones.
		 */
		if (total_scanned && priority < DEF_PRIORITY - 2)
			blk_congestion_wait(WRITE, HZ/10);

		/*
		 * We do this so kswapd doesn't build up large priorities for
		 * example when it is freeing in parallel with allocators. It
		 * matches the direct reclaim path behaviour in terms of impact
		 * on zone->*_priority.
		 */
		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	}
out:
	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		zone->prev_priority = zone->temp_priority;
	}
	if (!all_zones_ok) {
		cond_resched();
		goto loop_again;
	}

	return nr_reclaimed;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process. 
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
	unsigned long order;
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;
	DEFINE_WAIT(wait);
	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	cpumask_t cpumask;

	cpumask = node_to_cpumask(pgdat->node_id);
	if (!cpus_empty(cpumask))
		set_cpus_allowed(tsk, cpumask);
	current->reclaim_state = &reclaim_state;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;

	order = 0;
	for ( ; ; ) {
		unsigned long new_order;

		try_to_freeze();

		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
		new_order = pgdat->kswapd_max_order;
		pgdat->kswapd_max_order = 0;
		if (order < new_order) {
			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation
			 */
			order = new_order;
		} else {
			schedule();
			order = pgdat->kswapd_max_order;
		}
		finish_wait(&pgdat->kswapd_wait, &wait);

		balance_pgdat(pgdat, order);
	}
	return 0;
}

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order)
{
	pg_data_t *pgdat;

	if (!populated_zone(zone))
		return;

	pgdat = zone->zone_pgdat;
	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
		return;
	if (pgdat->kswapd_max_order < order)
		pgdat->kswapd_max_order = order;
	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
		return;
	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;
	wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_PM
/*
 * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
 * from LRU lists system-wide, for given pass and priority, and returns the
 * number of reclaimed pages
 *
 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
 */
static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
				      int pass, struct scan_control *sc)
{
	struct zone *zone;
	unsigned long nr_to_scan, ret = 0;

	for_each_zone(zone) {

		if (!populated_zone(zone))
			continue;

		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
			continue;

		/* For pass = 0 we don't shrink the active list */
		if (pass > 0) {
			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
			if (zone->nr_scan_active >= nr_pages || pass > 3) {
				zone->nr_scan_active = 0;
				nr_to_scan = min(nr_pages, zone->nr_active);
				shrink_active_list(nr_to_scan, zone, sc);
			}
		}

		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
			zone->nr_scan_inactive = 0;
			nr_to_scan = min(nr_pages, zone->nr_inactive);
			ret += shrink_inactive_list(nr_to_scan, zone, sc);
			if (ret >= nr_pages)
				return ret;
		}
	}

	return ret;
}

/*
 * Try to free `nr_pages' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_pages)
{
	unsigned long lru_pages, nr_slab;
	unsigned long ret = 0;
	int pass;
	struct reclaim_state reclaim_state;
	struct zone *zone;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_swap = 0,
		.swap_cluster_max = nr_pages,
		.may_writepage = 1,
		.swappiness = vm_swappiness,
	};

	current->reclaim_state = &reclaim_state;

	lru_pages = 0;
	for_each_zone(zone)
		lru_pages += zone->nr_active + zone->nr_inactive;

	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
	/* If slab caches are huge, it's better to hit them first */
	while (nr_slab >= lru_pages) {
		reclaim_state.reclaimed_slab = 0;
		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
		if (!reclaim_state.reclaimed_slab)
			break;

		ret += reclaim_state.reclaimed_slab;
		if (ret >= nr_pages)
			goto out;

		nr_slab -= reclaim_state.reclaimed_slab;
	}

	/*
	 * We try to shrink LRUs in 5 passes:
	 * 0 = Reclaim from inactive_list only
	 * 1 = Reclaim from active list but don't reclaim mapped
	 * 2 = 2nd pass of type 1
	 * 3 = Reclaim mapped (normal reclaim)
	 * 4 = 2nd pass of type 3
	 */
	for (pass = 0; pass < 5; pass++) {
		int prio;

		/* Needed for shrinking slab caches later on */
		if (!lru_pages)
			for_each_zone(zone) {
				lru_pages += zone->nr_active;
				lru_pages += zone->nr_inactive;
			}

		/* Force reclaiming mapped pages in the passes #3 and #4 */
		if (pass > 2) {
			sc.may_swap = 1;
			sc.swappiness = 100;
		}

		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
			unsigned long nr_to_scan = nr_pages - ret;

			sc.nr_scanned = 0;
			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
			if (ret >= nr_pages)
				goto out;

			reclaim_state.reclaimed_slab = 0;
			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
			ret += reclaim_state.reclaimed_slab;
			if (ret >= nr_pages)
				goto out;

			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
				blk_congestion_wait(WRITE, HZ / 10);
		}

		lru_pages = 0;
	}

	/*
	 * If ret = 0, we could not shrink LRUs, but there may be something
	 * in slab caches
	 */
	if (!ret)
		do {
			reclaim_state.reclaimed_slab = 0;
			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
			ret += reclaim_state.reclaimed_slab;
		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);

out:
	current->reclaim_state = NULL;

	return ret;
}
#endif

#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
				  unsigned long action, void *hcpu)
{
	pg_data_t *pgdat;
	cpumask_t mask;

	if (action == CPU_ONLINE) {
		for_each_online_pgdat(pgdat) {
			mask = node_to_cpumask(pgdat->node_id);
			if (any_online_cpu(mask) != NR_CPUS)
				/* One of our CPUs online: restore mask */
				set_cpus_allowed(pgdat->kswapd, mask);
		}
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		printk("Failed to start kswapd on node %d\n", nid);
		ret = -1;
	}
	return ret;
}

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_online_node(nid)
		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */

/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
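 * The corresponding per-zone page threshold is precomputed into
 * zone->min_unmapped_pages, which __zone_reclaim() below tests.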
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

/*
 * Try to free up some pages from this zone through reclaim.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	int priority;
	unsigned long nr_reclaimed = 0;
	struct scan_control sc = {
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
		.swap_cluster_max = max_t(unsigned long, nr_pages,
					SWAP_CLUSTER_MAX),
		.gfp_mask = gfp_mask,
		.swappiness = vm_swappiness,
	};
	unsigned long slab_reclaimable;

	disable_swap_token();
	cond_resched();
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_SWAP.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	if (zone_page_state(zone, NR_FILE_PAGES) -
		zone_page_state(zone, NR_FILE_MAPPED) >
		zone->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink zone with increasing
		 * priorities until we have enough memory freed.
		 */
		priority = ZONE_RECLAIM_PRIORITY;
		do {
			nr_reclaimed += shrink_zone(priority, zone, &sc);
			priority--;
		} while (priority >= 0 && nr_reclaimed < nr_pages);
	}

	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
	if (slab_reclaimable > zone->min_slab_pages) {
		/*
		 * shrink_slab() does not currently allow us to determine how