// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	Andrew Morton
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
#define MAX_PAUSE		max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising the pause time to max_pause when the interval falls below it.
 */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))

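/*
 * Editor's illustration (not part of the original source): the expression
 * above keeps the threshold at 128KB worth of pages regardless of page
 * size.  With 4KB pages (PAGE_SHIFT == 12) it evaluates to 128 >> 2 == 32
 * pages.
 */
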
/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

#define RATELIMIT_CALC_SHIFT	10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

struct wb_domain global_wb_domain;

/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
	struct wb_domain	*dom;
	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
#endif
	struct bdi_writeback	*wb;
	struct fprop_local_percpu *wb_completions;

	unsigned long		avail;		/* dirtyable */
	unsigned long		dirty;		/* file_dirty + write + nfs */
	unsigned long		thresh;		/* dirty threshold */
	unsigned long		bg_thresh;	/* dirty background threshold */

	unsigned long		wb_dirty;	/* per-wb counterparts */
	unsigned long		wb_thresh;
	unsigned long		wb_bg_thresh;

	unsigned long		pos_ratio;
};

/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the more slowly the
 * fractions will reflect changes in the current writeout rate.
 */
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)

#ifdef CONFIG_CGROUP_WRITEBACK

#define GDTC_INIT(__wb)		.wb = (__wb),				\
				.dom = &global_wb_domain,		\
				.wb_completions = &(__wb)->completions

#define GDTC_INIT_NO_WB		.dom = &global_wb_domain

#define MDTC_INIT(__wb, __gdtc)	.wb = (__wb),				\
				.dom = mem_cgroup_wb_domain(__wb),	\
				.wb_completions = &(__wb)->memcg_completions, \
				.gdtc = __gdtc

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return dtc->dom;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return dtc->dom;
}

static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return mdtc->gdtc;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return &wb->memcg_completions;
}

static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
	unsigned long long min = wb->bdi->min_ratio;
	unsigned long long max = wb->bdi->max_ratio;

	/*
	 * @wb may already be clean by the time control reaches here and
	 * the total may not include its bw.
	 */
	if (this_bw < tot_bw) {
		if (min) {
			min *= this_bw;
			min = div64_ul(min, tot_bw);
		}
		if (max < 100) {
			max *= this_bw;
			max = div64_ul(max, tot_bw);
		}
	}

	*minp = min;
	*maxp = max;
}

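/*
 * Editor's illustration (not part of the original source): with
 * bdi->min_ratio == 10, bdi->max_ratio == 50, this_bw == 50 and
 * tot_bw == 200, the wb is granted min == 10 * 50 / 200 == 2 and
 * max == 50 * 50 / 200 == 12, i.e. its bandwidth-proportional share
 * of the device-wide ratios.
 */
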
#else	/* CONFIG_CGROUP_WRITEBACK */

#define GDTC_INIT(__wb)		.wb = (__wb),                           \
				.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return false;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return &global_wb_domain;
}

static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return NULL;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return NULL;
}

static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	*minp = wb->bdi->min_ratio;
	*maxp = wb->bdi->max_ratio;
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

/*
 * In a memory zone, there is a certain number of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value to which the
 * user-configurable dirty ratio is applied to obtain the effective
 * number of pages that are allowed to be actually dirtied, either per
 * individual zone or globally by using the sum of dirtyable pages over
 * all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */

/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Return: the node's number of pages potentially available for dirty
 * page cache.  This is the base value for the per-node dirty limits.
 */
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
	unsigned long nr_pages = 0;
	int z;

	for (z = 0; z < MAX_NR_ZONES; z++) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		nr_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	nr_pages -= min(nr_pages, pgdat->totalreserve_pages);

	nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
	nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);

	return nr_pages;
}

static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
	int node;
	unsigned long x = 0;
	int i;

	for_each_node_state(node, N_HIGH_MEMORY) {
		for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
			struct zone *z;
			unsigned long nr_pages;

			if (!is_highmem_idx(i))
				continue;

			z = &NODE_DATA(node)->node_zones[i];
			if (!populated_zone(z))
				continue;

			nr_pages = zone_page_state(z, NR_FREE_PAGES);
			/* watch for underflows */
			nr_pages -= min(nr_pages, high_wmark_pages(z));
			nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
			nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
			x += nr_pages;
		}
	}

	/*
	 * Unreclaimable memory (kernel memory or anonymous memory
	 * without swap) can bring down the dirtyable pages below
	 * the zone's dirty balance reserve and the above calculation
	 * will underflow.  However we still want to add in nodes
	 * which are below threshold (negative values) to get a more
	 * accurate calculation but make sure that the total never
	 * underflows.
	 */
	if ((long)x < 0)
		x = 0;

	/*
	 * Make sure that the number of highmem pages is never larger
	 * than the total amount of dirtyable memory. This can only
	 * happen in very strange VM situations, but we want to make
	 * sure that it does not occur.
	 */
	return min(x, total);
#else
	return 0;
#endif
}

/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Return: the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 */
static unsigned long global_dirtyable_memory(void)
{
	unsigned long x;

	x = global_zone_page_state(NR_FREE_PAGES);
	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	x -= min(x, totalreserve_pages);

	x += global_node_page_state(NR_INACTIVE_FILE);
	x += global_node_page_state(NR_ACTIVE_FILE);

	if (!vm_highmem_is_dirtyable)
		x -= highmem_dirtyable_memory(x);

	return x + 1;	/* Ensure that we never return 0 */
}

/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
 * must ensure that @dtc->avail is set before calling this function.  The
 * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (i.e. nfsd) and
 * real-time tasks.
 */
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
	const unsigned long available_memory = dtc->avail;
	struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
	unsigned long bytes = vm_dirty_bytes;
	unsigned long bg_bytes = dirty_background_bytes;
	/* convert ratios to per-PAGE_SIZE for higher precision */
	unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
	unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
	unsigned long thresh;
	unsigned long bg_thresh;
	struct task_struct *tsk;

	/* gdtc is !NULL iff @dtc is for memcg domain */
	if (gdtc) {
		unsigned long global_avail = gdtc->avail;

		/*
		 * The byte settings can't be applied directly to memcg
		 * domains.  Convert them to ratios by scaling against
		 * globally available memory.  As the ratios are in
		 * per-PAGE_SIZE, they can be obtained by dividing bytes by
		 * number of pages.
		 */
		if (bytes)
			ratio = min(DIV_ROUND_UP(bytes, global_avail),
				    PAGE_SIZE);
		if (bg_bytes)
			bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
				       PAGE_SIZE);
		bytes = bg_bytes = 0;
	}

	if (bytes)
		thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
	else
		thresh = (ratio * available_memory) / PAGE_SIZE;

	if (bg_bytes)
		bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
	else
		bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

	if (bg_thresh >= thresh)
		bg_thresh = thresh / 2;

	tsk = current;
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
		bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
		thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
	}
	dtc->thresh = thresh;
	dtc->bg_thresh = bg_thresh;

	/* we should eventually report the domain in the TP */
	if (!gdtc)
		trace_global_dirty_state(bg_thresh, thresh);
}

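/*
 * Editor's illustration (not part of the original source): assuming
 * PAGE_SIZE == 4096, vm_dirty_ratio == 20, vm_dirty_bytes == 0 and
 * dtc->avail == 1,000,000 pages, ratio == 20 * 4096 / 100 == 819 and
 * thresh == 819 * 1,000,000 / 4096 == 199,951 pages, i.e. just under
 * 20% of dirtyable memory because the fixed-point ratio rounds down.
 */
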
/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };

	gdtc.avail = global_dirtyable_memory();
	domain_dirty_limits(&gdtc);

	*pbackground = gdtc.bg_thresh;
	*pdirty = gdtc.thresh;
}

/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
	unsigned long node_memory = node_dirtyable_memory(pgdat);
	struct task_struct *tsk = current;
	unsigned long dirty;

	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
			node_memory / global_dirtyable_memory();
	else
		dirty = vm_dirty_ratio * node_memory / 100;

	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
		dirty += dirty / 4;

	return dirty;
}

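/*
 * Editor's illustration (not part of the original source): with
 * vm_dirty_bytes == 0, vm_dirty_ratio == 20 and node_memory == 500,000
 * pages, the limit is 20 * 500,000 / 100 == 100,000 pages; a real-time
 * task would be allowed 100,000 + 100,000 / 4 == 125,000 pages.
 */
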
/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
bool node_dirty_ok(struct pglist_data *pgdat)
{
	unsigned long limit = node_dirty_limit(pgdat);
	unsigned long nr_pages = 0;

	nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
	nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
	nr_pages += node_page_state(pgdat, NR_WRITEBACK);

	return nr_pages <= limit;
}

int dirty_background_ratio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_bytes = 0;
	return ret;
}

int dirty_background_bytes_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_ratio = 0;
	return ret;
}

int dirty_ratio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int old_ratio = vm_dirty_ratio;
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
		writeback_set_ratelimit();
		vm_dirty_bytes = 0;
	}
	return ret;
}

int dirty_bytes_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	unsigned long old_bytes = vm_dirty_bytes;
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
		writeback_set_ratelimit();
		vm_dirty_ratio = 0;
	}
	return ret;
}

static unsigned long wp_next_time(unsigned long cur_time)
{
	cur_time += VM_COMPLETIONS_PERIOD_LEN;
	/* 0 has a special meaning... */
	if (!cur_time)
		return 1;
	return cur_time;
}

static void wb_domain_writeout_inc(struct wb_domain *dom,
				   struct fprop_local_percpu *completions,
				   unsigned int max_prop_frac)
{
	__fprop_inc_percpu_max(&dom->completions, completions,
			       max_prop_frac);
	/* First event after period switching was turned off? */
	if (unlikely(!dom->period_time)) {
		/*
		 * We can race with other __bdi_writeout_inc calls here but
		 * it does not cause any harm since the resulting time when
		 * the timer will fire and what is in writeout_period_time
		 * will be roughly the same.
		 */
		dom->period_time = wp_next_time(jiffies);
		mod_timer(&dom->period_timer, dom->period_time);
	}
}

/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count. Called from test_clear_page_writeback().
 */
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
	struct wb_domain *cgdom;

	inc_wb_stat(wb, WB_WRITTEN);
	wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
			       wb->bdi->max_prop_frac);

	cgdom = mem_cgroup_wb_domain(wb);
	if (cgdom)
		wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
				       wb->bdi->max_prop_frac);
}

void wb_writeout_inc(struct bdi_writeback *wb)
{
	unsigned long flags;

	local_irq_save(flags);
	__wb_writeout_inc(wb);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);

/*
 * On idle system, we can be called long after we scheduled because we use
 * deferred timers so count with missed periods.
 */
static void writeout_period(struct timer_list *t)
{
	struct wb_domain *dom = from_timer(dom, t, period_timer);
	int miss_periods = (jiffies - dom->period_time) /
						 VM_COMPLETIONS_PERIOD_LEN;

	if (fprop_new_period(&dom->completions, miss_periods + 1)) {
		dom->period_time = wp_next_time(dom->period_time +
				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
		mod_timer(&dom->period_timer, dom->period_time);
	} else {
		/*
		 * Aging has zeroed all fractions. Stop wasting CPU on period
		 * updates.
		 */
		dom->period_time = 0;
	}
}

int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
	memset(dom, 0, sizeof(*dom));

	spin_lock_init(&dom->lock);

	timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);

	dom->dirty_limit_tstamp = jiffies;

	return fprop_global_init(&dom->completions, gfp);
}

#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
	del_timer_sync(&dom->period_timer);
	fprop_global_destroy(&dom->completions);
}
#endif

/*
 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 * registered backing devices, which, for obvious reasons, cannot
 * exceed 100%.
 */
static unsigned int bdi_min_ratio;

int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
	int ret = 0;

	spin_lock_bh(&bdi_lock);
	if (min_ratio > bdi->max_ratio) {
		ret = -EINVAL;
	} else {
		min_ratio -= bdi->min_ratio;
		if (bdi_min_ratio + min_ratio < 100) {
			bdi_min_ratio += min_ratio;
			bdi->min_ratio += min_ratio;
		} else {
			ret = -EINVAL;
		}
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}

int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
	int ret = 0;

	if (max_ratio > 100)
		return -EINVAL;

	spin_lock_bh(&bdi_lock);
	if (bdi->min_ratio > max_ratio) {
		ret = -EINVAL;
	} else {
		bdi->max_ratio = max_ratio;
		bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);

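/*
 * Editor's illustration (not part of the original source): a hypothetical
 * driver that wants to guarantee its device at least 5% of the global
 * dirty threshold while capping it at 40% might tune its bdi as follows.
 */
static int __maybe_unused example_tune_bdi(struct backing_dev_info *bdi)
{
	int err;

	/* set the cap first so that min <= max holds when min is raised */
	err = bdi_set_max_ratio(bdi, 40);
	if (err)
		return err;
	return bdi_set_min_ratio(bdi, 5);
}
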
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	return (thresh + bg_thresh) / 2;
}

static unsigned long hard_dirty_limit(struct wb_domain *dom,
				      unsigned long thresh)
{
	return max(thresh, dom->dirty_limit);
}

/*
 * Memory which can be further allocated to a memcg domain is capped by
 * system-wide clean memory excluding the amount being used in the domain.
 */
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
			    unsigned long filepages, unsigned long headroom)
{
	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
	unsigned long clean = filepages - min(filepages, mdtc->dirty);
	unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
	unsigned long other_clean = global_clean - min(global_clean, clean);

	mdtc->avail = filepages + min(headroom, other_clean);
}

/**
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_control of interest
 *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control. For example, when the device is completely stalled due to some error
 * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
 * In other, normal situations it acts more gently by throttling the tasks more
 * (rather than completely blocking them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take a long time to sync) on slow devices
 *
 * The wb's share of the dirty limit adapts to its throughput and is
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 *
 * Return: @wb's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
	struct wb_domain *dom = dtc_dom(dtc);
	unsigned long thresh = dtc->thresh;
	u64 wb_thresh;
	unsigned long numerator, denominator;
	unsigned long wb_min_ratio, wb_max_ratio;

	/*
	 * Calculate this BDI's share of the thresh ratio.
	 */
	fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
			      &numerator, &denominator);

	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
	wb_thresh *= numerator;
	wb_thresh = div64_ul(wb_thresh, denominator);

	wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);

	wb_thresh += (thresh * wb_min_ratio) / 100;
	if (wb_thresh > (thresh * wb_max_ratio) / 100)
		wb_thresh = thresh * wb_max_ratio / 100;

	return wb_thresh;
}

unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
					       .thresh = thresh };
	return __wb_calc_thresh(&gdtc);
}

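/*
 * Editor's illustration (not part of the original source): with a global
 * thresh of 200,000 pages, bdi_min_ratio == 0 and a wb that completed 25%
 * of the recent writeout (numerator/denominator == 1/4), wb_thresh starts
 * at 200,000 * 1/4 == 50,000 pages; a bdi->max_ratio of 20 would then cap
 * it at 200,000 * 20 / 100 == 40,000 pages.
 */
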
/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0	 => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 */
static long long pos_ratio_polynom(unsigned long setpoint,
					  unsigned long dirty,
					  unsigned long limit)
{
	long long pos_ratio;
	long x;

	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
		      (limit - setpoint) | 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;

	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}

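/*
 * Editor's illustration (not part of the original source), with
 * RATELIMIT_CALC_SHIFT == 10, setpoint == 600 and limit == 1000:
 *   dirty == 600  -> x == 0     -> pos_ratio == 1024 (1.0, balance point)
 *   dirty == 200  -> x == 1024  -> pos_ratio == 2048 (2.0, clamped ramp-up)
 *   dirty == 1000 -> x == -1024 -> pos_ratio == 0    (hard limit)
 */
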
/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages to be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0 .............*
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                         .
 *     |                      .                           .
 *     |                      .                             .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
 * - the wb dirty thresh drops quickly due to change of JBOD workload
 */
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long write_bw = wb->avg_write_bandwidth;
	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	unsigned long wb_thresh = dtc->wb_thresh;
	unsigned long x_intercept;
	unsigned long setpoint;		/* dirty pages' target balance point */
	unsigned long wb_setpoint;
	unsigned long span;
	long long pos_ratio;		/* for scaling up/down the rate limit */
	long x;

	dtc->pos_ratio = 0;

	if (unlikely(dtc->dirty >= limit))
		return;

	/*
	 * global setpoint
	 *
	 * See comment for pos_ratio_polynom().
	 */
	setpoint = (freerun + limit) / 2;
	pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);

	/*
	 * The strictlimit feature is a tool preventing mistrusted filesystems
	 * from growing a large number of dirty pages before throttling. For
	 * such filesystems balance_dirty_pages always checks wb counters
	 * against wb limits, even if the global "nr_dirty" is under "freerun".
	 *
	 * This is especially important for fuse which sets bdi->max_ratio to
	 * 1% by default. Without the strictlimit feature, fuse writeback may
	 * consume an arbitrary amount of RAM because it is accounted in
	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
	 *
	 * Here, in wb_position_ratio(), we calculate pos_ratio based on
	 * two values: wb_dirty and wb_thresh. Let's consider an example:
	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
	 * limits are set by default to 10% and 20% (background and throttle).
	 * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
	 * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
	 * about ~6K pages (as the average of background and throttle wb
	 * limits). The 3rd order polynomial will provide positive feedback if
	 * wb_dirty is under wb_setpoint and vice versa.
	 *
	 * Note that we cannot use global counters in these calculations
	 * because we want to throttle a process writing to a strictlimit wb
	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
	 * in the example above).
	 */
	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		long long wb_pos_ratio;

		if (dtc->wb_dirty < 8) {
			dtc->pos_ratio = min_t(long long, pos_ratio * 2,
					   2 << RATELIMIT_CALC_SHIFT);
			return;
		}

		if (dtc->wb_dirty >= wb_thresh)
			return;

		wb_setpoint = dirty_freerun_ceiling(wb_thresh,
						    dtc->wb_bg_thresh);

		if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
			return;

		wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
						 wb_thresh);

		/*
		 * Typically, for the strictlimit case, wb_setpoint << setpoint
		 * and pos_ratio >> wb_pos_ratio. In other words, the global
		 * state ("dirty") is not the limiting factor and we have to
		 * make the decision based on wb counters. But there is an
		 * important case when global pos_ratio should get precedence:
		 * global limits are exceeded (e.g. due to activities on other
		 * wb's) while the given strictlimit wb is below its limit.
		 *
		 * "pos_ratio * wb_pos_ratio" would work for the case above,
		 * but it would look too unnatural for the case of all
		 * activity in the system coming from a single strictlimit wb
		 * with bdi->max_ratio == 100%.
		 *
		 * Note that min() below somewhat changes the dynamics of the
		 * control system. Normally, pos_ratio value can be well over 3
		 * (when globally we are at freerun and wb is well below wb
		 * setpoint). Now the maximum pos_ratio in the same situation
		 * is 2. We might want to tweak this if we observe the control
		 * system is too slow to adapt.
		 */
		dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
		return;
	}