memcontrol.c 186 KB
Newer Older
1
2
3
4
5
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
6
7
8
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
9
10
11
12
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
13
14
15
16
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
17
18
19
20
21
22
23
24
25
26
27
28
29
30
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
31
#include <linux/mm.h>
32
#include <linux/hugetlb.h>
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
33
#include <linux/pagemap.h>
34
#include <linux/smp.h>
35
#include <linux/page-flags.h>
36
#include <linux/backing-dev.h>
37
38
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
39
#include <linux/limits.h>
40
#include <linux/export.h>
41
#include <linux/mutex.h>
42
#include <linux/rbtree.h>
43
#include <linux/slab.h>
44
#include <linux/swap.h>
45
#include <linux/swapops.h>
46
#include <linux/spinlock.h>
47
#include <linux/eventfd.h>
48
#include <linux/poll.h>
49
#include <linux/sort.h>
50
#include <linux/fs.h>
51
#include <linux/seq_file.h>
52
#include <linux/vmpressure.h>
53
#include <linux/mm_inline.h>
54
#include <linux/page_cgroup.h>
55
#include <linux/cpu.h>
56
#include <linux/oom.h>
57
#include <linux/lockdep.h>
58
#include <linux/file.h>
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
59
#include "internal.h"
Glauber Costa's avatar
Glauber Costa committed
60
#include <net/sock.h>
Michal Hocko's avatar
Michal Hocko committed
61
#include <net/ip.h>
Glauber Costa's avatar
Glauber Costa committed
62
#include <net/tcp_memcontrol.h>
63
#include "slab.h"
64

65
66
#include <asm/uaccess.h>

67
68
#include <trace/events/vmscan.h>

69
70
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
71

72
#define MEM_CGROUP_RECLAIM_RETRIES	5
73
static struct mem_cgroup *root_mem_cgroup __read_mostly;
74

Andrew Morton's avatar
Andrew Morton committed
75
#ifdef CONFIG_MEMCG_SWAP
Li Zefan's avatar
Li Zefan committed
76
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77
int do_swap_account __read_mostly;
78
79

/* for remember boot option*/
Andrew Morton's avatar
Andrew Morton committed
80
#ifdef CONFIG_MEMCG_SWAP_ENABLED
81
82
static int really_do_swap_account __initdata = 1;
#else
83
static int really_do_swap_account __initdata;
84
85
#endif

86
#else
87
#define do_swap_account		0
88
89
90
#endif


91
92
93
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
94
	"rss_huge",
95
	"mapped_file",
96
	"writeback",
97
98
99
	"swap",
};

100
101
102
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
103
104
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
105
106
	MEM_CGROUP_EVENTS_NSTATS,
};
107
108
109
110
111
112
113
114

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

115
116
117
118
119
120
121
122
static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

123
124
125
126
127
128
129
130
/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremated by the number of pages. This counter is used for
 * for trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
131
	MEM_CGROUP_TARGET_SOFTLIMIT,
132
	MEM_CGROUP_TARGET_NUMAINFO,
133
134
	MEM_CGROUP_NTARGETS,
};
135
136
137
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024
138

139
struct mem_cgroup_stat_cpu {
140
	long count[MEM_CGROUP_STAT_NSTATS];
141
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
142
	unsigned long nr_page_events;
143
	unsigned long targets[MEM_CGROUP_NTARGETS];
144
145
};

146
struct mem_cgroup_reclaim_iter {
Michal Hocko's avatar
Michal Hocko committed
147
148
149
150
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
151
	struct mem_cgroup *last_visited;
152
	int last_dead_count;
Michal Hocko's avatar
Michal Hocko committed
153

154
155
156
157
	/* scan generation, increased every round-trip */
	unsigned int generation;
};

158
159
160
161
/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
162
	struct lruvec		lruvec;
163
	unsigned long		lru_size[NR_LRU_LISTS];
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
164

165
166
	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

167
168
169
170
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
171
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
172
						/* use container_of	   */
173
174
175
176
177
178
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

199
200
201
202
203
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
204
/* For threshold */
205
struct mem_cgroup_threshold_ary {
206
	/* An array index points to threshold just below or equal to usage. */
207
	int current_threshold;
208
209
210
211
212
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};
213
214
215
216
217
218
219
220
221
222
223
224

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
225
226
227
228
229
/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};
230

231
232
233
/*
 * cgroup_event represents events which userspace want to receive.
 */
234
struct mem_cgroup_event {
235
	/*
236
	 * memcg which the event belongs to.
237
	 */
238
	struct mem_cgroup *memcg;
239
240
241
242
243
244
245
246
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
247
248
249
250
251
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
252
	int (*register_event)(struct mem_cgroup *memcg,
Tejun Heo's avatar
Tejun Heo committed
253
			      struct eventfd_ctx *eventfd, const char *args);
254
255
256
257
258
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removing.  This callback must be set,
	 * if you want provide notification functionality.
	 */
259
	void (*unregister_event)(struct mem_cgroup *memcg,
260
				 struct eventfd_ctx *eventfd);
261
262
263
264
265
266
267
268
269
270
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};

271
272
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
273

274
275
276
277
278
279
280
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
281
282
283
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at it's low water mark, this is
 * a feature that will be implemented much later in the future.
284
285
286
287
288
289
290
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
291

292
293
294
	/* vmpressure notifications */
	struct vmpressure vmpressure;

295
296
297
298
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
299

300
301
302
303
	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
304
305
306
307
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
308
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
309
310
311

	bool		oom_lock;
	atomic_t	under_oom;
312
	atomic_t	oom_wakeups;
313

314
	int	swappiness;
315
316
	/* OOM-Killer disable */
	int		oom_kill_disable;
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
317

318
319
320
	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

321
322
323
324
	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
325
	struct mem_cgroup_thresholds thresholds;
326

327
	/* thresholds for mem+swap usage. RCU-protected */
328
	struct mem_cgroup_thresholds memsw_thresholds;
329

KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
330
331
	/* For oom notifier event fd */
	struct list_head oom_notify;
332

333
334
335
336
	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
Andrew Morton's avatar
Andrew Morton committed
337
	unsigned long move_charge_at_immigrate;
338
339
340
341
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
342
343
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
344
	/*
345
	 * percpu counter.
346
	 */
347
	struct mem_cgroup_stat_cpu __percpu *stat;
348
349
350
351
352
353
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;
Glauber Costa's avatar
Glauber Costa committed
354

Michal Hocko's avatar
Michal Hocko committed
355
	atomic_t	dead_count;
Michal Hocko's avatar
Michal Hocko committed
356
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357
	struct cg_proto tcp_mem;
Glauber Costa's avatar
Glauber Costa committed
358
#endif
359
#if defined(CONFIG_MEMCG_KMEM)
360
361
	/* analogous to slab_common's slab_caches list, but per-memcg;
	 * protected by memcg_slab_mutex */
362
363
364
365
	struct list_head memcg_slab_caches;
        /* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif
366
367
368
369
370
371
372

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif
373

374
375
376
377
	/* List of events which userspace want to receive */
	struct list_head event_list;
	spinlock_t event_list_lock;

378
379
	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
380
381
};

382
383
/* internal only representation about the status of kmem accounting. */
enum {
384
	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
385
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
386
387
388
389
390
391
392
};

#ifdef CONFIG_MEMCG_KMEM
/* Mark @memcg as actively accounting kernel memory (sets KMEM_ACCOUNTED_ACTIVE). */
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
393
394
395
396
397
398
399
400

/* Is kernel-memory accounting enabled for @memcg? (KMEM_ACCOUNTED_ACTIVE set) */
static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
401
402
403
404
405
	/*
	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
	 * will call css_put() if it sees the memcg is dead.
	 */
	smp_wmb();
406
407
408
409
410
411
412
413
414
	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}

/*
 * Atomically consume the "dead" flag: returns true exactly once after
 * memcg_kmem_mark_dead() flagged this memcg.
 */
static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
				  &memcg->kmem_account_flags);
}
415
416
#endif

417
418
/* Stuffs for move charges at task migration. */
/*
419
420
 * Types of charges to be moved. "move_charge_at_immitgrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
421
422
 */
enum move_type {
423
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
424
	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
425
426
427
	NR_MOVE_TYPE,
};

428
429
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
430
	spinlock_t	  lock; /* for from, to */
431
432
	struct mem_cgroup *from;
	struct mem_cgroup *to;
433
	unsigned long immigrate_flags;
434
	unsigned long precharge;
435
	unsigned long moved_charge;
436
	unsigned long moved_swap;
437
438
439
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
440
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
441
442
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
443

444
445
static bool move_anon(void)
{
446
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
447
448
}

449
450
static bool move_file(void)
{
451
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
452
453
}

454
455
456
457
/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
458
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
459
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
460

461
462
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
463
	MEM_CGROUP_CHARGE_TYPE_ANON,
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
464
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
465
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
466
467
468
	NR_CHARGE_TYPE,
};

469
/* for encoding cft->private value on file */
470
471
472
473
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
474
	_KMEM,
475
476
};

477
478
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
479
#define MEMFILE_ATTR(val)	((val) & 0xffff)
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
480
481
/* Used for OOM nofiier */
#define OOM_CONTROL		(0)
482

483
484
485
486
487
488
489
490
/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

491
492
493
494
495
496
497
/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);

498
499
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
500
	return s ? container_of(s, struct mem_cgroup, css) : NULL;
501
502
}

503
504
505
506
507
508
509
510
511
512
513
514
515
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	/* A NULL memcg means "the whole hierarchy": use the root group. */
	return memcg ? &memcg->vmpressure : &root_mem_cgroup->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

516
517
518
519
520
/* True iff @memcg is the statically-known root of the memcg hierarchy. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return memcg == root_mem_cgroup;
}

521
522
523
524
525
526
/*
 * We restrict the id in the range of [1, 65535], so it can fit into
 * an unsigned short.
 */
#define MEM_CGROUP_ID_MAX	USHRT_MAX

Li Zefan's avatar
Li Zefan committed
527
528
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
529
	return memcg->css.id;
Li Zefan's avatar
Li Zefan committed
530
531
532
533
534
535
}

static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
	struct cgroup_subsys_state *css;

536
	css = css_from_id(id, &memory_cgrp_subsys);
Li Zefan's avatar
Li Zefan committed
537
538
539
	return mem_cgroup_from_css(css);
}

Glauber Costa's avatar
Glauber Costa committed
540
/* Writing them here to avoid exposing memcg's inner layout */
Michal Hocko's avatar
Michal Hocko committed
541
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
Glauber Costa's avatar
Glauber Costa committed
542
543
544

void sock_update_memcg(struct sock *sk)
{
545
	if (mem_cgroup_sockets_enabled) {
Glauber Costa's avatar
Glauber Costa committed
546
		struct mem_cgroup *memcg;
547
		struct cg_proto *cg_proto;
Glauber Costa's avatar
Glauber Costa committed
548
549
550

		BUG_ON(!sk->sk_prot->proto_cgroup);

551
552
553
554
555
556
557
558
559
560
		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
		if (sk->sk_cgrp) {
			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
561
			css_get(&sk->sk_cgrp->memcg->css);
562
563
564
			return;
		}

Glauber Costa's avatar
Glauber Costa committed
565
566
		rcu_read_lock();
		memcg = mem_cgroup_from_task(current);
567
		cg_proto = sk->sk_prot->proto_cgroup(memcg);
568
		if (!mem_cgroup_is_root(memcg) &&
569
570
		    memcg_proto_active(cg_proto) &&
		    css_tryget_online(&memcg->css)) {
571
			sk->sk_cgrp = cg_proto;
Glauber Costa's avatar
Glauber Costa committed
572
573
574
575
576
577
578
579
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(sock_update_memcg);

void sock_release_memcg(struct sock *sk)
{
580
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
Glauber Costa's avatar
Glauber Costa committed
581
582
583
		struct mem_cgroup *memcg;
		WARN_ON(!sk->sk_cgrp->memcg);
		memcg = sk->sk_cgrp->memcg;
584
		css_put(&sk->sk_cgrp->memcg->css);
Glauber Costa's avatar
Glauber Costa committed
585
586
	}
}
Glauber Costa's avatar
Glauber Costa committed
587
588
589
590
591
592

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg || mem_cgroup_is_root(memcg))
		return NULL;

593
	return &memcg->tcp_mem;
Glauber Costa's avatar
Glauber Costa committed
594
595
}
EXPORT_SYMBOL(tcp_proto_cgroup);
Glauber Costa's avatar
Glauber Costa committed
596

597
598
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
599
	if (!memcg_proto_activated(&memcg->tcp_mem))
600
601
602
603
604
605
606
607
608
		return;
	static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif

609
#ifdef CONFIG_MEMCG_KMEM
610
611
/*
 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
Li Zefan's avatar
Li Zefan committed
612
613
614
615
616
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
617
618
619
620
621
622
 *
 * The current size of the caches array is stored in
 * memcg_limited_groups_array_size.  It will double each time we have to
 * increase it.
 */
static DEFINE_IDA(kmem_limited_groups);
623
624
int memcg_limited_groups_array_size;

625
626
627
628
629
630
/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
Li Zefan's avatar
Li Zefan committed
631
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
632
633
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
Li Zefan's avatar
Li Zefan committed
634
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
635
636
637
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
Li Zefan's avatar
Li Zefan committed
638
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
639

640
641
642
643
644
645
/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that does
 * kmem_cache_alloc and the such to see this symbol as well
 */
646
struct static_key memcg_kmem_enabled_key;
647
EXPORT_SYMBOL(memcg_kmem_enabled_key);
648
649
650

static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
651
	if (memcg_kmem_is_active(memcg)) {
652
		static_key_slow_dec(&memcg_kmem_enabled_key);
653
654
		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
	}
655
656
657
658
659
	/*
	 * This check can't live in kmem destruction function,
	 * since the charges will outlive the cgroup
	 */
	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
660
661
662
663
664
665
666
667
668
669
670
671
672
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/* Drop every static-key reference held by @memcg (socket + kmem). */
static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}

673
static void drain_all_stock_async(struct mem_cgroup *memcg);
674

675
static struct mem_cgroup_per_zone *
676
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
677
{
678
679
680
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);

681
	return &memcg->nodeinfo[nid]->zoneinfo[zid];
682
683
}

684
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
685
{
686
	return &memcg->css;
687
688
}

689
static struct mem_cgroup_per_zone *
690
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
691
{
692
693
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
694

695
	return &memcg->nodeinfo[nid]->zoneinfo[zid];
696
697
}

698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
/* Soft-limit RB-tree bucket for the given (node, zone) pair. */
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

/* Soft-limit RB-tree bucket covering the node/zone @page lives in. */
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	return soft_limit_tree_node_zone(page_to_nid(page),
					 page_zonenum(page));
}

713
714
715
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz,
					 unsigned long long new_usage_in_excess)
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

745
746
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz)
747
748
749
750
751
752
753
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

754
755
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
				       struct mem_cgroup_tree_per_zone *mctz)
756
757
{
	spin_lock(&mctz->lock);
758
	__mem_cgroup_remove_exceeded(mz, mctz);
759
760
761
762
763
764
765
766
767
768
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

769
	mctz = soft_limit_tree_from_page(page);
770
771
772
773
774
	/*
	 * Necessary to update all ancestors when hierarchy is used.
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
775
		mz = mem_cgroup_page_zoneinfo(memcg, page);
776
777
778
779
780
781
782
783
784
		excess = res_counter_soft_limit_excess(&memcg->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
785
				__mem_cgroup_remove_exceeded(mz, mctz);
786
787
788
789
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
790
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
791
792
793
794
795
796
797
798
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_zone *mctz;
799
800
	struct mem_cgroup_per_zone *mz;
	int nid, zid;
801

802
803
804
805
	for_each_node(nid) {
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			mctz = soft_limit_tree_node_zone(nid, zid);
806
			mem_cgroup_remove_exceeded(mz, mctz);
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will to add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
829
	__mem_cgroup_remove_exceeded(mz, mctz);
830
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
831
	    !css_tryget_online(&mz->memcg->css))
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
		goto retry;
done:
	return mz;
}

/*
 * Locked wrapper around __mem_cgroup_largest_soft_limit_node():
 * pick the biggest soft-limit offender under mctz->lock.
 */
static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both of vmstat[] and percpu_counter has threshold and do periodic
 * synchronization to implement "quick" read. There are trade-off between
 * reading cost and precision of value. Then, we may have a chance to implement
 * a periodic synchronizion of counter in memcg's counter.
 *
 * But this _read() function is used for user interface now. The user accounts
 * memory usage by memory cgroup and he _always_ requires exact value because
 * he accounts memory. Even if we provide quick-and-fuzzy read, we always
 * have to visit all online cpus and make sum. So, for now, unnecessary
 * synchronization is not implemented. (just implemented for cpu hotplug)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu value can be performance bottleneck in some
 * common workload, threashold and synchonization as vmstat[] should be
 * implemented.
 */
867
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
868
				 enum mem_cgroup_stat_index idx)
869
{
870
	long val = 0;
871
872
	int cpu;

873
874
	get_online_cpus();
	for_each_online_cpu(cpu)
875
		val += per_cpu(memcg->stat->count[idx], cpu);
876
#ifdef CONFIG_HOTPLUG_CPU
877
878
879
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
880
881
#endif
	put_online_cpus();
882
883
884
	return val;
}

885
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
886
887
888
					 bool charge)
{
	int val = (charge) ? 1 : -1;
889
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
890
891
}

892
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
893
894
895
896
897
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

898
	get_online_cpus();
899
	for_each_online_cpu(cpu)
900
		val += per_cpu(memcg->stat->events[idx], cpu);
901
#ifdef CONFIG_HOTPLUG_CPU
902
903
904
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
905
#endif
906
	put_online_cpus();
907
908
909
	return val;
}

910
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
911
					 struct page *page,
912
					 bool anon, int nr_pages)
913
{
914
915
916
917
918
919
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
920
				nr_pages);
921
	else
922
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
923
				nr_pages);
924

925
926
927
928
	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

929
930
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
931
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
932
	else {
933
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
934
935
		nr_pages = -nr_pages; /* for event */
	}
936

937
	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
938
939
}

940
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
941
942
943
944
945
946
947
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

948
949
950
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
						  int nid,
						  unsigned int lru_mask)
951
{
952
	unsigned long nr = 0;
953
954
	int zid;

955
	VM_BUG_ON((unsigned)nid >= nr_node_ids);
956

957
958
959
960
961
962
963
964
965
966
967
968
	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct mem_cgroup_per_zone *mz;
		enum lru_list lru;

		for_each_lru(lru) {
			if (!(BIT(lru) & lru_mask))
				continue;
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			nr += mz->lru_size[lru];
		}
	}
	return nr;
969
}
970

971
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
972
			unsigned int lru_mask)
973
{
974
	unsigned long nr = 0;
975
	int nid;
976

977
	for_each_node_state(nid, N_MEMORY)
978
979
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
980
981
}

982
983
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
984
985
986
{
	unsigned long val, next;

987
	val = __this_cpu_read(memcg->stat->nr_page_events);
988
	next = __this_cpu_read(memcg->stat->targets[target]);
989
	/* from time_after() in jiffies.h */
990
991
992
993
994
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
995
996
997
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
998
999
1000
1001
1002
1003
1004
1005
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
1006
	}
1007
	return false;
1008
1009
1010
1011
1012
1013
}

/*
 * Check events in order.
 *
 */
1014
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1015
{
1016
	preempt_disable();
1017
	/* threshold event is triggered in finer grain than soft limit */
1018
1019
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
1020
		bool do_softlimit;
1021
		bool do_numainfo __maybe_unused;
1022

1023
1024
		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
1025
1026
1027
1028
1029
1030
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		preempt_enable();

1031
		mem_cgroup_threshold(memcg);
1032
1033
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
1034
#if MAX_NUMNODES > 1
1035
		if (unlikely(do_numainfo))
1036
			atomic_inc(&memcg->numainfo_events);
1037
#endif
1038
1039
	} else
		preempt_enable();
1040
1041
}

1042
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1043
{
1044
1045
1046
1047
1048
1049
1050
1051
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

1052
	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1053
1054
}

1055
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1056
{
1057
	struct mem_cgroup *memcg = NULL;
1058

1059
1060
	rcu_read_lock();
	do {
1061
1062
1063
1064
1065
1066
		/*
		 * Page cache insertions can happen withou an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
1067
			memcg = root_mem_cgroup;
1068
1069
1070
1071
1072
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
1073
	} while (!css_tryget_online(&memcg->css));
1074
	rcu_read_unlock();
1075
	return memcg;
1076
1077
}

1078
1079
1080
1081
1082
1083
1084
/*
 * Returns a next (in a pre-order walk) alive memcg (with elevated css
 * ref. count) or NULL if the whole root's subtree has been visited.
 *
 * helper function to be used by mem_cgroup_iter
 */
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1085
		struct mem_cgroup *last_visited)
1086
{
1087
	struct cgroup_subsys_state *prev_css, *next_css;
1088

1089
	prev_css = last_visited ? &last_visited->css : NULL;
1090
skip_node:
1091
	next_css = css_next_descendant_pre(prev_css, &root->css);
1092
1093
1094
1095
1096
1097
1098

	/*
	 * Even if we found a group we have to make sure it is
	 * alive. css && !memcg means that the groups should be
	 * skipped and we should continue the tree walk.
	 * last_visited css is safe to use because it is
	 * protected by css_get and the tree walk is rcu safe.
1099
1100
1101
1102
1103
1104
1105
1106
	 *
	 * We do not take a reference on the root of the tree walk
	 * because we might race with the root removal when it would
	 * be the only node in the iterated hierarchy and mem_cgroup_iter
	 * would end up in an endless loop because it expects that at
	 * least one valid node will be returned. Root cannot disappear
	 * because caller of the iterator should hold it already so
	 * skipping css reference should be safe.
1107
	 */
1108
	if (next_css) {
1109
		if ((next_css == &root->css) ||
1110
1111
		    ((next_css->flags & CSS_ONLINE) &&
		     css_tryget_online(next_css)))
1112
			return mem_cgroup_from_css(next_css);
1113
1114
1115

		prev_css = next_css;
		goto skip_node;
1116
1117
1118
1119
1120
	}

	return NULL;
}

static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
{
	/*
	 * When a group in the hierarchy below root is destroyed, the
	 * hierarchy iterator can no longer be trusted since it might
	 * have pointed to the destroyed group.  Invalidate it.