/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* for remembering the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif


/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"swap",
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better than
 * using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024
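
/*
 * Illustrative note (not in the original source): these targets are the step
 * sizes used by mem_cgroup_event_ratelimit() below. Roughly, threshold
 * notifications are re-checked about every 128 page events, while soft limit
 * tree updates and NUMA info refreshes happen about every 1024 page events.
 */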

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;
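
/*
 * Illustrative note: soft_limit_tree is a global set of per-node, per-zone
 * RB-trees. Each tree orders mem_cgroup_per_zone entries by usage_in_excess
 * (how far the group is above its soft limit), with equal keys pushed to the
 * right, so the rightmost node is always a maximal offender; see
 * __mem_cgroup_largest_soft_limit_node() below, which simply takes rb_last().
 */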

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};
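
/*
 * Usage sketch (based on the documented cgroup v1 memory interface, not on
 * code shown here): a threshold is typically armed from userspace by creating
 * an eventfd, opening memory.usage_in_bytes, and writing
 * "<eventfd> <usage fd> <threshold in bytes>" to cgroup.event_control;
 * mem_cgroup_threshold() then signals the eventfd when usage crosses the
 * registered value in either direction.
 */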

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;

	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long 	move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct tcp_memcontrol tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list. per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
        /* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};

static size_t memcg_size(void)
{
	return sizeof(struct mem_cgroup) +
		nr_node_ids * sizeof(struct mem_cgroup_per_node);
}
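
/*
 * Illustrative note: nodeinfo[] above is a zero-length array and must stay
 * the last member, so per-node data can be laid out directly behind the base
 * struct mem_cgroup in one block; memcg_size() returns sizeof(struct
 * mem_cgroup) plus room for nr_node_ids per-node entries, which is the size
 * intended for that combined allocation.
 */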

/* internal only representation about the status of kmem accounting. */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))

#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
{
	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
	/*
	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
	 * will call css_put() if it sees the memcg is dead.
	 */
	smp_wmb();
	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}

static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
				  &memcg->kmem_account_flags);
}
#endif

/* Stuffs for move charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long immigrate_flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
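
/*
 * Usage sketch (derived from the enum move_type bits above, not new
 * behaviour): userspace writes a bitmask to memory.move_charge_at_immigrate;
 * bit 0 (value 1) selects private anonymous pages, bit 1 (value 2) selects
 * file pages, and 3 selects both. move_anon()/move_file() below simply test
 * those bits in mc.immigrate_flags.
 */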

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
}

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
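
/*
 * Example (for illustration): a limit file for the plain memory counter would
 * use MEMFILE_PRIVATE(_MEM, RES_LIMIT) as its cft->private value;
 * MEMFILE_TYPE() then recovers _MEM from the upper 16 bits and MEMFILE_ATTR()
 * recovers RES_LIMIT from the lower 16 bits.
 */
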
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);

static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return container_of(s, struct mem_cgroup, css);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
	return &mem_cgroup_from_css(css)->vmpressure;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)

void sock_update_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled) {
		struct mem_cgroup *memcg;
		struct cg_proto *cg_proto;

		BUG_ON(!sk->sk_prot->proto_cgroup);

		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't, however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
		if (sk->sk_cgrp) {
			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
			css_get(&sk->sk_cgrp->memcg->css);
			return;
		}

		rcu_read_lock();
		memcg = mem_cgroup_from_task(current);
		cg_proto = sk->sk_prot->proto_cgroup(memcg);
		if (!mem_cgroup_is_root(memcg) &&
		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
			sk->sk_cgrp = cg_proto;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(sock_update_memcg);

void sock_release_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
		struct mem_cgroup *memcg;
		WARN_ON(!sk->sk_cgrp->memcg);
		memcg = sk->sk_cgrp->memcg;
		css_put(&sk->sk_cgrp->memcg->css);
	}
}

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg || mem_cgroup_is_root(memcg))
		return NULL;

	return &memcg->tcp_mem.cg_proto;
}
EXPORT_SYMBOL(tcp_proto_cgroup);

static void disarm_sock_keys(struct mem_cgroup *memcg)
{
	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
		return;
	static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif

#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
 * There are two main reasons for not using the css_id for this:
 *  1) this works better in sparse environments, where we have a lot of memcgs,
 *     but only a few kmem-limited. Or also, if we have, for instance, 200
 *     memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *     200 entry array for that.
 *
 *  2) In order not to violate the cgroup API, we would like to do all memory
 *     allocation in ->create(). At that point, we haven't yet allocated the
 *     css_id. Having a separate index prevents us from messing with the cgroup
 *     core for this
 *
 * The current size of the caches array is stored in
 * memcg_limited_groups_array_size.  It will double each time we have to
 * increase it.
 */
static DEFINE_IDA(kmem_limited_groups);
int memcg_limited_groups_array_size;

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * css_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);

static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
	if (memcg_kmem_is_active(memcg)) {
		static_key_slow_dec(&memcg_kmem_enabled_key);
		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
	}
	/*
	 * This check can't live in the kmem destruction function,
	 * since the charges will outlive the cgroup
	 */
	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */

static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}

static void drain_all_stock_async(struct mem_cgroup *memcg);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
	VM_BUG_ON((unsigned)nid >= nr_node_ids);
	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
	return &memcg->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(memcg, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}
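
/*
 * Illustrative note: because equal usage_in_excess keys are sent to the right
 * child above, the rightmost node of each tree is always one of the groups
 * furthest over its soft limit, which is what
 * __mem_cgroup_largest_soft_limit_node() relies on when it takes rb_last().
 */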

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when the hierarchy is used,
	 * because their event counters are not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
		excess = res_counter_soft_limit_excess(&memcg->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node(node) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(memcg, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(memcg, mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
		!css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of the value, so we may have a chance to
 * implement periodic synchronization of the counters in memcg as well.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact
 * value, because memory is being accounted. Even with a quick-and-fuzzy
 * read we would still have to visit all online cpus and compute the sum.
 * So, for now, unnecessary synchronization is not implemented. (It is only
 * implemented for cpu hotplug.)
 *
 * If there are kernel-internal users which can make use of a not-exact
 * value, and reading all cpu values becomes a performance bottleneck in
 * some common workload, thresholds and synchronization as in vmstat[]
 * should be implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}
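
/*
 * Illustrative note: the sum above combines the live per-cpu counters with
 * nocpu_base, which accumulates the counts of cpus that have gone offline;
 * without it, statistics charged from a since-offlined cpu would be lost.
 */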

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool anon, int nr_pages)
{
	preempt_disable();

	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

	preempt_enable();
}

unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
			unsigned int lru_mask)
{
	struct mem_cgroup_per_zone *mz;
	enum lru_list lru;
	unsigned long ret = 0;

	mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			ret += mz->lru_size[lru];
	}
	return ret;
}

static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
			int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(memcg,
						nid, zid, lru_mask);

	return total;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	int nid;
	u64 total = 0;

	for_each_node_state(nid, N_MEMORY)
		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return total;
}
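
/*
 * Example (for illustration): a caller interested in all file-backed pages in
 * a memcg could pass
 *
 *	mem_cgroup_nr_lru_pages(memcg,
 *				BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE));
 *
 * since lru_mask is interpreted as a bitmap over enum lru_list, exactly as
 * for_each_lru()/BIT(lru) is used in mem_cgroup_zone_nr_lru_pages() above.
 */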

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}
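
/*
 * Illustrative note: the "(long)next - (long)val < 0" test above is the same
 * wrap-safe trick used by time_after(): only the signed difference is
 * examined, so the comparison stays correct even once the per-cpu event
 * counter eventually wraps around.
 */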

/*
 * Check events in order.
 *
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	preempt_disable();
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		preempt_enable();

		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	} else
		preempt_enable();
}

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return mem_cgroup_from_css(
		cgroup_subsys_state(cont, mem_cgroup_subsys_id));
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
}

struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	if (!mm)
		return NULL;
	/*
	 * Because we have no locks, mm->owner may be being moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			break;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/*
 * Returns a next (in a pre-order walk) alive memcg (with elevated css
 * ref. count) or NULL if the whole root's subtree has been visited.
 *
 * helper function to be used by mem_cgroup_iter
 */
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
		struct mem_cgroup *last_visited)
{
	struct cgroup *prev_cgroup, *next_cgroup;

	/*
	 * Root is not visited by cgroup iterators so it needs an
	 * explicit visit.
	 */
	if (!last_visited)
		return root;

	prev_cgroup = (last_visited == root) ? NULL
		: last_visited->css.cgroup;
skip_node:
	next_cgroup = cgroup_next_descendant_pre(
			prev_cgroup, root->css.cgroup);

	/*
	 * Even if we found a group we have to make sure it is
	 * alive. css && !memcg means that the groups should be
	 * skipped and we should continue the tree walk.
	 * last_visited css is safe to use because it is
	 * protected by css_get and the tree walk is rcu safe.
	 */
	if (next_cgroup) {
		struct mem_cgroup *mem = mem_cgroup_from_cont(
				next_cgroup);
		if (css_tryget(&mem->css))
			return mem;
		else {
			prev_cgroup = next_cgroup;
			goto skip_node;
		}
	}

	return NULL;
}

static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
{
	/*
	 * When a group in the hierarchy below root is destroyed, the
	 * hierarchy iterator can no longer be trusted since it might
	 * have pointed to the destroyed group.  Invalidate it.
	 */
	atomic_inc(&root->dead_count);
}

static struct mem_cgroup *
mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
		     struct mem_cgroup *root,
		     int *sequence)
{
	struct mem_cgroup *position = NULL;
	/*
	 * A cgroup destruction happens in two stages: offlining and
	 * release.  They are separated by a RCU grace period.
	 *
	 * If the iterator is valid, we may still race with an
	 * offlining.  The RCU lock ensures the object won't be
	 * released, tryget will fail if we lost the race.
	 */
	*sequence = atomic_read(&root->dead_count);
	if (iter->last_dead_count == *sequence) {
		smp_rmb();
		position = iter->last_visited;
		if (position && !css_tryget(&position->css))
			position = NULL;
	}
	return position;
}

static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
				   struct mem_cgroup *last_visited,
				   struct mem_cgroup *new_position,
				   int sequence)
{
	if (last_visited)
		css_put(&last_visited->css);
	/*
	 * We store the sequence count from the time @last_visited was
	 * loaded successfully instead of rereading it here so that we
	 * don't lose destruction events in between.  We could have
	 * raced with the destruction of @new_position after all.
	 */
	iter->last_visited = new_position;
	smp_wmb();
	iter->last_dead_count = sequence;
}
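
/*
 * Illustrative note: the smp_wmb() in mem_cgroup_iter_update() pairs with the
 * smp_rmb() in mem_cgroup_iter_load(): a reader that sees the up-to-date
 * last_dead_count is also guaranteed to see the last_visited pointer that was
 * published before it, so a cached position is only reused when no group in
 * the hierarchy has been destroyed since it was stored.
 */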

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *last_visited = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		last_visited = prev;
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
1207

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto