/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}

static const char *const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

/*
 * Cgroups above their limits are maintained in an RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removing.  This callback must be set,
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
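
/*
 * Worked example of the encoding above (an illustrative sketch, not code
 * from this file): the resource type is packed into the upper 16 bits of
 * cft->private and the attribute into the lower 16 bits, so
 *
 *	priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);
 *
 * round-trips as MEMFILE_TYPE(priv) == _OOM_TYPE and
 * MEMFILE_ATTR(priv) == OOM_CONTROL.
 */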

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
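
/*
 * Usage sketch for the iterators above (illustrative only; the helper
 * wants_to_stop() is a placeholder): a walk that bails out early must
 * hand the last returned memcg back to mem_cgroup_iter_break() so its
 * css reference is dropped. mem_cgroup_scan_tasks() below follows this
 * pattern.
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (wants_to_stop(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */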

static inline bool should_force_charge(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

struct workqueue_struct *memcg_kmem_cache_wq;

static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);

static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
{
	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
}

static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
					 int size, int old_size)
{
	struct memcg_shrinker_map *new, *old;
	int nid;

	lockdep_assert_held(&memcg_shrinker_map_mutex);

	for_each_node(nid) {
		old = rcu_dereference_protected(
			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
		/* Not yet online memcg */
		if (!old)
			return 0;

		new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
		if (!new)
			return -ENOMEM;

		/* Set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_size);
		memset((void *)new->map + old_size, 0, size - old_size);

		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
	}

	return 0;
}

static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct memcg_shrinker_map *map;
	int nid;

	if (mem_cgroup_is_root(memcg))
		return;

	for_each_node(nid) {
		pn = mem_cgroup_nodeinfo(memcg, nid);
		map = rcu_dereference_protected(pn->shrinker_map, true);
		if (map)
			kvfree(map);
		rcu_assign_pointer(pn->shrinker_map, NULL);
	}
}

static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	struct memcg_shrinker_map *map;
	int nid, size, ret = 0;

	if (mem_cgroup_is_root(memcg))
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	size = memcg_shrinker_map_size;
	for_each_node(nid) {
		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
		if (!map) {
			memcg_free_shrinker_maps(memcg);
			ret = -ENOMEM;
			break;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
	}
	mutex_unlock(&memcg_shrinker_map_mutex);

	return ret;
}

int memcg_expand_shrinker_maps(int new_id)
{
	int size, old_size, ret = 0;
	struct mem_cgroup *memcg;

	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
	old_size = memcg_shrinker_map_size;
	if (size <= old_size)
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	if (!root_mem_cgroup)
		goto unlock;

	for_each_mem_cgroup(memcg) {
		if (mem_cgroup_is_root(memcg))
			continue;
		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
		if (ret)
			goto unlock;
	}
unlock:
	if (!ret)
		memcg_shrinker_map_size = size;
	mutex_unlock(&memcg_shrinker_map_mutex);
	return ret;
}

void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct memcg_shrinker_map *map;

		rcu_read_lock();
		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, map->map);
		rcu_read_unlock();
	}
}
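
/*
 * Sketch of how the shrinker map is consumed (an approximation of the
 * shrink_slab() path in vmscan.c; run_one_shrinker() and nr_shrinker_ids
 * are placeholders): once a shrinker has marked itself in a memcg's map
 * via memcg_set_shrinker_bit(), reclaim only visits the set bits instead
 * of every registered shrinker.
 *
 *	rcu_read_lock();
 *	map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
 *	for_each_set_bit(i, map->map, nr_shrinker_ids)
 *		run_one_shrinker(i);
 *	rcu_read_unlock();
 */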

#else /* CONFIG_MEMCG_KMEM */
static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	return 0;
}
static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
#endif /* CONFIG_MEMCG_KMEM */

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = READ_ONCE(page->mem_cgroup);
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}
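
/*
 * Usage sketch (illustrative; the in-tree consumer is the
 * /proc/kpagecgroup interface, and pfn/out below are placeholders):
 * because the cgroup may be torn down right after the call, the inode
 * number is only reported to userspace, never dereferenced.
 *
 *	ino_t ino = page_cgroup_ino(pfn_to_page(pfn));
 *
 *	if (put_user(ino, out))
 *		return -EFAULT;
 */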

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long x;

	if (mem_cgroup_disabled())
		return;

	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		atomic_long_add(x, &memcg->vmstats_local[idx]);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmstats[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
}
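
/*
 * Example (mirroring mem_cgroup_charge_statistics() below): charging
 * nr_pages of anonymous memory is accounted with
 *
 *	__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
 *
 * The delta stays in the per-cpu counter until its absolute value
 * exceeds MEMCG_CHARGE_BATCH, at which point it is folded into the
 * atomic counters of this memcg and all of its ancestors.
 */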

static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *parent;

	parent = parent_mem_cgroup(pn->memcg);
	if (!parent)
		return NULL;
	return mem_cgroup_nodeinfo(parent, nid);
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	pg_data_t *pgdat = lruvec_pgdat(lruvec);
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	long x;

	/* Update node */
	__mod_node_page_state(pgdat, idx, val);

	if (mem_cgroup_disabled())
		return;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup_per_node *pi;

		atomic_long_add(x, &pn->lruvec_stat_local[idx]);
		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
			atomic_long_add(x, &pi->lruvec_stat[idx]);
		x = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}
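
/*
 * Usage sketch (illustrative; nr is a placeholder delta): accounting nr
 * shmem pages against a lruvec is a single call,
 *
 *	__mod_lruvec_state(lruvec, NR_SHMEM, nr);
 *
 * which bumps the node counter, the owning memcg's counter and the
 * per-lruvec counter together, with the same per-cpu batching as
 * __mod_memcg_state() above.
 */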

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	unsigned long x;

	if (mem_cgroup_disabled())
		return;

	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		atomic_long_add(x, &memcg->vmevents_local[idx]);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmevents[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	return atomic_long_read(&memcg->vmevents[event]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	return atomic_long_read(&memcg->vmevents_local[event]);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
	else {
		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
		if (PageSwapBacked(page))
			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
	}

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
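
/*
 * Usage sketch (illustrative): the reference obtained here must be
 * dropped with css_put() once the caller is done with the memcg.
 *
 *	memcg = get_mem_cgroup_from_mm(current->mm);
 *	if (memcg) {
 *		... use memcg ...
 *		css_put(&memcg->css);
 *	}
 */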

/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtain a reference on page->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	if (!memcg || !css_tryget_online(&memcg->css))
		memcg = root_mem_cgroup;
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);

/**
 * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	if (unlikely(current->active_memcg)) {
		struct mem_cgroup *memcg = root_mem_cgroup;

		rcu_read_lock();
		if (css_tryget_online(&current->active_memcg->css))
			memcg = current->active_memcg;
		rcu_read_unlock();
		return memcg;
	}
	return get_mem_cgroup_from_mm(current->mm);
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}
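
/*
 * Sketch of a shared reclaim walk (modelled on how the reclaim code in
 * vmscan.c drives this iterator; shrink_one_memcg() is a placeholder
 * for the per-memcg work): concurrent reclaimers passing the same
 * cookie fan out over the hierarchy instead of all hammering the same
 * group.
 *
 *	struct mem_cgroup_reclaim_cookie reclaim = {
 *		.pgdat = pgdat,
 *		.priority = priority,
 *	};
 *	struct mem_cgroup *memcg;
 *
 *	memcg = mem_cgroup_iter(root, NULL, &reclaim);
 *	do {
 *		shrink_one_memcg(memcg);
 *	} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 */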

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;
	int i;

	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		for_each_node(nid) {
			mz = mem_cgroup_nodeinfo(memcg, nid);
			for (i = 0; i <= DEF_PRIORITY; i++) {
				iter = &mz->iter[i];
				cmpxchg(&iter->position,
					dead_memcg, NULL);
			}
		}
	}
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(memcg == root_mem_cgroup);

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, 0, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}
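
/*
 * Usage sketch (illustrative; count_task() is a placeholder callback,
 * the memcg OOM path is the in-tree user): the callback runs for every
 * task in the hierarchy until it returns a non-zero value.
 *
 *	static int count_task(struct task_struct *task, void *arg)
 *	{
 *		(*(int *)arg)++;
 *		return 0;
 *	}
 *
 *	int nr = 0;
 *
 *	mem_cgroup_scan_tasks(memcg, count_task, &nr);
 */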

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup *memcg;