/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}

static const char *const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
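
/*
 * Illustrative sketch (not part of the upstream source): how a cft->private
 * value round-trips through the MEMFILE_* helpers above.  The function name
 * is made up for demonstration purposes only.
 */
#if 0
static void memfile_encoding_example(void)
{
	/* pack a resource type and an attribute index into one value */
	int priv = MEMFILE_PRIVATE(_MEM, OOM_CONTROL);

	/* the two halves can be recovered independently */
	WARN_ON(MEMFILE_TYPE(priv) != _MEM);
	WARN_ON(MEMFILE_ATTR(priv) != OOM_CONTROL);
}
#endif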

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

#ifndef CONFIG_SLOB
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}
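
/*
 * Illustrative sketch (not part of the upstream source): a reader that wants
 * a stable view of memcg_nr_cache_ids brackets the access with the helpers
 * above.  The function name is made up for demonstration purposes only.
 */
#if 0
static int snapshot_nr_cache_ids(void)
{
	int nr;

	memcg_get_cache_ids();		/* blocks concurrent resizing */
	nr = memcg_nr_cache_ids;
	memcg_put_cache_ids();

	return nr;
}
#endif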

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

struct workqueue_struct *memcg_kmem_cache_wq;

#endif /* !CONFIG_SLOB */

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = READ_ONCE(page->mem_cgroup);
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}
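
/*
 * Illustrative sketch (not part of the upstream source): a procfs-style
 * reader that only wants a best-effort answer.  The function name and the
 * seq_file formatting are made up for demonstration purposes only.
 */
#if 0
static void report_page_memcg_ino(struct page *page, struct seq_file *m)
{
	ino_t ino = page_cgroup_ino(page);	/* 0 if the page is uncharged */

	if (ino)
		seq_printf(m, "memcg inode: %lu\n", (unsigned long)ino);
}
#endif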

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
				      int event)
{
	return atomic_long_read(&memcg->events[event]);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
	else {
		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
		if (PageSwapBacked(page))
			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
	}

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
}

unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
					   int nid, unsigned int lru_mask)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
	unsigned long nr = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		nr += mem_cgroup_get_lru_size(lruvec, lru);
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}
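
/*
 * Illustrative sketch (not part of the upstream source): lru_mask is a bit
 * mask of enum lru_list entries, so a caller interested only in file-backed
 * pages could build it like this.  The function name is made up for
 * demonstration purposes only.
 */
#if 0
static unsigned long file_lru_pages_on_node(struct mem_cgroup *memcg, int nid)
{
	unsigned int lru_mask = BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE);

	return mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
}
#endif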

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat_cpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;
	int i;

	while ((memcg = parent_mem_cgroup(memcg))) {
		for_each_node(nid) {
			mz = mem_cgroup_nodeinfo(memcg, nid);
			for (i = 0; i <= DEF_PRIORITY; i++) {
				iter = &mz->iter[i];
				cmpxchg(&iter->position,
					dead_memcg, NULL);
			}
		}
	}
}

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(memcg == root_mem_cgroup);

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, 0, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}
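
/*
 * Illustrative sketch (not part of the upstream source): what a @fn callback
 * for mem_cgroup_scan_tasks() can look like.  A non-zero return value stops
 * the walk early; the names below are made up for demonstration purposes.
 */
#if 0
static int count_tasks_fn(struct task_struct *task, void *arg)
{
	unsigned int *count = arg;

	(*count)++;
	return 0;		/* keep iterating over every task */
}

static unsigned int count_memcg_tasks(struct mem_cgroup *memcg)
{
	unsigned int count = 0;

	mem_cgroup_scan_tasks(memcg, count_tasks_fn, &count);
	return count;
}
#endif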

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &pgdat->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

	mz = mem_cgroup_page_nodeinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->pgdat here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->pgdat != pgdat))
		lruvec->pgdat = pgdat;
	return lruvec;
}
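
/*
 * Illustrative sketch (not part of the upstream source): the lookup is only
 * valid under the LRU lock and for a page that is (or is about to be) on an
 * LRU list.  The function name is made up for demonstration purposes only.
 */
#if 0
static void rotate_page_to_lru_tail(struct page *page, struct pglist_data *pgdat)
{
	struct lruvec *lruvec;

	spin_lock_irq(&pgdat->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, pgdat);
	if (PageLRU(page))
		list_move_tail(&page->lru, &lruvec->lists[page_lru(page)]);
	spin_unlock_irq(&pgdat->lru_lock);
}
#endif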

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}

bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
	struct mem_cgroup *task_memcg;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		task_memcg = get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		task_memcg = mem_cgroup_from_task(task);
		css_get(&task_memcg->css);
		rcu_read_unlock();
	}
	ret = mem_cgroup_is_descendant(task_memcg, memcg);
	css_put(&task_memcg->css);
	return ret;
}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup