/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

enum hugetlb_memory_event {
	HUGETLB_MAX,
	HUGETLB_NR_MEMORY_EVENTS,
};

struct hugetlb_cgroup {
	struct cgroup_subsys_state css;

	/*
	 * the counter to account for hugepages from hugetlb.
	 */
	struct page_counter hugepage[HUGE_MAX_HSTATE];

	atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
	atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];

	/* Handle for "hugetlb.events" */
	struct cgroup_file events_file[HUGE_MAX_HSTATE];

	/* Handle for "hugetlb.events.local" */
	struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
};

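/*
 * For the per-hstate control files, cftype->private packs two values:
 * the hstate index in the upper 16 bits and the resource attribute
 * (RES_USAGE, RES_LIMIT, ...) in the lower 16 bits.  MEMFILE_IDX() and
 * MEMFILE_ATTR() recover the two halves.
 */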
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

#define hugetlb_cgroup_from_counter(counter, idx)                   \
	container_of(counter, struct hugetlb_cgroup, hugepage[idx])

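/*
 * The root cgroup: it is never given a limit and is the fallback parent
 * when charges are reparented in hugetlb_cgroup_move_parent().
 */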
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}

static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
	int idx;

	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
		if (page_counter_read(&h_cg->hugepage[idx]))
			return true;
	}
	return false;
}

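/*
 * Set up one page counter per hstate, chained to the matching counter of
 * the parent cgroup (if any), and initialise each limit to the largest
 * value that is still a whole number of huge pages.
 */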
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
				struct hugetlb_cgroup *parent_h_cgroup)
{
	int idx;

	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
		struct page_counter *counter = &h_cgroup->hugepage[idx];
		struct page_counter *parent = NULL;
		unsigned long limit;
		int ret;

		if (parent_h_cgroup)
			parent = &parent_h_cgroup->hugepage[idx];
		page_counter_init(counter, parent);

		limit = round_down(PAGE_COUNTER_MAX,
				   1 << huge_page_order(&hstates[idx]));
		ret = page_counter_set_max(counter, limit);
		VM_BUG_ON(ret);
	}
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;

	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (!parent_h_cgroup)
		root_h_cgroup = h_cgroup;

	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
	return &h_cgroup->css;
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cgroup;

	h_cgroup = hugetlb_cgroup_from_css(css);
	kfree(h_cgroup);
}

/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot be moved from the
 * active list or uncharged from the cgroup, so there is no need to take
 * a page reference or to test whether the page is active here.  This
 * function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *page_hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

	page_hcg = hugetlb_cgroup_from_page(page);
	/*
	 * We can have pages on the active list that are not attached to
	 * any cgroup, i.e. hugepages with fewer than 3 pages.  We can
	 * safely ignore those pages.
	 */
	if (!page_hcg || page_hcg != h_cg)
		goto out;

	nr_pages = compound_nr(page);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	set_hugetlb_cgroup(page, parent);
out:
	return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;
	int idx = 0;

	do {
		for_each_hstate(h) {
			spin_lock(&hugetlb_lock);
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(idx, h_cg, page);

			spin_unlock(&hugetlb_lock);
			idx++;
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}

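/*
 * Record a memory event: bump the local counter of @hugetlb and notify
 * its "events.local" file, then propagate the event up the hierarchy,
 * notifying each ancestor's "events" file.
 */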
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
				 enum hugetlb_memory_event event)
{
	atomic_long_inc(&hugetlb->events_local[idx][event]);
	cgroup_file_notify(&hugetlb->events_local_file[idx]);

	do {
		atomic_long_inc(&hugetlb->events[idx][event]);
		cgroup_file_notify(&hugetlb->events_file[idx]);
	} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
		 !hugetlb_cgroup_is_root(hugetlb));
}

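/*
 * Charge @nr_pages pages of hstate @idx against the current task's
 * cgroup.  The cgroup is returned in @ptr so that the caller can later
 * commit the charge to a page with hugetlb_cgroup_commit_charge().
 */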
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
	/*
	 * We don't charge any cgroup if the compound page has fewer
	 * than 3 pages.
	 */
	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	if (!css_tryget(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
				     &counter)) {
		ret = -ENOMEM;
		hugetlb_event(h_cg, idx, HUGETLB_MAX);
	}
	css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}

/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	set_hugetlb_cgroup(page, h_cg);
	return;
}

/*
 * Should be called with hugetlb_lock held
 */
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
				  struct page *page)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(page);
	if (unlikely(!h_cg))
		return;
	set_hugetlb_cgroup(page, NULL);
	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
	return;
}

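/*
 * Uncharge a cgroup directly, without a page: the counterpart of
 * hugetlb_cgroup_charge_cgroup() for charges that were never committed
 * to a page.
 */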
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		return;

	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
	return;
}

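/*
 * Which page_counter field a control file operates on; stored in the
 * low 16 bits of cftype->private via MEMFILE_PRIVATE().
 */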
enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
};

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	default:
		BUG();
	}
}

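/*
 * seq_show handler for the cgroup v2 files.  Usage is reported in bytes;
 * a limit equal to the rounded-down PAGE_COUNTER_MAX default is shown
 * as "max".
 */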
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
	int idx;
	u64 val;
	struct cftype *cft = seq_cft(seq);
	unsigned long limit;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);
	counter = &h_cg->hugepage[idx];

	limit = round_down(PAGE_COUNTER_MAX,
			   1 << huge_page_order(&hstates[idx]));

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		val = (u64)page_counter_read(counter);
		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	case RES_LIMIT:
		val = (u64)counter->max;
		if (val == limit)
			seq_puts(seq, "max\n");
		else
			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	default:
		BUG();
	}

	return 0;
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

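/*
 * Common write handler for the limit files.  @max is the keyword that
 * means "no limit" ("-1" on the legacy hierarchy, "max" on cgroup v2);
 * the requested value is rounded down to a whole number of huge pages.
 */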
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off,
				    const char *max)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, max, &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);
	nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
					char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}

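/*
 * Write handler for the legacy max_usage_in_bytes and failcnt files:
 * writing to them resets the watermark or the failure counter.
 */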
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30))
		snprintf(buf, size, "%luGB", hsize >> 30);
	else if (hsize >= (1UL << 20))
		snprintf(buf, size, "%luMB", hsize >> 20);
	else
		snprintf(buf, size, "%luKB", hsize >> 10);
	return buf;
}

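/*
 * Show the "events" or "events.local" file; the only event currently
 * reported is "max", the number of charge attempts that hit the limit.
 */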
static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
	int idx;
	long max;
	struct cftype *cft = seq_cft(seq);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);

	if (local)
		max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
	else
		max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

	seq_printf(seq, "max %lu\n", max);

	return 0;
}

static int hugetlb_events_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, false);
}

static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, true);
}

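/*
 * Create the cgroup v2 control files for one hstate.  With the
 * "hugetlb." prefix added by the cgroup core these appear as e.g.
 * hugetlb.2MB.max, hugetlb.2MB.current, hugetlb.2MB.events and
 * hugetlb.2MB.events.local for a 2MB hstate.
 */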
static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, 32, huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files_dfl[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->write = hugetlb_cgroup_write_dfl;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the current usage file */
	cft = &h->cgroup_files_dfl[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the events file */
	cft = &h->cgroup_files_dfl[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_events_show;
	cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the events.local file */
	cft = &h->cgroup_files_dfl[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_events_local_show;
	cft->file_offset = offsetof(struct hugetlb_cgroup,
				    events_local_file[idx]);
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files_dfl[4];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
				       h->cgroup_files_dfl));
}

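/*
 * Create the cgroup v1 (legacy) control files for one hstate, e.g.
 * hugetlb.2MB.limit_in_bytes, hugetlb.2MB.usage_in_bytes,
 * hugetlb.2MB.max_usage_in_bytes and hugetlb.2MB.failcnt.
 */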
static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, 32, huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files_legacy[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write_legacy;

	/* Add the usage file */
	cft = &h->cgroup_files_legacy[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file */
	cft = &h->cgroup_files_legacy[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file */
	cft = &h->cgroup_files_legacy[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files_legacy[4];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  h->cgroup_files_legacy));
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
	__hugetlb_cgroup_file_dfl_init(idx);
	__hugetlb_cgroup_file_legacy_init(idx);
}

void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Add cgroup control files only if the huge page consists
		 * of more than two normal pages. This is because we use
		 * page[2].private for storing cgroup details.
		 */
		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
			__hugetlb_cgroup_file_init(hstate_index(h));
	}
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
	struct hugetlb_cgroup *h_cg;
	struct hstate *h = page_hstate(oldhpage);

	if (hugetlb_cgroup_disabled())
		return;

	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
	spin_lock(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(oldhpage);
	set_hugetlb_cgroup(oldhpage, NULL);

	/* move the h_cg details to new cgroup */
	set_hugetlb_cgroup(newhpage, h_cg);
	list_move(&newhpage->lru, &h->hugepage_activelist);
	spin_unlock(&hugetlb_lock);
	return;
}

static struct cftype hugetlb_files[] = {
	{} /* terminate */
};

struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc	= hugetlb_cgroup_css_alloc,
	.css_offline	= hugetlb_cgroup_css_offline,
	.css_free	= hugetlb_cgroup_css_free,
	.dfl_cftypes	= hugetlb_files,
	.legacy_cftypes	= hugetlb_files,
};