/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

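/* Return the id of the memory block containing a given memory section. */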
static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

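/*
 * Device release callback: frees the memory_block once the last reference
 * to its struct device is dropped.
 */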
static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

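/*
 * Architectures may override this to report a memory block size larger
 * than a single section; the default is one section per block.
 */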
unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}

/*
 * Show the index (phys_index) of this memory block, derived from the
 * first physical section it spans.
 */

static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the memory block is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state != MEM_ONLINE)
		goto out;

	for (i = 0; i < sections_per_block; i++) {
		if (!present_section_nr(mem->start_section_nr + i))
			continue;
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

out:
	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

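/*
 * Call the blocking memory-hotplug notifier chain (MEM_GOING_ONLINE,
 * MEM_ONLINE, MEM_GOING_OFFLINE, MEM_OFFLINE, ...).
 */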
int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
	 * and assume memmap is contiguous within each section
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 * Must already be protected by mem_hotplug_begin().
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	int ret;

	start_pfn = section_nr_to_pfn(phys_index);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_reserved(start_pfn))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages, online_type);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

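/*
 * Transition a memory block between MEM_ONLINE and MEM_OFFLINE, provided
 * it is currently in the expected from_state_req state.
 */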
static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state,
				mem->online_type);

	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * If we are called from store_mem_state(), online_type will be
	 * set >= 0.  Otherwise we were called from the device online
	 * attribute and need to set the online_type.
	 */
	if (mem->online_type < 0)
		mem->online_type = MMOP_ONLINE_KEEP;

	/* Already under protection of mem_hotplug_begin() */
	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);

	/* clear online_type */
	mem->online_type = -1;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	/* Can't offline block with non-present sections */
	if (mem->section_count != sections_per_block)
		return -EINVAL;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

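/*
 * Writes to the "state" attribute request a block state transition; the
 * accepted values are "online", "online_kernel", "online_movable" and
 * "offline", e.g. (for a block numbered N):
 *
 *	echo offline > /sys/devices/system/memory/memoryN/state
 */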
static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret, online_type;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	if (sysfs_streq(buf, "online_kernel"))
		online_type = MMOP_ONLINE_KERNEL;
	else if (sysfs_streq(buf, "online_movable"))
		online_type = MMOP_ONLINE_MOVABLE;
	else if (sysfs_streq(buf, "online"))
		online_type = MMOP_ONLINE_KEEP;
	else if (sysfs_streq(buf, "offline"))
		online_type = MMOP_OFFLINE;
	else {
		ret = -EINVAL;
		goto err;
	}

	/*
	 * Memory hotplug needs to hold mem_hotplug_begin() for probe to find
	 * the correct memory block to online before doing device_online(dev),
	 * which will take dev->mutex.  Take the lock early to prevent an
	 * inversion; memory_subsys_online() callbacks are implemented
	 * assuming it's already protected.
	 */
	mem_hotplug_begin();

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE_KEEP:
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	mem_hotplug_done();
err:
	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

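/*
 * Report which zone(s) the pages of this memory block could be onlined
 * into, or the zone they currently belong to if the block is online.
 */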
#ifdef CONFIG_MEMORY_HOTREMOVE
static ssize_t show_valid_zones(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long valid_start_pfn, valid_end_pfn;
	bool append = false;
	int nid;

	/*
	 * A block that contains more than one zone can not be offlined.
	 * This can happen e.g. for ZONE_DMA and ZONE_DMA32.
	 */
	if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn))
		return sprintf(buf, "none\n");

	start_pfn = valid_start_pfn;
	nr_pages = valid_end_pfn - start_pfn;

	/*
	 * Check the existing zone. Make sure that we do that only on
	 * online nodes, otherwise page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
		goto out;
	}

	nid = pfn_to_nid(start_pfn);
	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
		strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name);
		append = true;
	}

	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
		if (append)
			strcat(buf, " ");
		strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
	}
out:
	strcat(buf, "\n");

	return strlen(buf);
}
static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
#endif

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

/*
 * Memory auto online policy.
 */

static ssize_t
show_auto_online_blocks(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	if (memhp_auto_online)
		return sprintf(buf, "online\n");
	else
		return sprintf(buf, "offline\n");
}

static ssize_t
store_auto_online_blocks(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t count)
{
	if (sysfs_streq(buf, "online"))
		memhp_auto_online = true;
	else if (sysfs_streq(buf, "offline"))
		memhp_auto_online = false;
	else
		return -EINVAL;

	return count;
}

static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks,
		   store_auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
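/*
 * Writing a start physical address to the "probe" attribute hot-adds that
 * memory block, e.g. (assuming a block-aligned address like 0x40000000):
 *
 *	echo 0x40000000 > /sys/devices/system/memory/probe
 */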
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = add_memory(nid, phys_addr,
			 MIN_MEMORY_BLOCK_SIZE * sections_per_block);

	if (ret)
		goto out;

	ret = count;
out:
	return ret;
}

static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
#endif

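/*
 * The soft/hard_offline_page attributes below take a physical address,
 * which is converted to a page frame number internally.
 */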
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
#endif

/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return to_memory_block(dev);
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	return device_register(&memory->dev);
}

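/*
 * Allocate and register a memory_block device for the block containing
 * the given section.
 */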
static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);

	*memory = mem;
	return ret;
}

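/*
 * Scan one block's worth of sections starting at base_section_nr and
 * register a memory_block device if any of them are present.
 */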
static int add_memory_block(int base_section_nr)
{
	struct memory_block *mem;
	int i, ret, section_count = 0, section_nr;

	for (i = base_section_nr;
	     (i < base_section_nr + sections_per_block) && i < NR_MEM_SECTIONS;
	     i++) {
		if (!present_section_nr(i))
			continue;
		if (section_count == 0)
			section_nr = i;
		section_count++;
	}

	if (section_count == 0)
		return 0;
	ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE);
	if (ret)
		return ret;
	mem->section_count = section_count;
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	int ret = 0;
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
		put_device(&mem->dev);
	} else {
		ret = init_memory_block(&mem, section, MEM_OFFLINE);
		if (ret)
			goto out;
		mem->section_count++;
	}

	if (mem->section_count == sections_per_block)
		ret = register_mem_sect_under_node(mem, nid);
out:
	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_section() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

static int remove_memory_section(unsigned long node_id,
			       struct mem_section *section, int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);

	/*
	 * Some users of the memory hotplug do not want/need memblock to
	 * track all sections. Skip over those.
	 */
	mem = find_memory_block(section);
	if (!mem)
		goto out_unlock;

	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0)
		unregister_memory(mem);
	else
		put_device(&mem->dev);

out_unlock:
	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_section(0, section, 0);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	mutex_lock(&mem_sysfs_mutex);
	for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) {
		/* Don't iterate over sections we know are !present: */
		if (i > __highest_present_section_nr)
			break;

		err = add_memory_block(i);
		if (!ret)
			ret = err;
	}
	mutex_unlock(&mem_sysfs_mutex);

out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}