// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long base_memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return base_memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}
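
/*
 * Worked example of the mapping above (numbers are illustrative, not a
 * statement about any particular architecture): with 128 MiB sections and
 * a 2 GiB memory block size, sections_per_block is 16, so section 35 lives
 * in memory block 35 / 16 = 2. phys_to_block_id() first rounds a physical
 * address down to a PFN via PFN_DOWN() before the section lookup.
 */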

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
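
/*
 * Illustrative sketch (not part of this file): registering a callback on
 * the notifier chain above. The callback and variable names are made up;
 * the MEM_* actions and struct memory_notify fields are real.
 *
 *	static int ex_mem_event(struct notifier_block *nb,
 *				unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		if (action == MEM_GOING_OFFLINE)
 *			pr_info("pfns [%lx, %lx) going offline\n",
 *				mn->start_pfn, mn->start_pfn + mn->nr_pages);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block ex_mem_nb = {
 *		.notifier_call = ex_mem_event,
 *	};
 *
 *	register_memory_notifier(&ex_mem_nb);
 */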
static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sprintf(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

/*
 * The probe routines leave the pages uninitialized, just as the bootmem code
 * does. Make sure we do not access them, but instead use only information from
 * within sections.
 */
static bool pages_correctly_probed(unsigned long start_pfn)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	unsigned long section_nr_end = section_nr + sections_per_block;
	unsigned long pfn = start_pfn;

	/*
	 * The memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We look up the page once per section
	 * and assume the memmap is contiguous within each section.
	 */
	for (; section_nr < section_nr_end; section_nr++) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;

		if (!present_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) not present\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		} else if (!valid_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) no valid memmap\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		} else if (online_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) is already online\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		}
		pfn += PAGES_PER_SECTION;
	}

	return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long start_section_nr, unsigned long action,
		    int online_type, int nid)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	int ret;

	start_pfn = section_nr_to_pfn(start_section_nr);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_probed(start_pfn))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages, online_type, nid);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state,
				  mem->online_type, mem->nid);

	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * If we are called from state_store(), online_type will be
	 * set >= 0. Otherwise we were called from the device online
	 * attribute and need to set the online_type.
	 */
	if (mem->online_type < 0)
		mem->online_type = MMOP_ONLINE_KEEP;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);

	/* clear online_type */
	mem->online_type = -1;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	/* Can't offline block with non-present sections */
	if (mem->section_count != sections_per_block)
		return -EINVAL;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret, online_type;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	if (sysfs_streq(buf, "online_kernel"))
		online_type = MMOP_ONLINE_KERNEL;
	else if (sysfs_streq(buf, "online_movable"))
		online_type = MMOP_ONLINE_MOVABLE;
	else if (sysfs_streq(buf, "online"))
		online_type = MMOP_ONLINE_KEEP;
	else if (sysfs_streq(buf, "offline"))
		online_type = MMOP_OFFLINE;
	else {
		ret = -EINVAL;
		goto err;
	}

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE_KEEP:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

err:
	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
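
/*
 * Example of driving the interface above from user space (block number 32
 * is arbitrary; which blocks exist depends on the system):
 *
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *	# cat /sys/devices/system/memory/memory32/state
 *	online
 *	# echo offline > /sys/devices/system/memory/memory32/state
 */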

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
		unsigned long nr_pages, int online_type,
		struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
	if (zone != default_zone) {
		strcat(buf, " ");
		strcat(buf, zone->name);
	}
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *default_zone;
	int nid;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes otherwise the page_zone is not reliable
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone cannot be
		 * offlined. This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sprintf(buf, "none\n");
		strcat(buf, default_zone->name);
		goto out;
	}

	nid = mem->nid;
	default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
	strcat(buf, default_zone->name);

	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
			default_zone);
	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
			default_zone);
out:
	strcat(buf, "\n");

	return strlen(buf);
}
static DEVICE_ATTR_RO(valid_zones);
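
/*
 * Example read of the attribute above (the block number and the reported
 * zones are illustrative; output depends on the system state):
 *
 *	# cat /sys/devices/system/memory/memory32/valid_zones
 *	Normal Movable
 */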
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);
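
/*
 * Example read (the value is hex without a 0x prefix; 8000000 would mean
 * 128 MiB blocks, illustrative only, as the size is system dependent):
 *
 *	# cat /sys/devices/system/memory/block_size_bytes
 *	8000000
 */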

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	if (memhp_auto_online)
		return sprintf(buf, "online\n");
	else
		return sprintf(buf, "offline\n");
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	if (sysfs_streq(buf, "online"))
		memhp_auto_online = true;
	else if (sysfs_streq(buf, "offline"))
		memhp_auto_online = false;
	else
		return -EINVAL;

	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
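
/*
 * Example: have newly added memory blocks onlined automatically (whether
 * this is already the default depends on the kernel configuration):
 *
 *	# echo online > /sys/devices/system/memory/auto_online_blocks
 */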
/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
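
/*
 * Example (CONFIG_ARCH_MEMORY_PROBE only): probe memory at physical
 * address 0x100000000; the address is illustrative and must be aligned
 * to the memory block size:
 *
 *	# echo 0x100000000 > /sys/devices/system/memory/probe
 */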
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
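
/*
 * Example: both attributes above take a physical address, which the
 * handlers shift down to a PFN. The address is illustrative:
 *
 *	# echo 0x7f0e4000 > /sys/devices/system/memory/soft_offline_page
 */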
#endif

/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
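
/*
 * Illustrative sketch (not part of this file): an architecture could
 * override the __weak default above to group blocks by physical unit,
 * e.g. with a hypothetical helper that maps a PFN to a board-level id:
 *
 *	int arch_get_memory_phys_device(unsigned long start_pfn)
 *	{
 *		return ex_board_id_of_pfn(start_pfn);
 *	}
 */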

/* A reference for the returned memory block device is acquired. */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
	return dev ? to_memory_block(dev) : NULL;
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	unsigned long block_id = base_memory_block_id(__section_nr(section));

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;
	ret = device_register(&memory->dev);
	if (ret)
		put_device(&memory->dev);

	return ret;
}

static int init_memory_block(struct memory_block **memory,
			     unsigned long block_id, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);
	mem->nid = NUMA_NO_NODE;

	ret = register_memory(mem);

	*memory = mem;
	return ret;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int ret, section_count = 0;
	struct memory_block *mem;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	ret = init_memory_block(&mem, base_memory_block_id(base_section_nr),
				MEM_ONLINE);
	if (ret)
		return ret;
	mem->section_count = section_count;
	return 0;
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
		if (ret)
			break;
		mem->section_count = sections_per_block;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			mem->section_count = 0;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		mem->section_count = 0;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
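
/*
 * Illustrative sketch (not part of this file): counting the online blocks
 * in a range with walk_memory_blocks(). The callback name is made up.
 *
 *	static int ex_count_online(struct memory_block *mem, void *arg)
 *	{
 *		unsigned int *count = arg;
 *
 *		if (mem->state == MEM_ONLINE)
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned int count = 0;
 *
 *	walk_memory_blocks(start, size, &count, ex_count_online);
 */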

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}