/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}
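
/*
 * A memory block covers sections_per_block consecutive memory
 * sections, so a block id is simply a scaled-down section number.
 * Illustrative numbers (not taken from this file): with 16MB sections
 * and a 128MB block size, sections_per_block is 8, so sections 0-7
 * form block 0 and section 42 falls in block 5.
 */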

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
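
/*
 * Sketch of a client (illustrative only; the foo_* names are made up):
 *
 *	static int foo_mem_callback(struct notifier_block *self,
 *				    unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		pr_info("action %lu at pfn %lx, %lu pages\n",
 *			action, mn->start_pfn, mn->nr_pages);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_mem_nb = {
 *		.notifier_call = foo_mem_callback,
 *	};
 *
 *	register_memory_notifier(&foo_mem_nb);
 *
 * Callbacks run on a blocking chain and may sleep; actions include
 * MEM_GOING_ONLINE, MEM_ONLINE, MEM_GOING_OFFLINE, MEM_OFFLINE and
 * the MEM_CANCEL_* variants.
 */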

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
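
/*
 * Unlike the hotplug chain above, the isolate chain is atomic:
 * memory_isolate_notify() can be called with a zone lock held, so
 * these callbacks must not sleep.
 */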

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int error;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;

	error = device_register(&memory->dev);
	return error;
}

static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->dev.kobj);
	device_unregister(&memory->dev);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
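
/*
 * The power-of-2 test above uses the usual trick: block_sz is a power
 * of two iff (block_sz & (block_sz - 1)) == 0, e.g. 0x8000000 &
 * 0x7ffffff == 0 while 0x6000000 & 0x5ffffff != 0.  A block_sz of
 * zero also passes the AND test but is caught by the minimum-size
 * comparison.
 */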

/*
 * Show the first physical section index that this memory block spans.
 */

static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the section of memory is likely to be hot-removable
 */
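/*
 * A block is reported removable only if every section in it is:
 * ret starts at 1 and each section's result is ANDed into it.
 */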
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
		case MEM_ONLINE:
			len = sprintf(buf, "online\n");
			break;
		case MEM_OFFLINE:
			len = sprintf(buf, "offline\n");
			break;
		case MEM_GOING_OFFLINE:
			len = sprintf(buf, "going-offline\n");
			break;
		default:
			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
					mem->state);
			WARN_ON(1);
			break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn,
					unsigned long nr_pages)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
	 * and assume memmap is contiguous within each section
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action)
{
	unsigned long start_pfn, start_paddr;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);

	switch (action) {
		case MEM_ONLINE:
			start_pfn = page_to_pfn(first_page);

			if (!pages_correctly_reserved(start_pfn, nr_pages))
				return -EBUSY;

			ret = online_pages(start_pfn, nr_pages);
			break;
		case MEM_OFFLINE:
			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
			ret = remove_memory(start_paddr,
					    nr_pages << PAGE_SHIFT);
			break;
		default:
			WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
			     "%ld\n", __func__, phys_index, action, action);
			ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state);

	if (ret) {
		mem->state = from_state_req;
		goto out;
	}

	mem->state = to_state;
	switch (mem->state) {
	case MEM_OFFLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
		break;
	case MEM_ONLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
		break;
	default:
		break;
	}
out:
	mutex_unlock(&mem->state_mutex);
	return ret;
}

static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, dev);

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if (!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);

	if (ret)
		return ret;
	return count;
}
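
/*
 * From userspace (illustrative paths, where X is a block id):
 *
 *	echo offline > /sys/devices/system/memory/memoryX/state
 *	echo online  > /sys/devices/system/memory/memoryX/state
 *
 * Only these two transitions are accepted; any other string, or a
 * request made from the wrong current state, fails with -EINVAL.
 */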

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
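
/*
 * These attributes become the per-block sysfs files
 * /sys/devices/system/memory/memoryX/{phys_index,end_phys_index,
 * state,phys_device,removable}, created from init_memory_block()
 * via mem_create_simple_file() below.
 */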

#define mem_create_simple_file(mem, attr_name)	\
	device_create_file(&mem->dev, &dev_attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	device_remove_file(&mem->dev, &dev_attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

static int block_size_init(void)
{
	return device_create_file(memory_subsys.dev_root,
				  &dev_attr_block_size_bytes);
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}
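
/*
 * Usage sketch (the address is illustrative and must be aligned to
 * the memory block size):
 *
 *	echo 0x40000000 > /sys/devices/system/memory/probe
 *
 * hot-adds one memory block starting at that physical address, one
 * section at a time.
 */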
static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);

static int memory_probe_init(void)
{
	return device_create_file(memory_subsys.dev_root, &dev_attr_probe);
}
#else
static inline int memory_probe_init(void)
{
	return 0;
}
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}
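
/*
 * The value written is a physical address; it is converted to a pfn
 * above.  Illustrative use, assuming 4K pages:
 *
 *	echo 0x1230000 > /sys/devices/system/memory/soft_offline_page
 *
 * soft-offlines the page at pfn 0x1230.  Soft offlining migrates the
 * page's contents away first, so no user of the page is killed.
 */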

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}
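
/*
 * Unlike the soft variant, this injects a real memory failure at the
 * given address and may kill processes mapping the page; it exists
 * for hwpoison testing.  Note there is no pfn_valid() check here;
 * __memory_failure() is expected to validate the pfn itself.
 */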

static DEVICE_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);

static __init int memory_fail_init(void)
{
	int err;

	err = device_create_file(memory_subsys.dev_root,
				&dev_attr_soft_offline_page);
	if (!err)
		err = device_create_file(memory_subsys.dev_root,
				&dev_attr_hard_offline_page);
	return err;
}
#else
static inline int memory_fail_init(void)
{
	return 0;
}
#endif

/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return container_of(dev, struct memory_block, dev);
}
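
/*
 * Callers that scan many sections can feed each result back in as the
 * hint, so the subsystem lookup resumes near the previous match.  A
 * minimal sketch (illustrative only):
 *
 *	struct memory_block *mem = NULL;
 *	unsigned long nr;
 *
 *	for (nr = 0; nr < NR_MEM_SECTIONS; nr++) {
 *		if (!present_section_nr(nr))
 *			continue;
 *		mem = find_memory_block_hinted(__nr_to_section(nr), mem);
 *	}
 *	if (mem)
 *		kobject_put(&mem->dev.kobj);
 */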

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem;
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
		kobject_put(&mem->dev.kobj);
	} else
		ret = init_memory_block(&mem, section, state);

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, end_phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem);
		kfree(mem);
	} else
		kobject_put(&mem->dev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG);
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	ret = subsys_system_register(&memory_subsys, NULL);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}