memory.c 16.1 KB
Newer Older
1
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
16
#include <linux/capability.h>
17
18
19
20
21
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
22
#include <linux/mutex.h>
23
#include <linux/stat.h>
24
#include <linux/slab.h>
25

Arun Sharma's avatar
Arun Sharma committed
26
#include <linux/atomic.h>
27
28
#include <asm/uaccess.h>

29
30
/*
 * Held by add_memory_section() and remove_memory_block() while they
 * look up, create or tear down a memory block device.
 */
static DEFINE_MUTEX(mem_sysfs_mutex);

31
/* Used as both .name and .dev_name of memory_subsys. */
#define MEMORY_CLASS_NAME	"memory"
32
33
34
35
36
37
38

/* Number of memory sections that make up one memory block. */
static int sections_per_block;

/*
 * Translate a section number into the id of the memory block that
 * contains it (integer division by sections_per_block).
 */
static inline int base_memory_block_id(int section_nr)
{
	int block_id = section_nr / sections_per_block;

	return block_id;
}
39

40
static struct bus_type memory_subsys = {
41
	.name = MEMORY_CLASS_NAME,
42
	.dev_name = MEMORY_CLASS_NAME,
43
44
};

45
static BLOCKING_NOTIFIER_HEAD(memory_chain);
46

47
int register_memory_notifier(struct notifier_block *nb)
48
{
49
        return blocking_notifier_chain_register(&memory_chain, nb);
50
}
51
EXPORT_SYMBOL(register_memory_notifier);
52

53
void unregister_memory_notifier(struct notifier_block *nb)
54
{
55
        blocking_notifier_chain_unregister(&memory_chain, nb);
56
}
57
EXPORT_SYMBOL(unregister_memory_notifier);
58

59
60
61
62
63
64
65
66
67
68
69
70
71
72
/* Notifier chain that memory_isolate_notify() walks. */
static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

/*
 * register_memory_isolate_notifier - add @nb to the isolate notifier chain
 *
 * Returns the result of atomic_notifier_chain_register().
 */
int register_memory_isolate_notifier(struct notifier_block *nb)
{
	int rc = atomic_notifier_chain_register(&memory_isolate_chain, nb);

	return rc;
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

/*
 * unregister_memory_isolate_notifier - remove @nb from the isolate
 * notifier chain
 */
void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	struct atomic_notifier_head *chain = &memory_isolate_chain;

	atomic_notifier_chain_unregister(chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

73
74
75
/*
 * register_memory - Setup a sysfs device for a memory block
 */
76
static
77
int register_memory(struct memory_block *memory)
78
79
80
{
	int error;

81
82
	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
83

84
	error = device_register(&memory->dev);
85
86
87
88
	return error;
}

static void
89
unregister_memory(struct memory_block *memory)
90
{
91
	BUG_ON(memory->dev.bus != &memory_subsys);
92

93
	/* drop the ref. we got in remove_memory_block() */
94
95
	kobject_put(&memory->dev.kobj);
	device_unregister(&memory->dev);
96
97
}

98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/*
 * Default memory block size: MIN_MEMORY_BLOCK_SIZE.
 *
 * Declared __weak, so a non-weak definition elsewhere takes precedence
 * at link time. The value is validated by get_memory_block_size().
 */
unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate blk_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}

118
119
120
121
122
/*
 * use this as the physical section index that this memsection
 * uses.
 */

123
124
static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
125
126
{
	struct memory_block *mem =
127
		container_of(dev, struct memory_block, dev);
128
129
130
131
132
133
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

134
135
static ssize_t show_mem_end_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
136
137
{
	struct memory_block *mem =
138
		container_of(dev, struct memory_block, dev);
139
140
141
142
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
143
144
}

145
146
147
/*
 * Show whether the section of memory is likely to be hot-removable
 */
148
149
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
150
{
151
152
	unsigned long i, pfn;
	int ret = 1;
153
	struct memory_block *mem =
154
		container_of(dev, struct memory_block, dev);
155

156
	for (i = 0; i < sections_per_block; i++) {
157
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
158
159
160
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

161
162
163
	return sprintf(buf, "%d\n", ret);
}

164
165
166
/*
 * online, offline, going offline, etc.
 */
167
168
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
169
170
{
	struct memory_block *mem =
171
		container_of(dev, struct memory_block, dev);
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
		case MEM_ONLINE:
			len = sprintf(buf, "online\n");
			break;
		case MEM_OFFLINE:
			len = sprintf(buf, "offline\n");
			break;
		case MEM_GOING_OFFLINE:
			len = sprintf(buf, "going-offline\n");
			break;
		default:
			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
					mem->state);
			WARN_ON(1);
			break;
	}

	return len;
}

198
int memory_notify(unsigned long val, void *v)
199
{
200
	return blocking_notifier_call_chain(&memory_chain, val, v);
201
202
}

203
204
205
206
207
/*
 * memory_isolate_notify - run the isolate notifier chain with event @val
 * and payload @v; returns the result of atomic_notifier_call_chain().
 */
int memory_isolate_notify(unsigned long val, void *v)
{
	int rc = atomic_notifier_call_chain(&memory_isolate_chain, val, v);

	return rc;
}

208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
/*
 * The probe routines leave the pages reserved, just as the bootmem code
 * does. Check that every page of the block starting at @start_pfn still
 * has PageReserved set.
 *
 * Returns true when all pages are reserved; false (with a warning) at
 * the first invalid section start or non-reserved page. @nr_pages is not
 * consulted: the walk always covers sections_per_block sections.
 */
static bool pages_correctly_reserved(unsigned long start_pfn,
					unsigned long nr_pages)
{
	unsigned long pfn = start_pfn;
	struct page *page;
	int sec, off;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP, so the struct page is looked up once per
	 * section and assumed contiguous within that section.
	 */
	for (sec = 0; sec < sections_per_block;
	     sec++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (off = 0; off < PAGES_PER_SECTION; off++) {
			if (!PageReserved(page + off)) {
				printk(KERN_WARNING "section number %ld page number %d "
					"not reserved, was it already online?\n",
					pfn_to_section_nr(pfn), off);
				return false;
			}
		}
	}

	return true;
}

244
245
246
247
248
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
249
memory_block_action(unsigned long phys_index, unsigned long action)
250
251
{
	unsigned long start_pfn, start_paddr;
252
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
253
	struct page *first_page;
254
255
	int ret;

256
257
	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);

258
259
	switch (action) {
		case MEM_ONLINE:
260
			start_pfn = page_to_pfn(first_page);
261
262
263
264

			if (!pages_correctly_reserved(start_pfn, nr_pages))
				return -EBUSY;

265
			ret = online_pages(start_pfn, nr_pages);
266
267
			break;
		case MEM_OFFLINE:
268
			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
269
			ret = remove_memory(start_paddr,
270
					    nr_pages << PAGE_SHIFT);
271
272
			break;
		default:
273
274
			WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
			     "%ld\n", __func__, phys_index, action, action);
275
276
277
278
279
280
281
282
283
			ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
284
	int ret = 0;
285

286
	mutex_lock(&mem->state_mutex);
287
288
289
290
291
292

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

293
294
295
	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

296
	ret = memory_block_action(mem->start_section_nr, to_state);
297

298
	if (ret)
299
		mem->state = from_state_req;
300
	else
301
302
303
		mem->state = to_state;

out:
304
	mutex_unlock(&mem->state_mutex);
305
306
307
308
	return ret;
}

static ssize_t
309
310
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
311
312
313
314
{
	struct memory_block *mem;
	int ret = -EINVAL;

315
	mem = container_of(dev, struct memory_block, dev);
316
317
318
319
320

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if(!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
321

322
323
324
325
326
327
328
329
330
331
332
333
334
335
	if (ret)
		return ret;
	return count;
}

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
336
337
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
338
339
{
	struct memory_block *mem =
340
		container_of(dev, struct memory_block, dev);
341
342
343
	return sprintf(buf, "%d\n", mem->phys_device);
}

344
345
346
347
348
/*
 * Per-memory-block attributes. All are read-only (0444) except "state",
 * which is also writable by the owner (0644) through store_mem_state().
 */
static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
349
350

/*
 * Create / remove the sysfs file for dev_attr_<attr_name> on the device
 * embedded in @mem. @mem is parenthesized so that any pointer expression
 * (not just a plain identifier) can be passed safely.
 */
#define mem_create_simple_file(mem, attr_name)	\
	device_create_file(&(mem)->dev, &dev_attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	device_remove_file(&(mem)->dev, &dev_attr_##attr_name)
354
355
356
357
358

/*
 * Block size attribute stuff
 */
static ssize_t
359
print_block_size(struct device *dev, struct device_attribute *attr,
360
		 char *buf)
361
{
362
	return sprintf(buf, "%lx\n", get_memory_block_size());
363
364
}

365
static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
366
367
368

static int block_size_init(void)
{
369
370
	return device_create_file(memory_subsys.dev_root,
				  &dev_attr_block_size_bytes);
371
372
373
374
375
376
377
378
379
380
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
381
memory_probe_store(struct device *dev, struct device_attribute *attr,
382
		   const char *buf, size_t count)
383
384
{
	u64 phys_addr;
385
	int nid;
386
	int i, ret;
387
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
388
389
390

	phys_addr = simple_strtoull(buf, NULL, 0);

391
392
393
	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

394
395
396
397
398
	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
399
			goto out;
400
401
402

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}
403

404
405
406
	ret = count;
out:
	return ret;
407
}
408
static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
409
410
411

static int memory_probe_init(void)
{
412
	return device_create_file(memory_subsys.dev_root, &dev_attr_probe);
413
414
}
#else
415
416
417
418
/*
 * Stub for builds without CONFIG_ARCH_MEMORY_PROBE: no "probe" file is
 * created and success is reported.
 */
static inline int memory_probe_init(void)
{
	return 0;
}
419
420
#endif

421
422
423
424
425
426
427
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
428
429
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
430
			const char *buf, size_t count)
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
447
448
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
449
			const char *buf, size_t count)
450
451
452
453
454
455
456
457
458
459
460
461
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

462
463
/*
 * Both files are write-only: they have no show method, so they must not
 * advertise read permission.
 */
static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
464
465
466
467
468

static __init int memory_fail_init(void)
{
	int err;

469
470
	err = device_create_file(memory_subsys.dev_root,
				&dev_attr_soft_offline_page);
471
	if (!err)
472
473
		err = device_create_file(memory_subsys.dev_root,
				&dev_attr_hard_offline_page);
474
475
476
477
478
479
480
481
482
	return err;
}
#else
/*
 * Stub for builds without CONFIG_MEMORY_FAILURE: no page-offline files
 * are created and success is reported.
 */
static inline int memory_fail_init(void)
{
	return 0;
}
#endif

483
484
485
486
487
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
488
489
490
491
/*
 * Default implementation: reports physical device 0 regardless of
 * @start_pfn. Declared __weak, so a non-weak definition elsewhere takes
 * precedence at link time.
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
492

493
494
495
496
/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
497
498
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
499
{
500
	int block_id = base_memory_block_id(__section_nr(section));
501
502
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;
503

504
505
506
507
	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
508
		return NULL;
509
	return container_of(dev, struct memory_block, dev);
510
511
}

512
513
514
515
516
517
/*
 * For now, finding the memory_block that corresponds to a particular
 * phys_index is a linear search. If this gets to be a real problem, a
 * radix tree or something similar can be used here.
 *
 * This could be made generic for all device subsystems.
 *
 * Equivalent to find_memory_block_hinted() without a hint.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	struct memory_block *mem = find_memory_block_hinted(section, NULL);

	return mem;
}

525
526
static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
527
{
528
	struct memory_block *mem;
529
	unsigned long start_pfn;
530
	int scn_nr;
531
532
	int ret = 0;

533
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
534
535
536
	if (!mem)
		return -ENOMEM;

537
	scn_nr = __section_nr(section);
538
539
540
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
541
	mem->state = state;
542
	mem->section_count++;
543
	mutex_init(&mem->state_mutex);
544
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
545
546
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

547
	ret = register_memory(mem);
548
549
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
550
551
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
552
553
554
555
556
557
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem;
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
574
		kobject_put(&mem->dev.kobj);
575
576
577
	} else
		ret = init_memory_block(&mem, section, state);

578
	if (!ret) {
579
580
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
581
582
583
			ret = register_mem_sect_under_node(mem, nid);
	}

584
	mutex_unlock(&mem_sysfs_mutex);
585
586
587
	return ret;
}

588
589
590
591
592
int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

593
	mutex_lock(&mem_sysfs_mutex);
594
	mem = find_memory_block(section);
595
	unregister_mem_sect_under_nodes(mem, __section_nr(section));
596
597
598
599

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
600
		mem_remove_simple_file(mem, end_phys_index);
601
602
603
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
604
605
606
		unregister_memory(mem);
		kfree(mem);
	} else
607
		kobject_put(&mem->dev.kobj);
608

609
	mutex_unlock(&mem_sysfs_mutex);
610
611
612
613
614
615
616
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
617
int register_new_memory(int nid, struct mem_section *section)
618
{
619
	return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG);
620
621
622
623
}

int unregister_memory_section(struct mem_section *section)
{
624
	if (!present_section(section))
625
626
627
628
629
630
631
632
633
634
635
636
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}

/*
 * Initialize the sysfs support for memory devices.
 *
 * Registers the memory subsystem, computes sections_per_block, creates a
 * block entry for every section present at boot, then creates the probe,
 * page-offline and block-size files. Every step after subsystem
 * registration is attempted even if an earlier one failed; the first
 * error seen is the one returned (and logged).
 */
int __init memory_dev_init(void)
{
	unsigned long block_sz;
	unsigned int nr;
	int ret, err;

	ret = subsys_system_register(&memory_subsys, NULL);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr < NR_MEM_SECTIONS; nr++) {
		if (present_section_nr(nr)) {
			err = add_memory_section(0, __nr_to_section(nr),
						 MEM_ONLINE, BOOT);
			if (!ret)
				ret = err;
		}
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;

	err = memory_fail_init();
	if (!ret)
		ret = err;

	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}