memory.c 13.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * drivers/base/memory.c - basic Memory class support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/sysdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/string.h>

#include <asm/atomic.h>
#include <asm/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static struct sysdev_class memory_sysdev_class = {
33
	.name = MEMORY_CLASS_NAME,
34
35
};

36
static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
37
38
39
40
{
	return MEMORY_CLASS_NAME;
}

41
/*
 * kset "uevent" hook.  Nothing is added to @env; the hook always
 * returns 0.
 */
static int memory_uevent(struct kset *kset, struct kobject *obj, struct kobj_uevent_env *env)
{
	return 0;
}

48
static const struct kset_uevent_ops memory_uevent_ops = {
49
50
	.name		= memory_uevent_name,
	.uevent		= memory_uevent,
51
52
};

53
static BLOCKING_NOTIFIER_HEAD(memory_chain);
54

55
int register_memory_notifier(struct notifier_block *nb)
56
{
57
        return blocking_notifier_chain_register(&memory_chain, nb);
58
}
59
EXPORT_SYMBOL(register_memory_notifier);
60

61
void unregister_memory_notifier(struct notifier_block *nb)
62
{
63
        blocking_notifier_chain_unregister(&memory_chain, nb);
64
}
65
EXPORT_SYMBOL(unregister_memory_notifier);
66

67
68
69
70
71
72
73
74
75
76
77
78
79
80
/* Atomic notifier chain that memory_isolate_notify() runs. */
static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

/*
 * register_memory_isolate_notifier - add @nb to the isolate chain
 *
 * Returns the result of atomic_notifier_chain_register().
 */
int register_memory_isolate_notifier(struct notifier_block *nb)
{
	int rc = atomic_notifier_chain_register(&memory_isolate_chain, nb);

	return rc;
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

/*
 * unregister_memory_isolate_notifier - take @nb off the isolate chain
 *
 * The result of atomic_notifier_chain_unregister() is not reported.
 */
void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

81
82
83
/*
 * register_memory - Setup a sysfs device for a memory block
 */
84
85
static
int register_memory(struct memory_block *memory, struct mem_section *section)
86
87
88
89
90
91
92
93
94
95
96
{
	int error;

	memory->sysdev.cls = &memory_sysdev_class;
	memory->sysdev.id = __section_nr(section);

	error = sysdev_register(&memory->sysdev);
	return error;
}

static void
97
unregister_memory(struct memory_block *memory, struct mem_section *section)
98
99
100
101
{
	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
	BUG_ON(memory->sysdev.id != __section_nr(section));

102
103
	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->sysdev.kobj);
104
105
106
107
108
109
110
111
	sysdev_unregister(&memory->sysdev);
}

/*
 * use this as the physical section index that this memsection
 * uses.
 */

112
113
static ssize_t show_mem_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
114
115
116
117
118
119
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%08lx\n", mem->phys_index);
}

120
121
122
/*
 * Show whether the section of memory is likely to be hot-removable
 */
123
124
static ssize_t show_mem_removable(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
125
126
127
128
129
130
131
132
133
134
135
{
	unsigned long start_pfn;
	int ret;
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);

	start_pfn = section_nr_to_pfn(mem->phys_index);
	ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
	return sprintf(buf, "%d\n", ret);
}

136
137
138
/*
 * online, offline, going offline, etc.
 */
139
140
static ssize_t show_mem_state(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
		case MEM_ONLINE:
			len = sprintf(buf, "online\n");
			break;
		case MEM_OFFLINE:
			len = sprintf(buf, "offline\n");
			break;
		case MEM_GOING_OFFLINE:
			len = sprintf(buf, "going-offline\n");
			break;
		default:
			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
					mem->state);
			WARN_ON(1);
			break;
	}

	return len;
}

170
int memory_notify(unsigned long val, void *v)
171
{
172
	return blocking_notifier_call_chain(&memory_chain, val, v);
173
174
}

175
176
177
178
179
/*
 * memory_isolate_notify - run the memory isolate notifier chain
 * @val: event value handed to every notifier
 * @v:   opaque payload handed to every notifier
 *
 * Returns the result of atomic_notifier_call_chain().
 */
int memory_isolate_notify(unsigned long val, void *v)
{
	int rc = atomic_notifier_call_chain(&memory_isolate_chain, val, v);

	return rc;
}

180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int i;
	unsigned long psection;
	unsigned long start_pfn, start_paddr;
	struct page *first_page;
	int ret;
	int old_state = mem->state;

	psection = mem->phys_index;
	first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);

	/*
	 * The probe routines leave the pages reserved, just
	 * as the bootmem code does.  Make sure they're still
	 * that way.
	 */
	if (action == MEM_ONLINE) {
		for (i = 0; i < PAGES_PER_SECTION; i++) {
			if (PageReserved(first_page+i))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online? \n",
				psection, i);
			return -EBUSY;
		}
	}

	switch (action) {
		case MEM_ONLINE:
			start_pfn = page_to_pfn(first_page);
			ret = online_pages(start_pfn, PAGES_PER_SECTION);
			break;
		case MEM_OFFLINE:
			mem->state = MEM_GOING_OFFLINE;
			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
			ret = remove_memory(start_paddr,
					    PAGES_PER_SECTION << PAGE_SHIFT);
			if (ret) {
				mem->state = old_state;
				break;
			}
			break;
		default:
Arjan van de Ven's avatar
Arjan van de Ven committed
230
			WARN(1, KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
231
					__func__, mem, action, action);
232
233
234
235
236
237
238
239
240
241
			ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;
242
	mutex_lock(&mem->state_mutex);
243
244
245
246
247
248
249
250
251
252
253

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	ret = memory_block_action(mem, to_state);
	if (!ret)
		mem->state = to_state;

out:
254
	mutex_unlock(&mem->state_mutex);
255
256
257
258
	return ret;
}

static ssize_t
259
260
store_mem_state(struct sys_device *dev,
		struct sysdev_attribute *attr, const char *buf, size_t count)
261
262
263
264
265
266
267
268
{
	struct memory_block *mem;
	unsigned int phys_section_nr;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, sysdev);
	phys_section_nr = mem->phys_index;

269
	if (!present_section_nr(phys_section_nr))
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
		goto out;

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if(!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
out:
	if (ret)
		return ret;
	return count;
}

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
291
292
static ssize_t show_phys_device(struct sys_device *dev,
				struct sysdev_attribute *attr, char *buf)
293
294
295
296
297
298
299
300
301
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

/* Per-block sysdev attributes: three read-only files ... */
static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
/* ... and "state", which has both a show and a store handler. */
static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
303
304
305
306
307
308
309
310
311
312

/*
 * Create / remove the attr_<attr_name> sysdev attribute file on the
 * sysdev embedded in @mem.  @mem is parenthesised so that any pointer
 * expression can be passed, not only a plain identifier.
 */
#define mem_create_simple_file(mem, attr_name)	\
	sysdev_create_file(&(mem)->sysdev, &attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	sysdev_remove_file(&(mem)->sysdev, &attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
313
314
print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
		 char *buf)
315
{
316
	return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
317
318
}

319
static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
320
321
322

static int block_size_init(void)
{
323
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
324
				&attr_block_size_bytes.attr);
325
326
327
328
329
330
331
332
333
334
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
335
336
memory_probe_store(struct class *class, struct class_attribute *attr,
		   const char *buf, size_t count)
337
338
{
	u64 phys_addr;
339
	int nid;
340
341
342
343
	int ret;

	phys_addr = simple_strtoull(buf, NULL, 0);

344
345
	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
346
347
348
349
350
351

	if (ret)
		count = ret;

	return count;
}
352
static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
353
354
355

static int memory_probe_init(void)
{
356
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
357
				&class_attr_probe.attr);
358
359
}
#else
360
361
362
363
/* No probe interface without CONFIG_ARCH_MEMORY_PROBE: nothing to set up. */
static inline int memory_probe_init(void)
{
	return 0;
}
364
365
#endif

366
367
368
369
370
371
372
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
373
374
375
store_soft_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
392
393
394
store_hard_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
395
396
397
398
399
400
401
402
403
404
405
406
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

407
408
/*
 * Both files have a store handler only (show is NULL), so they are
 * created write-only for the owner (0200) instead of advertising a
 * read permission that has no handler behind it.
 */
static CLASS_ATTR(soft_offline_page, 0200, NULL, store_soft_offline_page);
static CLASS_ATTR(hard_offline_page, 0200, NULL, store_hard_offline_page);
409
410
411
412
413
414

static __init int memory_fail_init(void)
{
	int err;

	err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
415
				&class_attr_soft_offline_page.attr);
416
417
	if (!err)
		err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
418
				&class_attr_hard_offline_page.attr);
419
420
421
422
423
424
425
426
427
	return err;
}
#else
/* No offline-page files without CONFIG_MEMORY_FAILURE: nothing to set up. */
static inline int memory_fail_init(void)
{
	return 0;
}
#endif

428
429
430
431
432
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
433
434
435
436
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
437

438
static int add_memory_block(int nid, struct mem_section *section,
439
			unsigned long state, enum mem_add_context context)
440
{
441
	struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
442
	unsigned long start_pfn;
443
444
445
446
447
448
449
	int ret = 0;

	if (!mem)
		return -ENOMEM;

	mem->phys_index = __section_nr(section);
	mem->state = state;
450
	mutex_init(&mem->state_mutex);
451
452
	start_pfn = section_nr_to_pfn(mem->phys_index);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);
453

454
	ret = register_memory(mem, section);
455
456
457
458
459
460
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
461
462
	if (!ret)
		ret = mem_create_simple_file(mem, removable);
463
464
465
466
	if (!ret) {
		if (context == HOTPLUG)
			ret = register_mem_sect_under_node(mem, nid);
	}
467
468
469
470
471
472
473
474
475
476
477
478

	return ret;
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all sysdev classes.
 */
479
struct memory_block *find_memory_block(struct mem_section *section)
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
{
	struct kobject *kobj;
	struct sys_device *sysdev;
	struct memory_block *mem;
	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];

	/*
	 * This only works because we know that section == sysdev->id
	 * slightly redundant with sysdev_register()
	 */
	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));

	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
	if (!kobj)
		return NULL;

	sysdev = container_of(kobj, struct sys_device, kobj);
	mem = container_of(sysdev, struct memory_block, sysdev);

	return mem;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mem = find_memory_block(section);
508
	unregister_mem_sect_under_nodes(mem);
509
510
511
	mem_remove_simple_file(mem, phys_index);
	mem_remove_simple_file(mem, state);
	mem_remove_simple_file(mem, phys_device);
512
	mem_remove_simple_file(mem, removable);
513
	unregister_memory(mem, section);
514
515
516
517
518
519
520
521

	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
522
int register_new_memory(int nid, struct mem_section *section)
523
{
524
	return add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG);
525
526
527
528
}

int unregister_memory_section(struct mem_section *section)
{
529
	if (!present_section(section))
530
531
532
533
534
535
536
537
538
539
540
541
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}

/*
 * Initialize the sysfs support for memory devices...
 *
 * Registers the memory sysdev class, adds a block (MEM_ONLINE, BOOT
 * context) for every present section, then sets up the probe,
 * offline-page and block-size files.  If class registration fails
 * nothing else is attempted.  After that every step is run even when
 * an earlier one failed; the first error seen is the one returned
 * (and logged).
 */
int __init memory_dev_init(void)
{
	unsigned int nr;
	int ret;
	int err;

	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
	ret = sysdev_class_register(&memory_sysdev_class);
	if (ret)
		goto out;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr < NR_MEM_SECTIONS; nr++) {
		if (present_section_nr(nr)) {
			err = add_memory_block(0, __nr_to_section(nr),
					       MEM_ONLINE, BOOT);
			if (err && !ret)
				ret = err;
		}
	}

	err = memory_probe_init();
	if (err && !ret)
		ret = err;

	err = memory_fail_init();
	if (err && !ret)
		ret = err;

	err = block_size_init();
	if (err && !ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);

	return ret;
}