memory.c 13.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * drivers/base/memory.c - basic Memory class support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/sysdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
17
#include <linux/capability.h>
18
19
20
21
22
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
23
#include <linux/mutex.h>
24
25
#include <linux/stat.h>

26
27
28
29
30
31
#include <asm/atomic.h>
#include <asm/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static struct sysdev_class memory_sysdev_class = {
32
	.name = MEMORY_CLASS_NAME,
33
34
};

35
static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
36
37
38
39
{
	return MEMORY_CLASS_NAME;
}

40
/*
 * kset uevent callback.  Nothing is added to @env; the callback
 * unconditionally reports success (0).
 */
static int memory_uevent(struct kset *kset, struct kobject *obj,
			 struct kobj_uevent_env *env)
{
	return 0;
}

47
static const struct kset_uevent_ops memory_uevent_ops = {
48
49
	.name		= memory_uevent_name,
	.uevent		= memory_uevent,
50
51
};

52
/* Blocking notifier chain run by memory_notify(). */
static BLOCKING_NOTIFIER_HEAD(memory_chain);
53

54
int register_memory_notifier(struct notifier_block *nb)
55
{
56
        return blocking_notifier_chain_register(&memory_chain, nb);
57
}
58
EXPORT_SYMBOL(register_memory_notifier);
59

60
void unregister_memory_notifier(struct notifier_block *nb)
61
{
62
        blocking_notifier_chain_unregister(&memory_chain, nb);
63
}
64
EXPORT_SYMBOL(unregister_memory_notifier);
65

66
67
68
69
70
71
72
73
74
75
76
77
78
79
/* Atomic notifier chain run by memory_isolate_notify(). */
static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

/*
 * register_memory_isolate_notifier - add @nb to the isolate notifier chain
 *
 * Returns the result of atomic_notifier_chain_register().
 */
int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

/*
 * unregister_memory_isolate_notifier - remove @nb from the isolate chain
 *
 * The result of atomic_notifier_chain_unregister() is not reported.
 */
void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

80
81
82
/*
 * register_memory - Setup a sysfs device for a memory block
 */
83
84
static
int register_memory(struct memory_block *memory, struct mem_section *section)
85
86
87
88
89
90
91
92
93
94
95
{
	int error;

	memory->sysdev.cls = &memory_sysdev_class;
	memory->sysdev.id = __section_nr(section);

	error = sysdev_register(&memory->sysdev);
	return error;
}

static void
96
unregister_memory(struct memory_block *memory, struct mem_section *section)
97
98
99
100
{
	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
	BUG_ON(memory->sysdev.id != __section_nr(section));

101
102
	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->sysdev.kobj);
103
104
105
106
107
108
109
110
	sysdev_unregister(&memory->sysdev);
}

/*
 * use this as the physical section index that this memsection
 * uses.
 */

111
112
static ssize_t show_mem_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
113
114
115
116
117
118
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%08lx\n", mem->phys_index);
}

119
120
121
/*
 * Show whether the section of memory is likely to be hot-removable
 */
122
123
static ssize_t show_mem_removable(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
124
125
126
127
128
129
130
131
132
133
134
{
	unsigned long start_pfn;
	int ret;
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);

	start_pfn = section_nr_to_pfn(mem->phys_index);
	ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
	return sprintf(buf, "%d\n", ret);
}

135
136
137
/*
 * online, offline, going offline, etc.
 */
138
139
static ssize_t show_mem_state(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
		case MEM_ONLINE:
			len = sprintf(buf, "online\n");
			break;
		case MEM_OFFLINE:
			len = sprintf(buf, "offline\n");
			break;
		case MEM_GOING_OFFLINE:
			len = sprintf(buf, "going-offline\n");
			break;
		default:
			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
					mem->state);
			WARN_ON(1);
			break;
	}

	return len;
}

169
int memory_notify(unsigned long val, void *v)
170
{
171
	return blocking_notifier_call_chain(&memory_chain, val, v);
172
173
}

174
175
176
177
178
/*
 * memory_isolate_notify - run the memory isolate notifier chain
 *
 * Passes @val and @v to every notifier on memory_isolate_chain and
 * returns the result of atomic_notifier_call_chain().
 */
int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int i;
	unsigned long psection;
	unsigned long start_pfn, start_paddr;
	struct page *first_page;
	int ret;
	int old_state = mem->state;

	psection = mem->phys_index;
	first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);

	/*
	 * The probe routines leave the pages reserved, just
	 * as the bootmem code does.  Make sure they're still
	 * that way.
	 */
	if (action == MEM_ONLINE) {
		for (i = 0; i < PAGES_PER_SECTION; i++) {
			if (PageReserved(first_page+i))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online? \n",
				psection, i);
			return -EBUSY;
		}
	}

	switch (action) {
		case MEM_ONLINE:
			start_pfn = page_to_pfn(first_page);
			ret = online_pages(start_pfn, PAGES_PER_SECTION);
			break;
		case MEM_OFFLINE:
			mem->state = MEM_GOING_OFFLINE;
			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
			ret = remove_memory(start_paddr,
					    PAGES_PER_SECTION << PAGE_SHIFT);
			if (ret) {
				mem->state = old_state;
				break;
			}
			break;
		default:
Arjan van de Ven's avatar
Arjan van de Ven committed
229
			WARN(1, KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
230
					__func__, mem, action, action);
231
232
233
234
235
236
237
238
239
240
			ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;
241
	mutex_lock(&mem->state_mutex);
242
243
244
245
246
247
248
249
250
251
252

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	ret = memory_block_action(mem, to_state);
	if (!ret)
		mem->state = to_state;

out:
253
	mutex_unlock(&mem->state_mutex);
254
255
256
257
	return ret;
}

static ssize_t
258
259
store_mem_state(struct sys_device *dev,
		struct sysdev_attribute *attr, const char *buf, size_t count)
260
261
262
263
264
265
266
267
{
	struct memory_block *mem;
	unsigned int phys_section_nr;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, sysdev);
	phys_section_nr = mem->phys_index;

268
	if (!present_section_nr(phys_section_nr))
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
		goto out;

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if(!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
out:
	if (ret)
		return ret;
	return count;
}

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
290
291
static ssize_t show_phys_device(struct sys_device *dev,
				struct sysdev_attribute *attr, char *buf)
292
293
294
295
296
297
298
299
300
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

/*
 * Per-block sysdev attributes.  Only "state" has a store handler;
 * the other three are read-only.
 */
static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
302
303
304
305
306
307
308
309
310
311

/*
 * Create/remove the sysdev attribute file attr_<attr_name> on the
 * sysdev embedded in @mem.  @mem is parenthesized so that any pointer
 * expression, not only a plain identifier, expands correctly.
 */
#define mem_create_simple_file(mem, attr_name)	\
	sysdev_create_file(&(mem)->sysdev, &attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	sysdev_remove_file(&(mem)->sysdev, &attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
312
313
print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
		 char *buf)
314
{
315
	return sprintf(buf, "%#lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
316
317
}

318
/* Read-only class attribute backed by print_block_size(). */
static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
319
320
321

static int block_size_init(void)
{
322
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
323
				&attr_block_size_bytes.attr);
324
325
326
327
328
329
330
331
332
333
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
334
335
memory_probe_store(struct class *class, struct class_attribute *attr,
		   const char *buf, size_t count)
336
337
{
	u64 phys_addr;
338
	int nid;
339
340
341
342
	int ret;

	phys_addr = simple_strtoull(buf, NULL, 0);

343
344
	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
345
346
347
348
349
350

	if (ret)
		count = ret;

	return count;
}
351
/* Owner-writable, store-only class attribute backed by memory_probe_store(). */
static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
352
353
354

static int memory_probe_init(void)
{
355
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
356
				&class_attr_probe.attr);
357
358
}
#else
359
360
361
362
/* CONFIG_ARCH_MEMORY_PROBE is off: no probe file to create, report success. */
static inline int memory_probe_init(void)
{
	return 0;
}
363
364
#endif

365
366
367
368
369
370
371
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
372
373
374
store_soft_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
391
392
393
store_hard_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
394
395
396
397
398
399
400
401
402
403
404
405
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

406
407
/*
 * Both files are store-only: there is no show handler.  Expose them
 * as owner-writable only, like the "probe" attribute, instead of
 * advertising read permission that cannot be honoured.
 */
static CLASS_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static CLASS_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
408
409
410
411
412
413

static __init int memory_fail_init(void)
{
	int err;

	err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
414
				&class_attr_soft_offline_page.attr);
415
416
	if (!err)
		err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
417
				&class_attr_hard_offline_page.attr);
418
419
420
421
422
423
424
425
426
	return err;
}
#else
/* CONFIG_MEMORY_FAILURE is off: no offline files to create, report success. */
static inline int memory_fail_init(void)
{
	return 0;
}
#endif

427
428
429
430
431
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
432
433
434
435
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
436

437
static int add_memory_block(int nid, struct mem_section *section,
438
			unsigned long state, enum mem_add_context context)
439
{
440
	struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
441
	unsigned long start_pfn;
442
443
444
445
446
447
448
	int ret = 0;

	if (!mem)
		return -ENOMEM;

	mem->phys_index = __section_nr(section);
	mem->state = state;
449
	mutex_init(&mem->state_mutex);
450
451
	start_pfn = section_nr_to_pfn(mem->phys_index);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);
452

453
	ret = register_memory(mem, section);
454
455
456
457
458
459
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
460
461
	if (!ret)
		ret = mem_create_simple_file(mem, removable);
462
463
464
465
	if (!ret) {
		if (context == HOTPLUG)
			ret = register_mem_sect_under_node(mem, nid);
	}
466
467
468
469
470
471
472
473
474
475
476
477

	return ret;
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all sysdev classes.
 */
478
struct memory_block *find_memory_block(struct mem_section *section)
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
{
	struct kobject *kobj;
	struct sys_device *sysdev;
	struct memory_block *mem;
	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];

	/*
	 * This only works because we know that section == sysdev->id
	 * slightly redundant with sysdev_register()
	 */
	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));

	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
	if (!kobj)
		return NULL;

	sysdev = container_of(kobj, struct sys_device, kobj);
	mem = container_of(sysdev, struct memory_block, sysdev);

	return mem;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mem = find_memory_block(section);
507
	unregister_mem_sect_under_nodes(mem);
508
509
510
	mem_remove_simple_file(mem, phys_index);
	mem_remove_simple_file(mem, state);
	mem_remove_simple_file(mem, phys_device);
511
	mem_remove_simple_file(mem, removable);
512
	unregister_memory(mem, section);
513
514
515
516
517
518
519
520

	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
521
int register_new_memory(int nid, struct mem_section *section)
522
{
523
	return add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG);
524
525
526
527
}

int unregister_memory_section(struct mem_section *section)
{
528
	if (!present_section(section))
529
530
531
532
533
534
535
536
537
538
539
540
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}

/*
 * Initialize the sysfs support for memory devices...
 *
 * Registers the memory sysdev class, adds a block for every present
 * section and creates the class-level files.  If the class cannot be
 * registered nothing else is attempted; after that every step runs
 * even if an earlier one failed, and the first error is the one
 * reported.
 *
 * Returns 0 on success or the first error encountered, which is
 * also logged.
 */
int __init memory_dev_init(void)
{
	unsigned int nr;
	int result;
	int err;

	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
	result = sysdev_class_register(&memory_sysdev_class);
	if (result)
		goto out;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr < NR_MEM_SECTIONS; nr++) {
		if (!present_section_nr(nr))
			continue;

		err = add_memory_block(0, __nr_to_section(nr), MEM_ONLINE,
				       BOOT);
		if (!result)
			result = err;
	}

	err = memory_probe_init();
	if (!result)
		result = err;

	err = memory_fail_init();
	if (!result)
		result = err;

	err = block_size_init();
	if (!result)
		result = err;
out:
	if (result)
		printk(KERN_ERR "%s() failed: %d\n", __func__, result);

	return result;
}