memory.c 14.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * drivers/base/memory.c - basic Memory class support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/sysdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
17
#include <linux/capability.h>
18
19
20
21
22
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
23
#include <linux/mutex.h>
24
#include <linux/stat.h>
25
#include <linux/slab.h>
26

27
28
29
#include <asm/atomic.h>
#include <asm/uaccess.h>

30
31
/*
 * Held by add_memory_block() and remove_memory_block() around the
 * creation and removal of a memory block's sysfs entries.
 */
static DEFINE_MUTEX(mem_sysfs_mutex);

32
33
34
#define MEMORY_CLASS_NAME	"memory"

static struct sysdev_class memory_sysdev_class = {
35
	.name = MEMORY_CLASS_NAME,
36
37
};

38
static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
39
40
41
42
{
	return MEMORY_CLASS_NAME;
}

43
/*
 * memory_uevent - uevent callback for the memory kset
 *
 * Leaves @env untouched and always returns 0.
 */
static int memory_uevent(struct kset *kset, struct kobject *obj,
			 struct kobj_uevent_env *env)
{
	return 0;
}

50
static const struct kset_uevent_ops memory_uevent_ops = {
51
52
	.name		= memory_uevent_name,
	.uevent		= memory_uevent,
53
54
};

55
/* Blocking notifier chain that memory_notify() calls. */
static BLOCKING_NOTIFIER_HEAD(memory_chain);
56

57
int register_memory_notifier(struct notifier_block *nb)
58
{
59
        return blocking_notifier_chain_register(&memory_chain, nb);
60
}
61
EXPORT_SYMBOL(register_memory_notifier);
62

63
void unregister_memory_notifier(struct notifier_block *nb)
64
{
65
        blocking_notifier_chain_unregister(&memory_chain, nb);
66
}
67
EXPORT_SYMBOL(unregister_memory_notifier);
68

69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* Atomic notifier chain that memory_isolate_notify() calls. */
static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

/*
 * register_memory_isolate_notifier - add @nb to the isolate notifier chain
 *
 * Returns the result of atomic_notifier_chain_register().
 */
int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

/*
 * unregister_memory_isolate_notifier - remove @nb from the isolate
 * notifier chain
 *
 * The result of atomic_notifier_chain_unregister() is discarded.
 */
void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

83
84
85
/*
 * register_memory - Setup a sysfs device for a memory block
 */
86
87
static
int register_memory(struct memory_block *memory, struct mem_section *section)
88
89
90
91
92
93
94
95
96
97
98
{
	int error;

	memory->sysdev.cls = &memory_sysdev_class;
	memory->sysdev.id = __section_nr(section);

	error = sysdev_register(&memory->sysdev);
	return error;
}

static void
99
unregister_memory(struct memory_block *memory, struct mem_section *section)
100
101
102
103
{
	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
	BUG_ON(memory->sysdev.id != __section_nr(section));

104
105
	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->sysdev.kobj);
106
107
108
109
110
111
112
113
	sysdev_unregister(&memory->sysdev);
}

/*
 * use this as the physical section index that this memsection
 * uses.
 */

114
115
static ssize_t show_mem_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
116
117
118
119
120
121
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%08lx\n", mem->phys_index);
}

122
123
124
/*
 * Show whether the section of memory is likely to be hot-removable
 */
125
126
static ssize_t show_mem_removable(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
127
128
129
130
131
132
133
134
135
136
137
{
	unsigned long start_pfn;
	int ret;
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);

	start_pfn = section_nr_to_pfn(mem->phys_index);
	ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
	return sprintf(buf, "%d\n", ret);
}

138
139
140
/*
 * online, offline, going offline, etc.
 */
141
142
static ssize_t show_mem_state(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
		case MEM_ONLINE:
			len = sprintf(buf, "online\n");
			break;
		case MEM_OFFLINE:
			len = sprintf(buf, "offline\n");
			break;
		case MEM_GOING_OFFLINE:
			len = sprintf(buf, "going-offline\n");
			break;
		default:
			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
					mem->state);
			WARN_ON(1);
			break;
	}

	return len;
}

172
int memory_notify(unsigned long val, void *v)
173
{
174
	return blocking_notifier_call_chain(&memory_chain, val, v);
175
176
}

177
178
179
180
181
/*
 * memory_isolate_notify - run the memory isolate notifier chain
 * @val: event value handed to each notifier
 * @v:   opaque data handed to each notifier
 *
 * Returns the result of atomic_notifier_call_chain().
 */
int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int i;
	unsigned long psection;
	unsigned long start_pfn, start_paddr;
	struct page *first_page;
	int ret;
	int old_state = mem->state;

	psection = mem->phys_index;
	first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);

	/*
	 * The probe routines leave the pages reserved, just
	 * as the bootmem code does.  Make sure they're still
	 * that way.
	 */
	if (action == MEM_ONLINE) {
		for (i = 0; i < PAGES_PER_SECTION; i++) {
			if (PageReserved(first_page+i))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online? \n",
				psection, i);
			return -EBUSY;
		}
	}

	switch (action) {
		case MEM_ONLINE:
			start_pfn = page_to_pfn(first_page);
			ret = online_pages(start_pfn, PAGES_PER_SECTION);
			break;
		case MEM_OFFLINE:
			mem->state = MEM_GOING_OFFLINE;
			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
			ret = remove_memory(start_paddr,
					    PAGES_PER_SECTION << PAGE_SHIFT);
			if (ret) {
				mem->state = old_state;
				break;
			}
			break;
		default:
Arjan van de Ven's avatar
Arjan van de Ven committed
232
			WARN(1, KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
233
					__func__, mem, action, action);
234
235
236
237
238
239
240
241
242
243
			ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;
244
	mutex_lock(&mem->state_mutex);
245
246
247
248
249
250
251
252
253
254
255

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	ret = memory_block_action(mem, to_state);
	if (!ret)
		mem->state = to_state;

out:
256
	mutex_unlock(&mem->state_mutex);
257
258
259
260
	return ret;
}

static ssize_t
261
262
store_mem_state(struct sys_device *dev,
		struct sysdev_attribute *attr, const char *buf, size_t count)
263
264
265
266
267
268
269
270
{
	struct memory_block *mem;
	unsigned int phys_section_nr;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, sysdev);
	phys_section_nr = mem->phys_index;

271
	if (!present_section_nr(phys_section_nr))
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
		goto out;

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if(!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
out:
	if (ret)
		return ret;
	return count;
}

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
293
294
static ssize_t show_phys_device(struct sys_device *dev,
				struct sysdev_attribute *attr, char *buf)
295
296
297
298
299
300
301
302
303
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

/*
 * Per-block sysdev attributes.  add_memory_block() creates one file for
 * each and remove_memory_block() removes them again.  Only "state" has a
 * store handler.
 */
static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
305
306
307
308
309
310
311
312
313
314

/*
 * Create / remove the sysdev file for attribute attr_<attr_name> on the
 * sys_device embedded in @mem.  @mem is parenthesized so that any pointer
 * expression (not just a plain identifier) expands correctly.
 */
#define mem_create_simple_file(mem, attr_name)	\
	sysdev_create_file(&(mem)->sysdev, &attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	sysdev_remove_file(&(mem)->sysdev, &attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
315
316
print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
		 char *buf)
317
{
318
	return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
319
320
}

321
/* Class attribute (mode 0444) served by print_block_size(); no store handler. */
static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
322
323
324

static int block_size_init(void)
{
325
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
326
				&attr_block_size_bytes.attr);
327
328
329
330
331
332
333
334
335
336
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
337
338
memory_probe_store(struct class *class, struct class_attribute *attr,
		   const char *buf, size_t count)
339
340
{
	u64 phys_addr;
341
	int nid;
342
343
344
345
	int ret;

	phys_addr = simple_strtoull(buf, NULL, 0);

346
347
	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
348
349
350
351
352
353

	if (ret)
		count = ret;

	return count;
}
354
static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
355
356
357

static int memory_probe_init(void)
{
358
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
359
				&class_attr_probe.attr);
360
361
}
#else
362
363
364
365
/* Without CONFIG_ARCH_MEMORY_PROBE there is no probe file; report success. */
static inline int memory_probe_init(void)
{
	return 0;
}
366
367
#endif

368
369
370
371
372
373
374
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
375
376
377
store_soft_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
394
395
396
store_hard_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
397
398
399
400
401
402
403
404
405
406
407
408
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

409
410
/*
 * Store-only class attributes for soft and hard page offlining.
 * NOTE(review): mode 0644 advertises read permission although no show
 * handler is supplied; the "probe" attribute uses S_IWUSR for the same
 * store-only pattern -- confirm whether these should match.
 */
static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
411
412
413
414
415
416

static __init int memory_fail_init(void)
{
	int err;

	err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
417
				&class_attr_soft_offline_page.attr);
418
419
	if (!err)
		err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
420
				&class_attr_hard_offline_page.attr);
421
422
423
424
425
426
427
428
429
	return err;
}
#else
/* Without CONFIG_MEMORY_FAILURE there are no offline files; report success. */
static inline int memory_fail_init(void)
{
	return 0;
}
#endif

430
431
432
433
434
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
435
436
437
438
/*
 * Default (weak) implementation: ignores @start_pfn and reports device 0.
 * add_memory_block() stores the returned value in mem->phys_device.
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
439

440
441
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
442
443
444
445
446
447
{
	struct kobject *kobj;
	struct sys_device *sysdev;
	struct memory_block *mem;
	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];

448
449
	kobj = hint ? &hint->sysdev.kobj : NULL;

450
451
452
453
454
455
	/*
	 * This only works because we know that section == sysdev->id
	 * slightly redundant with sysdev_register()
	 */
	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));

456
	kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj);
457
458
459
460
461
462
463
464
465
	if (!kobj)
		return NULL;

	sysdev = container_of(kobj, struct sys_device, kobj);
	mem = container_of(sysdev, struct memory_block, sysdev);

	return mem;
}

466
467
468
469
470
471
472
473
474
475
476
477
478
/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all sysdev classes.
 *
 * Equivalent to find_memory_block_hinted() with no hint; returns NULL
 * when that lookup finds nothing.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}

479
480
481
482
483
484
485
486
487
488
static int add_memory_block(int nid, struct mem_section *section,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	unsigned long start_pfn;
	int ret = 0;

	if (!mem)
		return -ENOMEM;

489
490
	mutex_lock(&mem_sysfs_mutex);

491
492
	mem->phys_index = __section_nr(section);
	mem->state = state;
493
	mem->section_count++;
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->phys_index);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem, section);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);
	if (!ret) {
		if (context == HOTPLUG)
			ret = register_mem_sect_under_node(mem, nid);
	}

512
	mutex_unlock(&mem_sysfs_mutex);
513
514
515
	return ret;
}

516
517
518
519
520
int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

521
	mutex_lock(&mem_sysfs_mutex);
522
	mem = find_memory_block(section);
523
524
525
526
527
528
529
530
531
532

	mem->section_count--;
	if (mem->section_count == 0) {
		unregister_mem_sect_under_nodes(mem);
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem, section);
	}
533

534
	mutex_unlock(&mem_sysfs_mutex);
535
536
537
538
539
540
541
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
542
int register_new_memory(int nid, struct mem_section *section)
543
{
544
	return add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG);
545
546
547
548
}

int unregister_memory_section(struct mem_section *section)
{
549
	if (!present_section(section))
550
551
552
553
554
555
556
557
558
559
560
561
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}

/*
 * Initialize the sysfs support for memory devices...
 *
 * Registers the memory sysdev class, adds a MEM_ONLINE block for every
 * present section and creates the probe, offline-page and block-size
 * files.  If class registration fails, nothing else is attempted.  After
 * that, every step is attempted even if an earlier one failed; the first
 * error seen is the one returned (and logged).
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;

	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
	ret = sysdev_class_register(&memory_sysdev_class);
	if (ret)
		goto out;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE,
				       BOOT);
		/* Remember only the first failure, but keep going. */
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}