/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/compaction.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

/* The same as the cpu_hotplug lock, but for memory hotplug. */
static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
	/*
	 * Also blocks the new readers during
	 * an ongoing mem hotplug operation.
	 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} mem_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "mem_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
#define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)

bool movable_node_enabled = false;

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
#else
bool memhp_auto_online = true;
#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);

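/* Parse the "memhp_default_state=" boot option ("online"/"offline") */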
static int __init setup_memhp_default_state(char *str)
{
	if (!strcmp(str, "online"))
		memhp_auto_online = true;
	else if (!strcmp(str, "offline"))
		memhp_auto_online = false;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

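
/*
 * Reader side of the memory hotplug lock: take a reference that blocks any
 * new hotplug operation until put_online_mems(). May sleep. The task that is
 * currently performing hotplug (active_writer) may call this without blocking.
 */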
void get_online_mems(void)
{
	might_sleep();
	if (mem_hotplug.active_writer == current)
		return;
	memhp_lock_acquire_read();
	mutex_lock(&mem_hotplug.lock);
	mem_hotplug.refcount++;
	mutex_unlock(&mem_hotplug.lock);

}

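/*
 * Drop the reference taken by get_online_mems() and wake up a writer waiting
 * in mem_hotplug_begin() once the last reader is gone.
 */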
void put_online_mems(void)
{
	if (mem_hotplug.active_writer == current)
		return;
	mutex_lock(&mem_hotplug.lock);

	if (WARN_ON(!mem_hotplug.refcount))
		mem_hotplug.refcount++; /* try to fix things up */

	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
		wake_up_process(mem_hotplug.active_writer);
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();

}

/* Serializes write accesses to mem_hotplug.active_writer. */
static DEFINE_MUTEX(memory_add_remove_lock);

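/*
 * Writer side: serialize against other hotplug operations via
 * memory_add_remove_lock and wait until all get_online_mems() readers have
 * drained. Returns with mem_hotplug.lock held until mem_hotplug_done().
 */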
void mem_hotplug_begin(void)
{
	mutex_lock(&memory_add_remove_lock);

	mem_hotplug.active_writer = current;

	memhp_lock_acquire();
	for (;;) {
		mutex_lock(&mem_hotplug.lock);
		if (likely(!mem_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&mem_hotplug.lock);
		schedule();
	}
}

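/* Finish a hotplug operation: release the locks taken by mem_hotplug_begin() */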
void mem_hotplug_done(void)
{
	mem_hotplug.active_writer = NULL;
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();
	mutex_unlock(&memory_add_remove_lock);
}

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	if (!res)
		return ERR_PTR(-ENOMEM);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
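
/*
 * Mark a bootmem page as carrying hotplug bookkeeping info: record the type in
 * page->freelist, the info in page_private() and take a reference on the page.
 */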
void get_page_bootmem(unsigned long info,  struct page *page,
		      unsigned long type)
{
	page->freelist = (void *)type;
	SetPagePrivate(page);
	set_page_private(page, info);
	page_ref_inc(page);
}

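/*
 * Drop a reference taken by get_page_bootmem(); when the last bootmem
 * reference is gone, clear the bookkeeping and release the page back to the
 * page allocator.
 */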
void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->freelist;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (page_ref_dec_return(page) == 1) {
		page->freelist = NULL;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
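
/*
 * Record (via get_page_bootmem()) which bootmem pages back this section's
 * memmap and usemap so that they can be freed properly on hot-remove.
 */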
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

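/*
 * Register bootmem info for the node's pglist_data and for every memory
 * section that belongs to this node.
 */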
void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other nodes.
		 */
		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

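/*
 * Hot-add a single memory section: allocate its memmap, mark all of its pages
 * reserved, and optionally register a memory block device for it.
 */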
static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
		bool want_memblock)
{
	int ret;
	int i;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
	if (ret < 0)
		return ret;

	/*
	 * Make all the pages reserved so that nobody will stumble over a
	 * half-initialized state.
	 * FIXME: We also have to associate the page with a node because
	 * pfn_to_node relies on the page having the proper node.
	 */
	for (i = 0; i < PAGES_PER_SECTION; i++) {
		unsigned long pfn = phys_start_pfn + i;
		struct page *page;
		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		set_page_node(page, nid);
		SetPageReserved(page);
	}

	if (!want_memblock)
		return 0;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, unsigned long phys_start_pfn,
			unsigned long nr_pages, bool want_memblock)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	struct vmem_altmap *altmap;

	/* During mem_map initialization, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != phys_start_pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			err = -EINVAL;
			goto out;
		}
		altmap->alloc = 0;
	}

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, section_nr_to_pfn(i), want_memblock);

		/*
		 * EEXIST is finally dealt with by the ioresource collision
		 * check; see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}
	vmemmap_populate_print_last();
out:
	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
	unsigned long zone_end_pfn = z;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * The section is neither the biggest nor the smallest mem_section in
	 * the zone; it only creates a hole in the zone, so we need not change
	 * the zone's span. The zone may now consist of nothing but holes,
	 * though, so check whether any valid section is left.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* Skip the section that is being removed */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
	unsigned long pgdat_end_pfn = p;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the node span.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we need
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the node span.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, it only creates a hole in the pgdat, so we need not
	 * change the pgdat's span.
	 * The pgdat may now consist of nothing but holes, though, so check
	 * whether any valid section is left.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* Skip the section that is being removed */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	unsigned long flags;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

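
/*
 * Tear down a single memory section: unregister its memory block device,
 * shrink the zone/node spans and free its memmap.
 */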
static int __remove_section(struct zone *zone, struct mem_section *ms,
		unsigned long map_offset)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms, map_offset);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	unsigned long map_offset = 0;
	int sections_to_remove, ret = 0;

	/* In the ZONE_DEVICE case the device driver owns the memory region */
	if (is_dev_zone(zone)) {
		struct page *page = pfn_to_page(phys_start_pfn);
		struct vmem_altmap *altmap;

		altmap = to_vmem_altmap((unsigned long) page);
		if (altmap)
			map_offset = vmem_altmap_offset(altmap);
	} else {
		resource_size_t start, size;

		start = phys_start_pfn << PAGE_SHIFT;
		size = nr_pages * PAGE_SIZE;

		ret = release_mem_region_adjustable(&iomem_resource, start,
					size);
		if (ret) {
			resource_size_t endres = start + size - 1;

			pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
					&start, &endres, ret);
		}
	}

	clear_zone_contiguous(zone);

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;

		ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
		map_offset = 0;
		if (ret)
			break;
	}

	set_zone_contiguous(zone);

	return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

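/*
 * Online every page in the given range via the registered
 * online_page_callback, count how many pages were onlined, and mark the
 * covered memory sections online.
 */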
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;

	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}

	online_mem_sections(start_pfn, start_pfn + nr_pages);

	*(unsigned long *)arg = onlined_pages;
	return 0;
}

/* Check which node_states will be changed when this memory goes online */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node in node_states[N_NORMAL_MEMORY] after
	 * the memory is online.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node in node_states[N_MEMORY] after the memory
	 * is online.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}

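/*
 * Check whether [pfn, pfn + nr_pages) may be onlined with the given
 * online_type without making the kernel and movable zones overlap.
 */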
bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);

	/*
	 * TODO: there shouldn't be any inherent reason to have ZONE_NORMAL
	 * physically before ZONE_MOVABLE. All we need is that they do not
	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE,
	 * though, so let's stick with that for simplicity for now.
	 * TODO: make sure we do not overlap with ZONE_DEVICE.
	 */
	if (online_type == MMOP_ONLINE_KERNEL) {
		if (zone_is_empty(movable_zone))
			return true;
		return movable_zone->zone_start_pfn >= pfn + nr_pages;
	} else if (online_type == MMOP_ONLINE_MOVABLE) {
		return zone_end_pfn(default_zone) <= pfn;
	}

	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
	return online_type == MMOP_ONLINE_KEEP;
}

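/*
 * Grow the zone span so that it covers [start_pfn, start_pfn + nr_pages).
 * Called under zone_span_writelock().
 */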
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
		unsigned long nr_pages)
{
	unsigned long old_end_pfn = zone_end_pfn(zone);

	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
}

static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
                                     unsigned long nr_pages)
{
	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
}

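
/*
 * Associate a hot-added pfn range with the given zone: grow the zone and node
 * spans accordingly and initialize the struct pages via memmap_init_zone().
 */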
void __ref move_pfn_range_to_zone(struct zone *zone,
		unsigned long start_pfn, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nid = pgdat->node_id;
	unsigned long flags;

	if (zone_is_empty(zone))
		init_currently_empty_zone(zone, start_pfn, nr_pages);

	clear_zone_contiguous(zone);

	/* TODO: the pgdat resize lock is irqsave while the zone span lock is not; it used to be like that before */
	pgdat_resize_lock(pgdat, &flags);
	zone_span_writelock(zone);
	resize_zone_range(zone, start_pfn, nr_pages);
	zone_span_writeunlock(zone);
	resize_pgdat_range(pgdat, start_pfn, nr_pages);
	pgdat_resize_unlock(pgdat, &flags);

	/*
	 * TODO: at this point we have a visible range of pages which are not
	 * associated with their zone properly. Not nice, but
	 * set_pfnblock_flags_mask expects the zone to span the pfn range.
	 * All the pages in the range are reserved, so nobody should be
	 * touching them, so we should be safe.
	 */
	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);

	set_zone_contiguous(zone);
}

/*
 * Returns a default kernel memory zone for the given pfn range.
 * If no kernel zone covers this pfn range it will automatically go
 * to ZONE_NORMAL.
 */
struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	int zid;

	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_intersects(zone, start_pfn, nr_pages))
			return zone;
	}

	return &pgdat->node_zones[ZONE_NORMAL];
}

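/*
 * Decide whether an MMOP_ONLINE_KEEP range should go to ZONE_MOVABLE rather
 * than the default kernel zone: either onlining it as kernel memory is not
 * allowed, or movable_node is enabled and the range does not intersect the
 * default kernel zone.
 */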
static inline bool movable_pfn_range(int nid, struct zone *default_zone,
		unsigned long start_pfn, unsigned long nr_pages)
{
	if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
				MMOP_ONLINE_KERNEL))
		return true;

	if (!movable_node_is_enabled())
		return false;

	return !zone_intersects(default_zone, start_pfn, nr_pages);
}

/*
 * Associates the given pfn range with the given node and the zone appropriate
 * for the given online type.
 */
static struct zone * __meminit move_pfn_range(int online_type, int nid,
		unsigned long start_pfn, unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);

	if (online_type == MMOP_ONLINE_KEEP) {
		struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
		/*
		 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but uses the
		 * movable zone if that is not possible (e.g. we are within
		 * or past the existing movable zone). movable_node overrides
		 * this default and defaults to the movable zone.
		 */
		if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
			zone = movable_zone;
	} else if (online_type == MMOP_ONLINE_MOVABLE) {
		zone = &pgdat->node_zones[ZONE_MOVABLE];
	}

	move_pfn_range_to_zone(zone, start_pfn, nr_pages);
	return zone;
}

/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	nid = pfn_to_nid(pfn);
	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
		return -EINVAL;

	/* associate pfn range with the zone */
	zone = move_pfn_range(online_type, nid, pfn, nr_pages);

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;