/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

/* The same as the cpu_hotplug lock, but for memory hotplug. */
static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
	/*
	 * Also blocks the new readers during
	 * an ongoing mem hotplug operation.
	 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} mem_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "mem_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
#define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)

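/*
 * Reader side of the memory-hotplug lock: takes a reference so that a
 * concurrent mem_hotplug_begin() writer waits until all readers drop out.
 */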
void get_online_mems(void)
{
	might_sleep();
	if (mem_hotplug.active_writer == current)
		return;
	memhp_lock_acquire_read();
	mutex_lock(&mem_hotplug.lock);
	mem_hotplug.refcount++;
	mutex_unlock(&mem_hotplug.lock);

}

void put_online_mems(void)
{
	if (mem_hotplug.active_writer == current)
		return;
	mutex_lock(&mem_hotplug.lock);

	if (WARN_ON(!mem_hotplug.refcount))
		mem_hotplug.refcount++; /* try to fix things up */

	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
		wake_up_process(mem_hotplug.active_writer);
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();

}

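/*
 * Writer side of the memory-hotplug lock: marks the current task as the
 * active writer and sleeps until every outstanding reader reference is
 * dropped, then returns with mem_hotplug.lock held until mem_hotplug_done().
 */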
static void mem_hotplug_begin(void)
{
	mem_hotplug.active_writer = current;

	memhp_lock_acquire();
	for (;;) {
		mutex_lock(&mem_hotplug.lock);
		if (likely(!mem_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&mem_hotplug.lock);
		schedule();
	}
}

static void mem_hotplug_done(void)
{
	mem_hotplug.active_writer = NULL;
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();
}

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
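/*
 * Tag a bootmem-allocated page (memmap, usemap or pgdat data) so that it can
 * be identified and released again when the backing memory is hot-removed.
 */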
void get_page_bootmem(unsigned long info,  struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
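/*
 * Walk one sparse memory section and mark its memmap and pageblock usemap
 * pages as bootmem-allocated so hot-remove can account for them later.
 */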
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

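/*
 * Record bootmem info for a whole node: the pgdat itself, any wait tables
 * embedded in its zones, and every memory section that belongs to the node.
 */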
void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone_is_initialized(zone)) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other node.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

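/* Grow the zone span so that it covers the newly added [start_pfn, end_pfn). */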
static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
				     unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone_end_pfn(zone);
	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Keep this consistent with free_area_init_core():
		 * if spanned_pages == 0, keep zone_start_pfn == 0 as well.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

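/* Re-stamp page->flags links so pages in [start_pfn, end_pfn) point at this zone and node. */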
static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}

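/*
 * Move the pfn range [start_pfn, end_pfn) from the start of @z2 into the end
 * of the lower zone @z1, e.g. to turn ZONE_MOVABLE memory back into a kernel
 * zone when it is onlined as kernel memory.
 */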
static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (!zone_is_empty(z1))
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (!zone_is_empty(z2))
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
				      unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
					unsigned long phys_start_pfn)
{
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* when initializing the mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * EEXIST is finally dealt with by ioresource collision
		 * check. see add_memory() => register_memory_resource()
		 * Warning will be printed if there is collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

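/*
 * Shrink the zone span after a section has been removed: if the section was
 * at either end of the zone, move the boundary to the next valid section,
 * and if no valid section is left, mark the zone as empty.
 */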
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
	unsigned long zone_end_pfn = z;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, find the second smallest valid mem_section
		 * and use it as the new start of the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, find the second biggest valid mem_section
		 * and use it as the new end of the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * The section is neither the biggest nor the smallest mem_section in
	 * the zone, so removing it only creates a hole and the zone span need
	 * not change. However, the zone may now consist only of holes, so
	 * check whether any valid section is left.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If this is the section being removed, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
	unsigned long pgdat_end_pfn = p;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, find the second smallest valid mem_section
		 * and use it as the new start of the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we need
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, find the second biggest valid mem_section
		 * and use it as the new end of the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * The section is neither the biggest nor the smallest mem_section in
	 * the pgdat, so removing it only creates a hole and the pgdat span
	 * need not change. However, the pgdat may now consist only of holes,
	 * so check whether any valid section is left.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If this is the section being removed, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

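/* Shrink the zone and pgdat spans to account for one removed memory section. */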
static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	int sections_to_remove;
	resource_size_t start, size;
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	start = phys_start_pfn << PAGE_SHIFT;
	size = nr_pages * PAGE_SIZE;
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret) {
		resource_size_t endres = start + size - 1;

		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
				&start, &endres, ret);
	}

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

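/*
 * walk_system_ram_range() callback: hand every page in the range to the
 * registered online_page_callback and count how many pages were onlined.
 */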
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which states of node_states will be changed when memory is onlined */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * if the memory to be online is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before online, we will
	 * need to set the node to node_states[N_NORMAL_MEMORY] after
	 * the memory is online.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node to node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}

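/*
 * Bring a section-aligned pfn range online: move it into the requested zone
 * if necessary, notify MEM_GOING_ONLINE, hand the pages to the page allocator
 * via online_pages_range(), update the zone/node accounting and zonelists,
 * and finally start kswapd for the node and send MEM_ONLINE.
 */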
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	mem_hotplug_begin();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	ret = -EINVAL;
	if ((zone_idx(zone) > ZONE_NORMAL ||
	    online_type == MMOP_ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone))
		goto out;

	if (online_type == MMOP_ONLINE_KERNEL &&
	    zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
			goto out;
	}
	if (online_type == MMOP_ONLINE_MOVABLE &&
	    zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
			goto out;
	}

	/* Previous code may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = pfn_to_nid(pfn);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		goto out;
	}
	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			    << PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		goto out;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);

out:
	mem_hotplug_done();
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

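/*
 * Allocate and initialize a pglist_data structure for a node that is being
 * brought up by memory hot-add, and build its zonelists so they can be
 * referenced before any memory is onlined.
 */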
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = PFN_DOWN(start);

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing a not-yet-initialized zonelist, build it here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/**
 * try_online_node - online a node if offlined
 *
 * called by cpu_up() to online a node without onlined memory.
 */
int try_online_node(int nid)
{
	pg_data_t	*pgdat;
	int	ret;

	if (node_online(nid))
		return 0;

	mem_hotplug_begin();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	}

out:
	mem_hotplug_done();
	return ret;
}

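/*
 * Hot-added ranges must cover whole memory sections: both the start address
 * and the size have to be section-aligned (e.g. 128 MiB with the default
 * x86_64 section size), because sparsemem adds and removes memory one
 * section at a time.
 */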
static int check_hotplug_memory_range(u64 start, u64 size)
{
	u64 start_pfn = PFN_DOWN(start);
	u64 nr_pages = size >> PAGE_SHIFT;

	/* Memory range must be aligned with section */
	if ((start_pfn & ~PAGE_SECTION_MASK) ||
	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
				(unsigned long long)start,
				(unsigned long long)size);
		return -EINVAL;
	}

	return 0;
}

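/*
 * Hot-add a physical memory range to a node: register it as "System RAM",
 * allocate a pgdat if the node was offline, ask the architecture to create
 * the mappings and memmap via arch_add_memory(), register the node with
 * sysfs and record the range in the firmware memmap. The memory still has
 * to be onlined afterwards.
 */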
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	struct resource *res;
	int ret;

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		return ret;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}

	mem_hotplug_begin();

	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* We online the node here; we can't roll back from this point. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, CPUs on
		 * the node can't be hot-added. There is no way to roll back
		 * now, so check it with BUG_ON() to catch it reluctantly.
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	mem_hotplug_done();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in a range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
 * and hugepages). We scan the pfn range because it's much easier than
 * scanning over a linked list. This function returns the pfn of the first
 * found movable page, or 0 if none is found.
 */
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
			if (PageHuge(page)) {
				if (is_hugepage_active(page))
					return pfn;
				else
					pfn = round_up(pfn + 1,
						1 << compound_order(page)) - 1;
			}
		}
	}
	return 0;
}

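/*
 * Isolate up to NR_OFFLINE_AT_ONCE_PAGES LRU/hugetlb pages in the pfn range
 * onto a private list so they can be migrated off the memory being offlined.
 */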
#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			struct page *head = compound_head(page);
			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
			if (compound_order(head) > PFN_SECTION_SHIFT) {
				ret = -EBUSY;
				break;
			}
			if (isolate_huge_page(page, &source))
				move_pages -= 1 << compound_order(head);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page, "failed to remove from LRU");
#endif
			put_page(page);
			/* Because we don't have the big zone->lock, we should
			   check this again here. */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&