/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820/api.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/set_memory.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>

#include "mm_internal.h"

#include "ident_map.c"

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~0;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * When memory is added, make sure all the processes' MMs have
 * suitable PGD entries in the local PGD-level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		pgd_t *pgd_ref = pgd_offset_k(addr);
		const p4d_t *p4d_ref;
		struct page *page;

		/*
		 * With folded p4d, pgd_none() is always false, so we need to
		 * handle synchronization at the p4d level.
		 */
		BUILD_BUG_ON(pgd_none(*pgd_ref));
		p4d_ref = p4d_offset(pgd_ref, addr);

		if (p4d_none(*p4d_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			p4d_t *p4d;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			p4d = p4d_offset(pgd, addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
				BUG_ON(p4d_page_vaddr(*p4d)
				       != p4d_page_vaddr(*p4d_ref));

			if (p4d_none(*p4d))
				set_p4d(p4d, *p4d_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * NOTE: This function is marked __ref because it calls an __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

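/*
 * The fill_*() helpers below each walk one level of the kernel page tables
 * for @vaddr, allocating the next-level table via spp_getpage() when the
 * entry is empty, and return a pointer to the entry one level down.
 */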
static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		p4d_t *p4d = (p4d_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, p4d);
		if (p4d != p4d_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       p4d, p4d_offset(pgd, 0));
	}
	return p4d_offset(pgd, vaddr);
}

static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
{
	if (p4d_none(*p4d)) {
		pud_t *pud = (pud_t *)spp_getpage();
		p4d_populate(&init_mm, p4d, pud);
		if (pud != pud_offset(p4d, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pud, pud_offset(p4d, 0));
	}
	return pud_offset(p4d, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #03!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
{
	pmd_t *pmd = fill_pmd(pud, vaddr);
	pte_t *pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

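/*
 * Install @new_pte for @vaddr, starting the walk from the given p4d/pud
 * page and filling in any missing intermediate levels on the way down.
 */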
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
{
	p4d_t *p4d = p4d_page + p4d_index(vaddr);
	pud_t *pud = fill_pud(p4d, vaddr);

	__set_pte_vaddr(pud, vaddr, new_pte);
}

void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud = pud_page + pud_index(vaddr);

	__set_pte_vaddr(pud, vaddr, new_pte);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	p4d_t *p4d_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}

	p4d_page = p4d_offset(pgd, 0);
	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
}

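/*
 * populate_extra_pmd()/populate_extra_pte() allocate any missing
 * intermediate levels for @vaddr in the kernel page tables and return
 * the pmd/pte entry so the caller can install a mapping there.
 */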
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	p4d = fill_p4d(pgd, vaddr);
	pud = fill_pud(p4d, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
					enum page_cache_mode cache)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pgprot_t prot;

	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
		pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			p4d = (p4d_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
		if (p4d_none(*p4d)) {
			pud = (pud_t *) spp_getpage();
			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(p4d, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_base holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	/*
	 * Native path, max_pfn_mapped is not set yet.
	 * Xen has valid max_pfn_mapped set in
	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
	 */
	if (max_pfn_mapped)
		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

/*
 * Create PTE level page table mapping for physical addresses.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
	      pgprot_t prot)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	pte_t *pte;
	int i;

	pte = pte_page + pte_index(paddr);
	i = pte_index(paddr);

	for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
		paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pte(pte, __pte(0));
			continue;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen, for example, has some special requirements, like mapping
		 * pagetable pages as RO. So assume whoever pre-set up
		 * these mappings knew what they were doing.
		 */
		if (!pte_none(*pte)) {
			if (!after_bootmem)
				pages++;
			continue;
		}

		if (0)
			pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return paddr_last;
}

/*
 * Create PMD level page table mapping for physical addresses. The virtual
 * and physical addresses have to be aligned at this level.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;

	int i = pmd_index(paddr);

	for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
		pmd_t *pmd = pmd_page + pmd_index(paddr);
		pte_t *pte;
		pgprot_t new_prot = prot;

		paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pmd(pmd, __pmd(0));
			continue;
		}

		if (!pmd_none(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = (pte_t *)pmd_page_vaddr(*pmd);
				paddr_last = phys_pte_init(pte, paddr,
							   paddr_end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pte = alloc_low_page();
		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, pte);
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return paddr_last;
}

/*
 * Create PUD level page table mapping for physical addresses. The virtual
 * and physical addresses do not have to be aligned at this level. KASLR can
 * randomize virtual addresses up to this level.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = pud_index(vaddr);

	for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
		pud_t *pud;
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		vaddr = (unsigned long)__va(paddr);
		pud = pud_page + pud_index(vaddr);
		paddr_next = (paddr & PUD_MASK) + PUD_SIZE;

		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pud(pud, __pud(0));
			continue;
		}

		if (!pud_none(*pud)) {
			if (!pud_large(*pud)) {
				pmd = pmd_offset(pud, 0);
				paddr_last = phys_pmd_init(pmd, paddr,
							   paddr_end,
							   page_size_mask,
							   prot);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
					PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pmd = alloc_low_page();
		paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
					   page_size_mask, prot);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, pmd);
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return paddr_last;
}

/*
 * Create page table mappings of the physical memory for a specific range of
 * physical addresses. The virtual and physical addresses have to be aligned
 * on PMD level down. It returns the last physical address mapped.
 */
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
			     unsigned long paddr_end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;

	paddr_last = paddr_end;
	vaddr = (unsigned long)__va(paddr_start);
	vaddr_end = (unsigned long)__va(paddr_end);
	vaddr_start = vaddr;

	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		pgd_t *pgd = pgd_offset_k(vaddr);
		p4d_t *p4d;
		pud_t *pud;

		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;

		BUILD_BUG_ON(pgd_none(*pgd));
		p4d = p4d_offset(pgd, vaddr);
		if (p4d_val(*p4d)) {
			pud = (pud_t *)p4d_page_vaddr(*p4d);
			paddr_last = phys_pud_init(pud, __pa(vaddr),
						   __pa(vaddr_end),
						   page_size_mask);
			continue;
		}

		pud = alloc_low_page();
		paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
					   page_size_mask);

		spin_lock(&init_mm.page_table_lock);
		p4d_populate(&init_mm, p4d, pud);
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(vaddr_start, vaddr_end - 1);

	__flush_tlb_all();

	return paddr_last;
}

#ifndef CONFIG_NUMA
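/* Without NUMA, all memory is assigned to node 0. */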
void __init initmem_init(void)
{
	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
}
#endif

void __init paging_init(void)
{
	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_MEMORY);
	if (N_MEMORY != N_NORMAL_MEMORY)
		node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug, the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones +
		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, start + size);

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#define PAGE_INUSE 0xFD

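/*
 * Free a page-table page: altmap-backed pages go back to the altmap,
 * bootmem-reserved pages are released via put_page_bootmem() or
 * free_reserved_page(), and regular pages are freed with free_pages().
 */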
static void __meminit free_pagetable(struct page *page, int order)
{
	unsigned long magic;
	unsigned int nr_pages = 1 << order;
	struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);

	if (altmap) {
		vmem_altmap_free(altmap, nr_pages);
		return;
	}

	/* bootmem page has reserved flag */
	if (PageReserved(page)) {
		__ClearPageReserved(page);

		magic = (unsigned long)page->freelist;
		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			while (nr_pages--)
				put_page_bootmem(page++);
		} else
			while (nr_pages--)
				free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
}

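/*
 * The free_*_table() helpers free a lower-level page table (and clear the
 * entry pointing to it) only once every slot in that table is empty.
 */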
static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	/* free a pte table */
	free_pagetable(pmd_page(*pmd), 0);
	spin_lock(&init_mm.page_table_lock);
	pmd_clear(pmd);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	/* free a pmd table */
	free_pagetable(pud_page(*pud), 0);
	spin_lock(&init_mm.page_table_lock);
	pud_clear(pud);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	/* free a pud table */
	free_pagetable(p4d_page(*p4d), 0);
	spin_lock(&init_mm.page_table_lock);
	p4d_clear(p4d);
	spin_unlock(&init_mm.page_table_lock);
}

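/*
 * Unmap the pte entries covering [addr, end). The walk bails out for
 * physical addresses below 1GB, whose page tables were set up by
 * head_64.S and must not be removed.
 */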
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
		 bool direct)
{
	unsigned long next, pages = 0;
	pte_t *pte;
	void *page_addr;
	phys_addr_t phys_addr;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		/*
		 * We mapped [0,1G) memory as identity mapping when
		 * initializing, in arch/x86/kernel/head_64.S. These
		 * pagetables cannot be removed.
		 */
		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
		if (phys_addr < (phys_addr_t)0x40000000)
			return;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {