/*
 *  linux/mm/nommu.c
 *
 *  Replacement code for mm functions to support CPUs that don't
 *  have any form of memory management unit (thus no virtual memory).
 *
 *  See Documentation/nommu-mmap.txt
 *
 *  Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
 *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
 *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
 *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
 *  Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/export.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/compiler.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/printk.h>

#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include "internal.h"

void *high_memory;
EXPORT_SYMBOL(high_memory);
struct page *mem_map;
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
unsigned long highest_memmap_pfn;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;

atomic_long_t mmap_pages_allocated;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_read_positive(&vm_committed_as);
}

EXPORT_SYMBOL_GPL(vm_memory_committed);
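
/*
 * Illustrative sketch (not part of the original file): a balloon-style
 * driver could sample the commit level and report it to its host policy
 * engine.  The my_balloon_report() helper below is hypothetical.
 *
 *	unsigned long committed_pages = vm_memory_committed();
 *
 *	my_balloon_report(committed_pages << PAGE_SHIFT);
 */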

EXPORT_SYMBOL(mem_map);

/* list of mapped, potentially shareable regions */
static struct kmem_cache *vm_region_jar;
struct rb_root nommu_region_tree = RB_ROOT;
DECLARE_RWSEM(nommu_region_sem);

const struct vm_operations_struct generic_file_vm_ops = {
};

/*
 * Return the total memory allocated for this pointer, not
 * just what the caller asked for.
 *
 * Doesn't have to be accurate, i.e. may have races.
 */
unsigned int kobjsize(const void *objp)
{
	struct page *page;

	/*
	 * If the object we have should not have ksize performed on it,
	 * return size of 0
	 */
	if (!objp || !virt_addr_valid(objp))
		return 0;

	page = virt_to_head_page(objp);

	/*
	 * If the allocator sets PageSlab, we know the pointer came from
	 * kmalloc().
	 */
	if (PageSlab(page))
		return ksize(objp);

	/*
	 * If it's not a compound page, see if we have a matching VMA
	 * region. This test is intentionally done in reverse order,
	 * so if there's no VMA, we still fall through and hand back
	 * PAGE_SIZE for 0-order pages.
	 */
	if (!PageCompound(page)) {
		struct vm_area_struct *vma;

		vma = find_vma(current->mm, (unsigned long)objp);
		if (vma)
			return vma->vm_end - vma->vm_start;
	}

	/*
	 * The ksize() function is only guaranteed to work for pointers
	 * returned by kmalloc(). So handle arbitrary pointers here.
	 */
	return PAGE_SIZE << compound_order(page);
}
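
/*
 * Illustrative sketch (not from the original source): unlike ksize(),
 * kobjsize() also copes with pointers that did not come from kmalloc(),
 * e.g. page-allocator or mapped-region memory.  For a slab object it
 * simply reports the rounded-up slab size:
 *
 *	char *p = kmalloc(100, GFP_KERNEL);
 *
 *	if (p) {
 *		size_t sz = kobjsize(p);	// >= 100, slab-rounded
 *		kfree(p);
 *	}
 */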

long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		      unsigned long start, unsigned long nr_pages,
		      unsigned int foll_flags, struct page **pages,
		      struct vm_area_struct **vmas, int *nonblocking)
{
	struct vm_area_struct *vma;
	unsigned long vm_flags;
	int i;

	/* calculate required read or write permissions.
	 * If FOLL_FORCE is set, we only require the "MAY" flags.
	 */
	vm_flags  = (foll_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (foll_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

	for (i = 0; i < nr_pages; i++) {
		vma = find_vma(mm, start);
		if (!vma)
			goto finish_or_fault;

		/* protect what we can, including chardevs */
		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			goto finish_or_fault;

		if (pages) {
			pages[i] = virt_to_page(start);
			if (pages[i])
				page_cache_get(pages[i]);
		}
		if (vmas)
			vmas[i] = vma;
		start = (start + PAGE_SIZE) & PAGE_MASK;
	}

	return i;

finish_or_fault:
	return i ? : -EFAULT;
}

/*
 * get a list of pages in an address range belonging to the specified process
 * and indicate the VMA that covers each page
 * - this is potentially dodgy as we may end up incrementing the page count of a
 *   slab page or a secondary page from a compound page
 * - don't permit access to VMAs that don't support it, such as I/O mappings
 */
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		    unsigned long start, unsigned long nr_pages,
		    int write, int force, struct page **pages,
		    struct vm_area_struct **vmas)
{
	int flags = 0;

	if (write)
		flags |= FOLL_WRITE;
	if (force)
		flags |= FOLL_FORCE;

	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
				NULL);
}
EXPORT_SYMBOL(get_user_pages);
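
/*
 * Illustrative usage sketch (an assumption, not taken from this file):
 * pin one page of the current process for writing and drop the
 * reference afterwards; "addr" is an assumed user address.
 *
 *	struct page *page;
 *	long got;
 *
 *	down_read(&current->mm->mmap_sem);
 *	got = get_user_pages(current, current->mm, addr, 1,
 *			     1, 0, &page, NULL);	// write=1, force=0
 *	up_read(&current->mm->mmap_sem);
 *	if (got == 1)
 *		put_page(page);
 */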

long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   int write, int force, struct page **pages,
			   int *locked)
{
	return get_user_pages(tsk, mm, start, nr_pages, write, force,
			      pages, NULL);
}
EXPORT_SYMBOL(get_user_pages_locked);

long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
			       unsigned long start, unsigned long nr_pages,
			       int write, int force, struct page **pages,
			       unsigned int gup_flags)
{
	long ret;
	down_read(&mm->mmap_sem);
	ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
			     pages, NULL);
	up_read(&mm->mmap_sem);
	return ret;
}
EXPORT_SYMBOL(__get_user_pages_unlocked);

long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
			     unsigned long start, unsigned long nr_pages,
			     int write, int force, struct page **pages)
{
	return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
					 force, pages, 0);
}
EXPORT_SYMBOL(get_user_pages_unlocked);

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return -EINVAL;

	*pfn = address >> PAGE_SHIFT;
	return 0;
}
EXPORT_SYMBOL(follow_pfn);
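
/*
 * Illustrative sketch (not part of the original file): a driver handed a
 * VM_PFNMAP mapping can translate a user address into a PFN; "vma" and
 * "uaddr" are assumed to come from the caller.
 *
 *	unsigned long pfn;
 *
 *	if (!follow_pfn(vma, uaddr, &pfn))
 *		pr_debug("uaddr %lx maps to pfn %lx\n", uaddr, pfn);
 */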

LIST_HEAD(vmap_area_list);

void vfree(const void *addr)
{
	kfree(addr);
}
EXPORT_SYMBOL(vfree);

void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
	/*
	 *  You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
	 * returns only a logical address.
	 */
	return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
}
EXPORT_SYMBOL(__vmalloc);

void *vmalloc_user(unsigned long size)
{
	void *ret;

	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
			PAGE_KERNEL);
	if (ret) {
		struct vm_area_struct *vma;

		down_write(&current->mm->mmap_sem);
		vma = find_vma(current->mm, (unsigned long)ret);
		if (vma)
			vma->vm_flags |= VM_USERMAP;
		up_write(&current->mm->mmap_sem);
	}

	return ret;
}
EXPORT_SYMBOL(vmalloc_user);
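
/*
 * Illustrative sketch (an assumption, not from the original source): a
 * driver that later wants to hand the buffer to userspace allocates it
 * with vmalloc_user(), since the region gets marked VM_USERMAP:
 *
 *	void *buf = vmalloc_user(4 * PAGE_SIZE);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	// expose buf via the driver's mmap handler, then later:
 *	vfree(buf);
 */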

struct page *vmalloc_to_page(const void *addr)
{
	return virt_to_page(addr);
}
EXPORT_SYMBOL(vmalloc_to_page);

unsigned long vmalloc_to_pfn(const void *addr)
{
	return page_to_pfn(virt_to_page(addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);

long vread(char *buf, char *addr, unsigned long count)
{
	/* Don't allow overflow */
	if ((unsigned long) buf + count < count)
		count = -(unsigned long) buf;

	memcpy(buf, addr, count);
	return count;
}

long vwrite(char *buf, char *addr, unsigned long count)
{
	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	memcpy(addr, buf, count);
	return count;
}

/*
 *	vmalloc  -  allocate virtually contiguous memory
 *
 *	@size:		allocation size
 *
 *	Allocate enough pages to cover @size from the page level
 *	allocator and map them into contiguous kernel virtual space.
 *
 *	For tight control over page level allocator and protection flags
 *	use __vmalloc() instead.
 */
void *vmalloc(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc);
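
/*
 * Illustrative sketch (not in the original file): on nommu, vmalloc()
 * falls back to kmalloc(), so the memory is physically contiguous as
 * well, but callers still pair it with vfree() as usual:
 *
 *	u32 *table = vmalloc(1024 * sizeof(u32));
 *
 *	if (table) {
 *		table[0] = 0;
 *		vfree(table);
 *	}
 */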

/*
 *	vzalloc - allocate virtually contiguous memory with zero fill
 *
 *	@size:		allocation size
 *
 *	Allocate enough pages to cover @size from the page level
 *	allocator and map them into contiguous kernel virtual space.
 *	The memory allocated is set to zero.
 *
 *	For tight control over page level allocator and protection flags
 *	use __vmalloc() instead.
 */
void *vzalloc(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
			PAGE_KERNEL);
}
EXPORT_SYMBOL(vzalloc);

/**
 * vmalloc_node - allocate memory on a specific node
 * @size:	allocation size
 * @node:	numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vmalloc_node(unsigned long size, int node)
{
	return vmalloc(size);
}
EXPORT_SYMBOL(vmalloc_node);

/**
 * vzalloc_node - allocate memory on a specific node with zero fill
 * @size:	allocation size
 * @node:	numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vzalloc_node(unsigned long size, int node)
{
	return vzalloc(size);
}
EXPORT_SYMBOL(vzalloc_node);

#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/**
 *	vmalloc_exec  -  allocate virtually contiguous, executable memory
 *	@size:		allocation size
 *
 *	Kernel-internal function to allocate enough pages to cover @size from
 *	the page level allocator and map them into contiguous and
 *	executable kernel virtual space.
 *
 *	For tight control over page level allocator and protection flags
 *	use __vmalloc() instead.
 */

void *vmalloc_exec(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
}

/**
 * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
 *	@size:		allocation size
 *
 *	Allocate enough 32bit PA addressable pages to cover @size from the
 *	page level allocator and map them into contiguous kernel virtual space.
 */
void *vmalloc_32(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc_32);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 *	@size:		allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 *
 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
 * remap_vmalloc_range() are permissible.
 */
void *vmalloc_32_user(unsigned long size)
{
	/*
	 * We'll have to sort out the ZONE_DMA bits for 64-bit,
	 * but for now this can simply use vmalloc_user() directly.
	 */
	return vmalloc_user(size);
}
EXPORT_SYMBOL(vmalloc_32_user);

void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL(vmap);

void vunmap(const void *addr)
{
	BUG();
}
EXPORT_SYMBOL(vunmap);

void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL(vm_map_ram);

void vm_unmap_ram(const void *mem, unsigned int count)
{
	BUG();
}
EXPORT_SYMBOL(vm_unmap_ram);

void vm_unmap_aliases(void)
{
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/*
 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
 * have one.
 */
void __weak vmalloc_sync_all(void)
{
}

/**
 *	alloc_vm_area - allocate a range of kernel address space
 *	@size:		size of the area
 *
 *	Returns:	NULL on failure, vm_struct on success
 *
 *	This function reserves a range of kernel address space, and
 *	allocates pagetables to map that range.  No actual mappings
 *	are created.  If the kernel address space is not shared
 *	between processes, it syncs the pagetable across all
 *	processes.
 */
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);

void free_vm_area(struct vm_struct *area)
{
	BUG();
}
EXPORT_SYMBOL_GPL(free_vm_area);

int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
		   struct page *page)
{
	return -EINVAL;
}
EXPORT_SYMBOL(vm_insert_page);

/*
 *  sys_brk() for the most part doesn't need the global kernel
 *  lock, except when an application is doing something nasty
 *  like trying to un-brk an area that has already been mapped
 *  to a regular file.  In this case, the unmapping will need
 *  to invoke file system routines that need the global lock.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	struct mm_struct *mm = current->mm;

	if (brk < mm->start_brk || brk > mm->context.end_brk)
		return mm->brk;

	if (mm->brk == brk)
		return mm->brk;

	/*
	 * Always allow shrinking brk
	 */
	if (brk <= mm->brk) {
		mm->brk = brk;
		return brk;
	}

	/*
	 * Ok, looks good - let it rip.
	 */
	flush_icache_range(mm->brk, brk);
	return mm->brk = brk;
}
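
/*
 * Illustrative sketch (an assumption, not from the original file): under
 * nommu the break can only move within the region reserved by the binary
 * loader (mm->context.end_brk), so from userspace:
 *
 *	void *cur  = sbrk(0);		// current break
 *	void *more = sbrk(4096);	// succeeds while below end_brk
 *	sbrk(-4096);			// shrinking is always allowed
 */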

/*
 * initialise the VMA and region record slabs
 */
void __init mmap_init(void)
{
	int ret;

	ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
	VM_BUG_ON(ret);
	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
}

/*
 * validate the region tree
 * - the caller must hold the region lock
 */
#ifdef CONFIG_DEBUG_NOMMU_REGIONS
static noinline void validate_nommu_regions(void)
{
	struct vm_region *region, *last;
	struct rb_node *p, *lastp;

	lastp = rb_first(&nommu_region_tree);
	if (!lastp)
		return;

	last = rb_entry(lastp, struct vm_region, vm_rb);
	BUG_ON(last->vm_end <= last->vm_start);
	BUG_ON(last->vm_top < last->vm_end);

	while ((p = rb_next(lastp))) {
		region = rb_entry(p, struct vm_region, vm_rb);
		last = rb_entry(lastp, struct vm_region, vm_rb);

		BUG_ON(region->vm_end <= region->vm_start);
		BUG_ON(region->vm_top < region->vm_end);
		BUG_ON(region->vm_start < last->vm_top);

		lastp = p;
	}
}
#else
static void validate_nommu_regions(void)
{
}
#endif

/*
 * add a region into the global tree
 */
static void add_nommu_region(struct vm_region *region)
{
	struct vm_region *pregion;
	struct rb_node **p, *parent;

	validate_nommu_regions();

	parent = NULL;
	p = &nommu_region_tree.rb_node;
	while (*p) {
		parent = *p;
		pregion = rb_entry(parent, struct vm_region, vm_rb);
		if (region->vm_start < pregion->vm_start)
			p = &(*p)->rb_left;
		else if (region->vm_start > pregion->vm_start)
			p = &(*p)->rb_right;
		else if (pregion == region)
			return;
		else
			BUG();
	}

	rb_link_node(&region->vm_rb, parent, p);
	rb_insert_color(&region->vm_rb, &nommu_region_tree);

	validate_nommu_regions();
}

/*
 * delete a region from the global tree
 */
static void delete_nommu_region(struct vm_region *region)
{
	BUG_ON(!nommu_region_tree.rb_node);

	validate_nommu_regions();
	rb_erase(&region->vm_rb, &nommu_region_tree);
	validate_nommu_regions();
}

/*
 * free a contiguous series of pages
 */
static void free_page_series(unsigned long from, unsigned long to)
{
	for (; from < to; from += PAGE_SIZE) {
		struct page *page = virt_to_page(from);

		atomic_long_dec(&mmap_pages_allocated);
		put_page(page);
	}
}

/*
 * release a reference to a region
 * - the caller must hold the region semaphore for writing, which this releases
 * - the region may not have been added to the tree yet, in which case vm_top
 *   will equal vm_start
 */
static void __put_nommu_region(struct vm_region *region)
	__releases(nommu_region_sem)
{
	BUG_ON(!nommu_region_tree.rb_node);

	if (--region->vm_usage == 0) {
		if (region->vm_top > region->vm_start)
			delete_nommu_region(region);
		up_write(&nommu_region_sem);

		if (region->vm_file)
			fput(region->vm_file);

		/* IO memory and memory shared directly out of the pagecache
		 * from ramfs/tmpfs mustn't be released here */
		if (region->vm_flags & VM_MAPPED_COPY)
			free_page_series(region->vm_start, region->vm_top);
		kmem_cache_free(vm_region_jar, region);
	} else {
		up_write(&nommu_region_sem);
	}
}

/*
 * release a reference to a region
 */
static void put_nommu_region(struct vm_region *region)
{
	down_write(&nommu_region_sem);
	__put_nommu_region(region);
}

/*
 * update protection on a vma
 */
static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
{
#ifdef CONFIG_MPU
	struct mm_struct *mm = vma->vm_mm;
	long start = vma->vm_start & PAGE_MASK;
	while (start < vma->vm_end) {
		protect_page(mm, start, flags);
		start += PAGE_SIZE;
	}
	update_protections(mm);
#endif
}

/*
 * add a VMA into a process's mm_struct in the appropriate place in the list
 * and tree and add to the address space's page tree also if not an anonymous
 * page
 * - should be called with mm->mmap_sem held writelocked
 */
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *pvma, *prev;
	struct address_space *mapping;
	struct rb_node **p, *parent, *rb_prev;

	BUG_ON(!vma->vm_region);

	mm->map_count++;
	vma->vm_mm = mm;

	protect_vma(vma, vma->vm_flags);

	/* add the VMA to the mapping */
	if (vma->vm_file) {
		mapping = vma->vm_file->f_mapping;

		i_mmap_lock_write(mapping);
		flush_dcache_mmap_lock(mapping);
		vma_interval_tree_insert(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
		i_mmap_unlock_write(mapping);
	}

	/* add the VMA to the tree */
	parent = rb_prev = NULL;
	p = &mm->mm_rb.rb_node;
	while (*p) {
		parent = *p;
		pvma = rb_entry(parent, struct vm_area_struct, vm_rb);

		/* sort by: start addr, end addr, VMA struct addr in that order
		 * (the latter is necessary as we may get identical VMAs) */
		if (vma->vm_start < pvma->vm_start)
			p = &(*p)->rb_left;
		else if (vma->vm_start > pvma->vm_start) {
			rb_prev = parent;
			p = &(*p)->rb_right;
		} else if (vma->vm_end < pvma->vm_end)
			p = &(*p)->rb_left;
		else if (vma->vm_end > pvma->vm_end) {
			rb_prev = parent;
			p = &(*p)->rb_right;
		} else if (vma < pvma)
			p = &(*p)->rb_left;
		else if (vma > pvma) {
			rb_prev = parent;
			p = &(*p)->rb_right;
		} else
			BUG();
	}

	rb_link_node(&vma->vm_rb, parent, p);
	rb_insert_color(&vma->vm_rb, &mm->mm_rb);

	/* add VMA to the VMA list also */
	prev = NULL;
	if (rb_prev)
		prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);

	__vma_link_list(mm, vma, prev, parent);
}

/*
 * delete a VMA from its owning mm_struct and address space
 */
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
	int i;
	struct address_space *mapping;
	struct mm_struct *mm = vma->vm_mm;
	struct task_struct *curr = current;

	protect_vma(vma, 0);

	mm->map_count--;
	for (i = 0; i < VMACACHE_SIZE; i++) {
		/* if the vma is cached, invalidate the entire cache */
		if (curr->vmacache[i] == vma) {
			vmacache_invalidate(mm);
			break;
		}
	}

	/* remove the VMA from the mapping */
	if (vma->vm_file) {
		mapping = vma->vm_file->f_mapping;

		i_mmap_lock_write(mapping);
		flush_dcache_mmap_lock(mapping);
		vma_interval_tree_remove(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
		i_mmap_unlock_write(mapping);
	}

	/* remove from the MM's tree and list */
	rb_erase(&vma->vm_rb, &mm->mm_rb);

	if (vma->vm_prev)
		vma->vm_prev->vm_next = vma->vm_next;
	else
		mm->mmap = vma->vm_next;

	if (vma->vm_next)
		vma->vm_next->vm_prev = vma->vm_prev;
}

/*
 * destroy a VMA record
 */
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file)
		fput(vma->vm_file);
	put_nommu_region(vma->vm_region);
	kmem_cache_free(vm_area_cachep, vma);
}

/*
 * look up the first VMA in which addr resides, NULL if none
 * - should be called with mm->mmap_sem at least held readlocked
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	/* check the cache first */
	vma = vmacache_find(mm, addr);
	if (likely(vma))
		return vma;

	/* trawl the list (there may be multiple mappings in which addr
	 * resides) */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->vm_start > addr)
			return NULL;
		if (vma->vm_end > addr) {
			vmacache_update(addr, vma);
			return vma;
		}
	}

	return NULL;
}
EXPORT_SYMBOL(find_vma);
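
/*
 * Illustrative usage sketch (not part of the original source): look up
 * the VMA covering an address while holding mmap_sem for reading; "mm"
 * and "addr" are assumed to come from the caller.
 *
 *	struct vm_area_struct *vma;
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, addr);
 *	if (vma)
 *		pr_debug("%lx is covered by [%lx, %lx)\n",
 *			 addr, vma->vm_start, vma->vm_end);
 *	up_read(&mm->mmap_sem);
 */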

/*
 * find a VMA
 * - we don't extend stack VMAs under NOMMU conditions
 */
struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
	return find_vma(mm, addr);
}

/*
 * expand a stack to a given address
 * - not supported under NOMMU conditions
 */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
	return -ENOMEM;
}

/*
 * look up the first VMA that exactly matches addr
 * - should be called with mm->mmap_sem at least held readlocked
 */
static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
					     unsigned long addr,
					     unsigned long len)
{
	struct vm_area_struct *vma;
	unsigned long end = addr + len;

	/* check the cache first */
	vma = vmacache_find_exact(mm, addr, end);
	if (vma)
		return vma;

	/* trawl the list (there may be multiple mappings in which addr
	 * resides) */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->vm_start < addr)
			continue;
		if (vma->vm_start > addr)
			return NULL;
		if (vma->vm_end == end) {
			vmacache_update(addr, vma);
			return vma;
		}
	}

	return NULL;
}

/*
 * determine whether a mapping should be permitted and, if so, what sort of
 * mapping we're capable of supporting
 */
static int validate_mmap_request(struct file *file,
				 unsigned long addr,
				 unsigned long len,
				 unsigned long prot,
				 unsigned long flags,
				 unsigned long pgoff,
				 unsigned long *_capabilities)
{
	unsigned long capabilities, rlen;
	int ret;

	/* do the simple checks first */
	if (flags & MAP_FIXED)
		return -EINVAL;

	if ((flags & MAP_TYPE) != MAP_PRIVATE &&
	    (flags & MAP_TYPE) != MAP_SHARED)
		return -EINVAL;

	if (!len)
		return -EINVAL;

	/* Careful about overflows.. */
	rlen = PAGE_ALIGN(len);
	if (!rlen || rlen > TASK_SIZE)
		return -ENOMEM;

	/* offset overflow? */
	if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;

	if (file) {
		/* files must support mmap */
		if (!file->f_op->mmap)
			return -ENODEV;

		/* work out if what we've got could possibly be shared
		 * - we support chardevs that provide their own "memory"
		 * - we support files/blockdevs that are memory backed
		 */
		if (file->f_op->mmap_capabilities) {
			capabilities = file->f_op->mmap_capabilities(file);
		} else {
			/* no explicit capabilities set, so assume some
			 * defaults */
			switch (file_inode(file)->i_mode & S_IFMT) {
			case S_IFREG:
			case S_IFBLK:
				capabilities = NOMMU_MAP_COPY;
				break;

			case S_IFCHR:
				capabilities =
					NOMMU_MAP_DIRECT |
					NOMMU_MAP_READ |
					NOMMU_MAP_WRITE;
				break;

			default:
				return -EINVAL;
			}
		}

		/* eliminate any capabilities that we can't support on this
		 * device */
		if (!file->f_op->get_unmapped_area)
			capabilities &= ~NOMMU_MAP_DIRECT;
		if (!(file->f_mode & FMODE_CAN_READ))
			capabilities &= ~NOMMU_MAP_COPY;

		/* The file shall have been opened with read permission. */
		if (!(file->f_mode & FMODE_READ))
			return -EACCES;

		if (flags & MAP_SHARED) {
			/* do checks for writing, appending and locking */
			if ((prot & PROT_WRITE) &&
			    !(file->f_mode & FMODE_WRITE))
				return -EACCES;

			if (IS_APPEND(file_inode(file)) &&
			    (file->f_mode & FMODE_WRITE))
				return -EACCES;

			if (locks_verify_locked(file))
				return -EAGAIN;

			if (!(capabilities & NOMMU_MAP_DIRECT))
				return -ENODEV;

			/* we mustn't privatise shared mappings */
			capabilities &= ~NOMMU_MAP_COPY;
		} else {
			/* we're going to read the file into private memory we
			 * allocate */
			if (!(capabilities & NOMMU_MAP_COPY))
				return -ENODEV;

			/* we don't permit a private writable mapping to be
			 * shared with the backing device */
			if (prot & PROT_WRITE)
				capabilities &= ~NOMMU_MAP_DIRECT;
		}

		if (capabilities & NOMMU_MAP_DIRECT) {
			if (((prot & PROT_READ)  && !(capabilities & NOMMU_MAP_READ))  ||
			    ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
			    ((prot & PROT_EXEC)  && !(capabilities & NOMMU_MAP_EXEC))
			    ) {
				capabilities &= ~NOMMU_MAP_DIRECT;
				if (flags & MAP_SHARED) {
					pr_warn("MAP_SHARED not completely supported on !MMU\n");
					return -EINVAL;
				}
			}
		}

		/* handle executable mappings and implied executable
		 * mappings */
		if (path_noexec(&file->f_path)) {
			if (prot & PROT_EXEC)
				return -EPERM;
		} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
			/* handle implication of PROT_EXEC by PROT_READ */
			if (current->personality & READ_IMPLIES_EXEC) {
				if (capabilities & NOMMU_MAP_EXEC)
					prot |= PROT_EXEC;
			}
		} else if ((prot & PROT_READ) &&
			 (prot & PROT_EXEC) &&
			 !(capabilities & NOMMU_MAP_EXEC)
			 ) {
			/* backing file is not executable, try to copy */
			capabilities &= ~NOMMU_MAP_DIRECT;
		}
	} else {
		/* anonymous mappings are always memory backed and can be
		 * privately mapped
		 */