/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

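/*
 * Take a free huge page from the first node in the mapping's zonelist
 * that the current cpuset allows and that has pages on its freelist.
 * Caller must hold hugetlb_lock.
 */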
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address,
						htlb_alloc_mask);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

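/*
 * Allocate a new huge page from the buddy allocator, spreading
 * allocations over the online nodes in round-robin order, and release
 * it into the hugetlb pool via its compound-page destructor.
 * Returns 1 on success, 0 on failure.
 */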
static int alloc_fresh_huge_page(void)
{
	static int prev_nid;
	struct page *page;
	static DEFINE_SPINLOCK(nid_lock);
	int nid;

	spin_lock(&nid_lock);
	nid = next_node(prev_nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	prev_nid = nid;
	spin_unlock(&nid_lock);

	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

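/*
 * Hand out a huge page for a fault in the given VMA.  Shared
 * (VM_MAYSHARE) mappings consume one of the reserved pages; private
 * mappings may only use pages not needed to back existing
 * reservations.
 */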
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages++;
	spin_unlock(&hugetlb_lock);
	return NULL;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
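/*
 * Give a huge page back to the buddy allocator: drop it from the
 * hugetlb counters, clear the page flags left over from its life as a
 * huge page, reset the compound metadata and free the high-order page.
 * Caller must hold hugetlb_lock.
 */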
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1<< PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

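/*
 * Build a huge pte for @page using the VMA's protection bits: writable
 * and dirty when @writable is set, write-protected otherwise, and
 * always marked young and huge.
 */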
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
		lazy_mmu_prot_update(entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

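/*
 * Clear the huge ptes in [start, end) and collect the backing pages on
 * a local list; their references are only dropped after the TLB has
 * been flushed.  The caller must hold the mapping's i_mmap_lock (see
 * unmap_hugepage_range() below), which keeps concurrent unmaps from
 * corrupting page->lru.
 */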
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

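/*
 * Break copy-on-write for a huge page.  If the old page has no other
 * users, simply make the pte writable; otherwise allocate a new huge
 * page, copy the contents and switch the pte over, re-validating the
 * pte because page_table_lock is dropped around the copy.
 */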
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

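/*
 * Fault in a huge page for a pte that is still pte_none: find the page
 * in the page cache or allocate and zero a new one (added to the cache
 * only for shared mappings), then install the pte under
 * page_table_lock, re-checking i_size against a racing truncate.
 * Private write faults break COW immediately rather than taking a
 * second fault.
 */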
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

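/*
 * Top-level huge page fault handler: allocate the huge pte if needed,
 * serialize instantiation behind hugetlb_instantiation_mutex, and hand
 * off to hugetlb_no_page() for missing pages or hugetlb_cow() for
 * write faults on read-only ptes.
 */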
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

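/*
 * get_user_pages() back end for hugetlb VMAs: walk the huge ptes over
 * the requested range, faulting pages in where necessary, and fill
 * pages[] and vmas[] with references to the individual PAGE_SIZE
 * subpages of each huge page.
 */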
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

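/*
 * Huge page reservations for a file are tracked as a list of
 * [from, to) file_region ranges hanging off the inode's
 * mapping->private_list.  region_chg() reports how many extra pages a
 * new range would reserve, region_add() commits the range, and
 * region_truncate() drops reservations past a given offset and returns
 * the number of pages released.  hugetlb_reserve_pages() and
 * hugetlb_unreserve_pages() below use these helpers together with
 * hugetlb_acct_memory() to keep resv_huge_pages in step with the
 * reserved ranges.
 */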
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher, then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle: allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (nrg == 0)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	/*
	 * When cpusets are configured, they break strict hugetlb page
	 * reservation because the accounting is done on a global variable:
	 * the reservation is never checked against page availability for
	 * the current cpuset, so an application can still be OOM'ed by the
	 * kernel for lack of free hugetlb pages in the cpuset the task runs
	 * in. Enforcing strict accounting per cpuset is nearly impossible
	 * (or too ugly) because cpusets are too fluid: tasks and memory
	 * nodes can be moved between them dynamically.
	 *
	 * This change of semantics for shared hugetlb mappings with cpusets
	 * is undesirable. However, to preserve some of the old semantics we
	 * fall back to checking against current free page availability as a
	 * best effort, hopefully minimizing the impact.
	 */
	if (chg > cpuset_mems_nr(free_huge_pages_node))
		return -ENOMEM;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}