/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#define pr_fmt(fmt)	"kexec: " fmt

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/sections.h>

#include <crypto/hash.h>
#include <crypto/sha.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;

/* vmcoreinfo stuff */
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);

/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;

/*
 * Declare these symbols weak so that if architecture provides a purgatory,
 * these will be overridden.
 */
char __weak kexec_purgatory[0];
size_t __weak kexec_purgatory_size = 0;

static int kexec_calculate_store_digests(struct kimage *image);

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
struct resource crashk_low_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

static int copy_user_segment_list(struct kimage *image,
				  unsigned long nr_segments,
				  struct kexec_segment __user *segments)
{
	int ret;
	size_t segment_bytes;

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	ret = copy_from_user(image->segment, segments, segment_bytes);
	if (ret)
		ret = -EFAULT;

	return ret;
}

static int sanity_check_segment_list(struct kimage *image)
{
	int result, i;
	unsigned long nr_segments = image->nr_segments;

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return result;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			return result;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;
			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				return result;
		}
	}

	/* Ensure our buffer sizes do not exceed our memory
	 * sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			return result;
	}

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of ram.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */

	if (image->type == KEXEC_TYPE_CRASH) {
		result = -EADDRNOTAVAIL;
		for (i = 0; i < nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			/* Ensure we are within the crash kernel limits */
			if ((mstart < crashk_res.start) ||
			    (mend > crashk_res.end))
				return result;
		}
	}

	return 0;
}
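/*
 * Illustrative example (editor's sketch, values are hypothetical): a
 * segment with .mem = 0x01000000, .memsz = 0x4000 and .bufsz = 0x3000
 * passes the checks above, while .mem = 0x01000800 would fail the page
 * alignment check, .bufsz = 0x5000 would fail the bufsz > memsz check,
 * and a second segment starting at 0x01002000 would fail the overlap
 * check against the first.
 */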

static struct kimage *do_kimage_alloc_init(void)
{
	struct kimage *image;

	/* Allocate a controlling structure */
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		return NULL;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unusable_pages);

	return image;
}

static void kimage_free_page_list(struct list_head *list);

static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
			     unsigned long nr_segments,
			     struct kexec_segment __user *segments,
			     unsigned long flags)
{
	int ret;
	struct kimage *image;
	bool kexec_on_panic = flags & KEXEC_ON_CRASH;

	if (kexec_on_panic) {
		/* Verify we have a valid entry point */
		if ((entry < crashk_res.start) || (entry > crashk_res.end))
			return -EADDRNOTAVAIL;
	}

	/* Allocate and initialize a controlling structure */
	image = do_kimage_alloc_init();
	if (!image)
		return -ENOMEM;

	image->start = entry;

	ret = copy_user_segment_list(image, nr_segments, segments);
	if (ret)
		goto out_free_image;

	ret = sanity_check_segment_list(image);
	if (ret)
		goto out_free_image;

	 /* Enable the special crash kernel control page allocation policy. */
	if (kexec_on_panic) {
		image->control_page = crashk_res.start;
		image->type = KEXEC_TYPE_CRASH;
	}

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	ret = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_PAGE_SIZE));
	if (!image->control_code_page) {
		pr_err("Could not allocate control_code_buffer\n");
		goto out_free_image;
	}

	if (!kexec_on_panic) {
		image->swap_page = kimage_alloc_control_pages(image, 0);
		if (!image->swap_page) {
			pr_err("Could not allocate swap buffer\n");
			goto out_free_control_pages;
		}
	}

	*rimage = image;
	return 0;
out_free_control_pages:
	kimage_free_page_list(&image->control_pages);
out_free_image:
	kfree(image);
	return ret;
}

static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
{
	struct fd f = fdget(fd);
	int ret;
	struct kstat stat;
	loff_t pos;
	ssize_t bytes = 0;

	if (!f.file)
		return -EBADF;

	ret = vfs_getattr(&f.file->f_path, &stat);
	if (ret)
		goto out;

	if (stat.size > INT_MAX) {
		ret = -EFBIG;
		goto out;
	}

	/* Don't hand 0 to vmalloc, it whines. */
	if (stat.size == 0) {
		ret = -EINVAL;
		goto out;
	}

	*buf = vmalloc(stat.size);
	if (!*buf) {
		ret = -ENOMEM;
		goto out;
	}

	pos = 0;
	while (pos < stat.size) {
		bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
				    stat.size - pos);
		if (bytes < 0) {
			vfree(*buf);
			ret = bytes;
			goto out;
		}

		if (bytes == 0)
			break;
		pos += bytes;
	}

	if (pos != stat.size) {
		ret = -EBADF;
		vfree(*buf);
		goto out;
	}

	*buf_len = pos;
out:
	fdput(f);
	return ret;
}

/* Architectures can provide this probe function */
int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
					 unsigned long buf_len)
{
	return -ENOEXEC;
}

void * __weak arch_kexec_kernel_image_load(struct kimage *image)
{
	return ERR_PTR(-ENOEXEC);
}

void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
{
}

/* Apply relocations of type RELA */
int __weak
arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
				 unsigned int relsec)
{
	pr_err("RELA relocation unsupported.\n");
	return -ENOEXEC;
}

/* Apply relocations of type REL */
int __weak
arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
			     unsigned int relsec)
{
	pr_err("REL relocation unsupported.\n");
	return -ENOEXEC;
}

/*
 * Free up memory used by the kernel, initrd, and command line. These are
 * temporary allocations that are no longer needed once the buffers have
 * been loaded into separate segments and copied elsewhere.
 */
static void kimage_file_post_load_cleanup(struct kimage *image)
{
	struct purgatory_info *pi = &image->purgatory_info;

	vfree(image->kernel_buf);
	image->kernel_buf = NULL;

	vfree(image->initrd_buf);
	image->initrd_buf = NULL;

	kfree(image->cmdline_buf);
	image->cmdline_buf = NULL;

	vfree(pi->purgatory_buf);
	pi->purgatory_buf = NULL;

	vfree(pi->sechdrs);
	pi->sechdrs = NULL;

	/* See if architecture has anything to cleanup post load */
	arch_kimage_file_post_load_cleanup(image);

	/*
	 * Above call should have called into bootloader to free up
	 * any data stored in kimage->image_loader_data. It should
	 * be ok now to free it up.
	 */
	kfree(image->image_loader_data);
	image->image_loader_data = NULL;
}

/*
 * In file mode the list of segments is prepared by the kernel. Copy the
 * relevant data from user space, do error checking, and prepare the
 * segment list.
 */
static int
kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
			     const char __user *cmdline_ptr,
			     unsigned long cmdline_len, unsigned flags)
{
	int ret = 0;
	void *ldata;

	ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
				&image->kernel_buf_len);
	if (ret)
		return ret;

	/* Call arch image probe handlers */
	ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
					    image->kernel_buf_len);

	if (ret)
		goto out;

	/* It is possible that no initramfs is being loaded */
	if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
		ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
					&image->initrd_buf_len);
		if (ret)
			goto out;
	}

	if (cmdline_len) {
		image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
		if (!image->cmdline_buf) {
			ret = -ENOMEM;
			goto out;
		}

		ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
				     cmdline_len);
		if (ret) {
			ret = -EFAULT;
			goto out;
		}

		image->cmdline_buf_len = cmdline_len;

		/* command line should be a string with last byte null */
		if (image->cmdline_buf[cmdline_len - 1] != '\0') {
			ret = -EINVAL;
			goto out;
		}
	}

	/* Call arch image load handlers */
	ldata = arch_kexec_kernel_image_load(image);

	if (IS_ERR(ldata)) {
		ret = PTR_ERR(ldata);
		goto out;
	}

	image->image_loader_data = ldata;
out:
	/* In case of error, free up all allocated memory in this function */
	if (ret)
		kimage_file_post_load_cleanup(image);
	return ret;
}

static int
kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
		       int initrd_fd, const char __user *cmdline_ptr,
		       unsigned long cmdline_len, unsigned long flags)
{
	int ret;
	struct kimage *image;

	image = do_kimage_alloc_init();
	if (!image)
		return -ENOMEM;

	image->file_mode = 1;

	ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
					   cmdline_ptr, cmdline_len, flags);
	if (ret)
		goto out_free_image;

	ret = sanity_check_segment_list(image);
	if (ret)
		goto out_free_post_load_bufs;

	ret = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_PAGE_SIZE));
	if (!image->control_code_page) {
		pr_err("Could not allocate control_code_buffer\n");
		goto out_free_post_load_bufs;
	}

	image->swap_page = kimage_alloc_control_pages(image, 0);
	if (!image->swap_page) {
		pr_err("Could not allocate swap buffer\n");
		goto out_free_control_pages;
	}

	*rimage = image;
	return 0;
out_free_control_pages:
	kimage_free_page_list(&image->control_pages);
out_free_post_load_bufs:
	kimage_file_post_load_cleanup(image);
out_free_image:
	kfree(image);
	return ret;
}

static int kimage_is_destination_range(struct kimage *image,
					unsigned long start,
					unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;
		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
							unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}
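/*
 * Worked example (editor's sketch, addresses are hypothetical): with
 * order = 1 the hole size is 0x2000, so a control_page of 0x01000000
 * gives hole_start = 0x01000000 and hole_end = 0x01001fff.  If a loaded
 * segment occupies 0x01000000-0x01003fff, the hole is advanced past it
 * to hole_start = 0x01004000 and the scan repeats, until a hole clears
 * every segment or hole_end runs past crashk_res.end.
 */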


struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unusable_pages);

}
static void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
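/*
 * Sketch of the list this macro walks (editor's illustration): starting
 * at image->head, each kimage_entry_t is a page-aligned physical
 * address with an IND_* flag in its low bits:
 *
 *   [page | IND_INDIRECTION]   continue walking inside that page
 *   [addr | IND_DESTINATION]   following IND_SOURCE pages are copied here
 *   [page | IND_SOURCE]        one source page; advances the destination
 *   [0    | IND_DONE]          end of the list
 *
 * which is why the walk chases IND_INDIRECTION entries with
 * phys_to_virt() and stops at IND_DONE.
 */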

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);

	/*
	 * Free up any temporary buffers allocated. This path is hit if an
	 * error occurred well after the buffers were allocated.
	 */
	if (image->file_mode)
		kimage_file_post_load_cleanup(image);

	kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
					unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.   If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unusable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it if its
			 * gfp_flags honor the ones passed in.
			 */
			if (!(gfp_mask & __GFP_HIGHMEM) &&
			    PageHighMem(old_page)) {
				kimage_free_pages(old_page);
				continue;
			}
			addr = old_addr;
			page = old_page;
			break;
		} else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}
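/*
 * Walk-through of the swap above (editor's note): if the freshly
 * allocated page happens to land at an address A that is some other
 * segment's destination, and a source page S has already been recorded
 * for A in the entry list, then S's contents are copied into the new
 * page (which now sits at its own destination A), the entry is
 * repointed at A, and S is handed back as the candidate for the
 * destination we were originally asked to fill.
 */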

static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		clear_page(ptr);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);
		if (mchunk > uchunk) {
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
				struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing ongoing DMAs, and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination and
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do that yourself.
 */
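/*
 * Hedged usage sketch (editor's addition, not kernel code): a minimal
 * userspace caller, following the kexec_load(2) uapi rather than
 * anything defined in this file.  Addresses and sizes are made up.
 *
 *	struct kexec_segment seg = {
 *		.buf   = kernel_buf,             // image bytes in user memory
 *		.bufsz = kernel_len,
 *		.mem   = (void *)0x01000000,     // page-aligned destination
 *		.memsz = (kernel_len + 4095) & ~4095UL,
 *	};
 *	syscall(__NR_kexec_load, entry_addr, 1, &seg, KEXEC_ARCH_DEFAULT);
 *
 * Real loaders (kexec-tools) build several segments and pass the entry
 * point of their boot stub/purgatory as entry_addr.
 */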
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
int kexec_load_disabled;

static DEFINE_MUTEX(kexec_mutex);

SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
		struct kexec_segment __user *, segments, unsigned long, flags)
{
	struct kimage **dest_image, *image;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
		return -EPERM;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	if (!mutex_trylock(&kexec_mutex))
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_alloc_init(&image, entry, nr_segments,
						   segments, flags);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_alloc_init(&image, entry, nr_segments,
						   segments, flags);
			crash_map_reserved_pages();
		}
		if (result)
			goto out;

		if (flags & KEXEC_PRESERVE_CONTEXT)
			image->preserve_context = 1;
		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		kimage_terminate(image);
		if (flags & KEXEC_ON_CRASH)
			crash_unmap_reserved_pages();
	}
	/* Install the new kernel, and uninstall the old */
	image = xchg(dest_image, image);

out:
	mutex_unlock(&kexec_mutex);
	kimage_free(image);

	return result;
}

/*
 * Add and remove page tables for crashkernel memory
 *
 * Provide an empty default implementation here -- architecture
 * code may override this
 */
void __weak crash_map_reserved_pages(void)
{}

void __weak crash_unmap_reserved_pages(void)
{}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
		       compat_ulong_t, nr_segments,
		       struct compat_kexec_segment __user *, segments,
		       compat_ulong_t, flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
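/*
 * Hedged usage sketch for the file-based syscall below (editor's
 * addition, not kernel code).  The caller passes file descriptors
 * instead of user buffers, and cmdline_len must count the trailing NUL:
 *
 *	int kfd = open("/boot/vmlinuz", O_RDONLY);
 *	int ifd = open("/boot/initrd.img", O_RDONLY);
 *	const char *cmdline = "root=/dev/sda1 console=ttyS0";
 *	syscall(__NR_kexec_file_load, kfd, ifd,
 *		strlen(cmdline) + 1, cmdline, 0);
 *
 * The paths and command line are examples only; a flags value of 0
 * loads a normal (non-crash) image.
 */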

SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
		unsigned long, cmdline_len, const char __user *, cmdline_ptr,
		unsigned long, flags)
{
	int ret = 0, i;
	struct kimage **dest_image, *image;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
		return -EPERM;

	/* Make sure we have a legal set of flags */
	if (flags != (flags & KEXEC_FILE_FLAGS))
		return -EINVAL;

	image = NULL;

	if (!mutex_trylock(&kexec_mutex))
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_FILE_ON_CRASH)
		dest_image = &kexec_crash_image;

	if (flags & KEXEC_FILE_UNLOAD)
		goto exchange;

	/*
	 * In case of crash, new kernel gets loaded in reserved region. It is
	 * same memory where old crash kernel might be loaded. Free any
	 * current crash dump kernel before we corrupt it.
	 */
	if (flags & KEXEC_FILE_ON_CRASH)
		kimage_free(xchg(&kexec_crash_image, NULL));

	ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
				     cmdline_len, flags);
	if (ret)
		goto out;

	ret = machine_kexec_prepare(image);
	if (ret)
		goto out;

	ret = kexec_calculate_store_digests(image);
	if (ret)
		goto out;

	for (i = 0; i < image->nr_segments; i++) {
		struct kexec_segment *ksegment;

		ksegment = &image->segment[i];
		pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
			 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
			 ksegment->memsz);

		ret = kimage_load_segment(image, &image->segment[i]);
		if (ret)
			goto out;
	}

	kimage_terminate(image);

	/*
	 * Free up any temporary buffers allocated which are not needed
	 * after image has been loaded
	 */
	kimage_file_post_load_cleanup(image);
exchange:
	image = xchg(dest_image, image);
out:
	mutex_unlock(&kexec_mutex);
	kimage_free(image);
	return ret;
}

void crash_kexec(struct pt_regs *regs)
{
	/* Take the kexec_mutex here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	if (mutex_trylock(&kexec_mutex)) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;

			crash_setup_regs(&fixed_regs, regs);