// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count.. */

static int max_threads;		/* tunable limit on nr_threads */

#define NAMED_ARRAY_INDEX(x)	[x] = __stringify(x)

static const char * const resident_page_types[] = {
	NAMED_ARRAY_INDEX(MM_FILEPAGES),
	NAMED_ARRAY_INDEX(MM_ANONPAGES),
	NAMED_ARRAY_INDEX(MM_SWAPENTS),
	NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
{
	return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

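/*
 * Total number of processes in the system: the sum of the per-CPU
 * process_counts.
 */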
int nr_processes(void)
{
	int cpu;
	int total = 0;

	for_each_possible_cpu(cpu)
		total += per_cpu(process_counts, cpu);

	return total;
}

void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;

static inline struct task_struct *alloc_task_struct_node(int node)
{
	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}

static inline void free_task_struct(struct task_struct *tsk)
{
	kmem_cache_free(task_struct_cachep, tsk);
}
#endif

#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

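/*
 * CPU hotplug teardown callback: free any thread stacks cached for this
 * CPU so the vmalloc space is released while the CPU is down.
 */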
static int free_vm_stack_cache(unsigned int cpu)
{
	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *vm_stack = cached_vm_stacks[i];

		if (!vm_stack)
			continue;

		vfree(vm_stack->addr);
		cached_vm_stacks[i] = NULL;
	}

	return 0;
}
#endif

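/*
 * Allocate a kernel stack for @tsk on @node.  With CONFIG_VMAP_STACK a
 * recently freed stack is reused from the per-CPU cache when possible,
 * otherwise a fresh vmalloc'ed stack is set up; without CONFIG_VMAP_STACK
 * the stack comes straight from the page allocator.
 */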
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
	void *stack;
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *s;

		s = this_cpu_xchg(cached_stacks[i], NULL);

		if (!s)
			continue;

		/* Clear the KASAN shadow of the stack. */
		kasan_unpoison_shadow(s->addr, THREAD_SIZE);

		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);

		tsk->stack_vm_area = s;
		tsk->stack = s->addr;
		return s->addr;
	}

	/*
	 * Allocated stacks are cached and later reused by new threads,
	 * so memcg accounting is performed manually on assigning/releasing
	 * stacks to tasks. Drop __GFP_ACCOUNT.
	 */
	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
				     VMALLOC_START, VMALLOC_END,
				     THREADINFO_GFP & ~__GFP_ACCOUNT,
				     PAGE_KERNEL,
				     0, node, __builtin_return_address(0));

	/*
	 * We can't call find_vm_area() in interrupt context, and
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */
	if (stack) {
		tsk->stack_vm_area = find_vm_area(stack);
		tsk->stack = stack;
	}
	return stack;
#else
	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
					     THREAD_SIZE_ORDER);

	if (likely(page)) {
		tsk->stack = page_address(page);
		return tsk->stack;
	}
	return NULL;
#endif
}

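/*
 * Release @tsk's kernel stack: with CONFIG_VMAP_STACK, uncharge the stack
 * pages from the memory cgroup and try to park the stack in the per-CPU
 * cache before falling back to vfree_atomic(); otherwise free the stack
 * pages directly.
 */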
static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);

	if (vm) {
		int i;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			mod_memcg_page_state(vm->pages[i],
					     MEMCG_KERNEL_STACK_KB,
					     -(int)(PAGE_SIZE / 1024));

			memcg_kmem_uncharge(vm->pages[i], 0);
		}

		for (i = 0; i < NR_CACHED_STACKS; i++) {
			if (this_cpu_cmpxchg(cached_stacks[i],
					NULL, tsk->stack_vm_area) != NULL)
				continue;

			return;
		}

		vfree_atomic(tsk->stack);
		return;
	}
#endif

	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
						  int node)
{
	unsigned long *stack;
	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
	tsk->stack = stack;
	return stack;
}

static void free_thread_stack(struct task_struct *tsk)
{
	kmem_cache_free(thread_stack_cache, tsk->stack);
}

void thread_stack_cache_init(void)
{
	thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
					THREAD_SIZE, THREAD_SIZE, 0, 0,
					THREAD_SIZE, NULL);
	BUG_ON(thread_stack_cache == NULL);
}
# endif
#endif

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

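/* Allocation, duplication and freeing of vm_area_struct objects. */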
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (vma)
		vma_init(vma, mm);
	return vma;
}

struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);

	if (new) {
		*new = *orig;
		INIT_LIST_HEAD(&new->anon_vma_chain);
	}
	return new;
}

void vm_area_free(struct vm_area_struct *vma)
{
	kmem_cache_free(vm_area_cachep, vma);
}

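/*
 * Update the NR_KERNEL_STACK_KB statistics for @tsk's stack by one stack's
 * worth of kilobytes (account is 1 or -1).  Non-vmapped stacks also have
 * their memcg state adjusted here; vmapped stacks are charged separately
 * in memcg_charge_kernel_stack().
 */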
static void account_kernel_stack(struct task_struct *tsk, int account)
{
	void *stack = task_stack_page(tsk);
	struct vm_struct *vm = task_stack_vm_area(tsk);

	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);

	if (vm) {
		int i;

		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			mod_zone_page_state(page_zone(vm->pages[i]),
					    NR_KERNEL_STACK_KB,
					    PAGE_SIZE / 1024 * account);
		}
	} else {
		/*
		 * All stack pages are in the same zone and belong to the
		 * same memcg.
		 */
		struct page *first_page = virt_to_page(stack);

		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
				    THREAD_SIZE / 1024 * account);

		mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
				    account * (THREAD_SIZE / 1024));
	}
}

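/*
 * Charge the pages backing a vmapped kernel stack to the memory cgroup and
 * account them as MEMCG_KERNEL_STACK_KB.  Returns 0 on success or the
 * error from memcg_kmem_charge().
 */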
static int memcg_charge_kernel_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);
	int ret;

	if (vm) {
		int i;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			/*
			 * If memcg_kmem_charge() fails, page->mem_cgroup
			 * pointer is NULL, and both memcg_kmem_uncharge()
			 * and mod_memcg_page_state() in free_thread_stack()
			 * will ignore this page. So it's safe.
			 */
			ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
			if (ret)
				return ret;

			mod_memcg_page_state(vm->pages[i],
					     MEMCG_KERNEL_STACK_KB,
					     PAGE_SIZE / 1024);
		}
	}
#endif
	return 0;
}

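/*
 * Free the stack of a task that has finished running (TASK_DEAD):
 * drop the accounting and release the stack itself.
 */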
static void release_task_stack(struct task_struct *tsk)
{
	if (WARN_ON(tsk->state != TASK_DEAD))
		return;  /* Better to leak the stack than to free prematurely */

	account_kernel_stack(tsk, -1);
	free_thread_stack(tsk);
	tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = NULL;
#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
	if (refcount_dec_and_test(&tsk->stack_refcount))
		release_task_stack(tsk);
}
#endif

void free_task(struct task_struct *tsk)
{
#ifndef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * The task is finally done with both the stack and thread_info,
	 * so free both.
	 */
	release_task_stack(tsk);
#else
	/*
	 * If the task had a separate stack allocation, it should be gone
	 * by now.
	 */
	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
	rt_mutex_debug_task_free(tsk);
	ftrace_graph_exit_task(tsk);
	put_seccomp_filter(tsk);
	arch_release_task_struct(tsk);
	if (tsk->flags & PF_KTHREAD)
		free_kthread_struct(tsk);
	free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

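/*
 * Duplicate the parent's address space: copy each VMA from @oldmm into
 * @mm and copy the corresponding page table entries, skipping VM_DONTCOPY
 * mappings and giving VM_WIPEONFORK mappings a clean slate in the child.
 */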
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
					struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	LIST_HEAD(uf);

	uprobe_start_dup_mmap();
	if (down_write_killable(&oldmm->mmap_sem)) {
		retval = -EINTR;
		goto fail_uprobe_end;
	}
	flush_cache_dup_mm(oldmm);
	uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

	/* No ordering required: file already has been exposed. */
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

	mm->total_vm = oldmm->total_vm;
	mm->data_vm = oldmm->data_vm;
	mm->exec_vm = oldmm->exec_vm;
	mm->stack_vm = oldmm->stack_vm;

	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
			continue;
		}
		charge = 0;
		/*
		 * Don't duplicate many vmas if we've been oom-killed (for
		 * example)
		 */
		if (fatal_signal_pending(current)) {
			retval = -EINTR;
			goto out;
		}
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned long len = vma_pages(mpnt);

			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
				goto fail_nomem;
			charge = len;
		}
		tmp = vm_area_dup(mpnt);
		if (!tmp)
			goto fail_nomem;
		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
			goto fail_nomem_policy;
		tmp->vm_mm = mm;
		retval = dup_userfaultfd(tmp, &uf);
		if (retval)
			goto fail_nomem_anon_vma_fork;
		if (tmp->vm_flags & VM_WIPEONFORK) {
			/* VM_WIPEONFORK gets a clean slate in the child. */
			tmp->anon_vma = NULL;
			if (anon_vma_prepare(tmp))
				goto fail_nomem_anon_vma_fork;
		} else if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
		tmp->vm_next = tmp->vm_prev = NULL;
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file_inode(file);
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
			i_mmap_lock_write(mapping);
			if (tmp->vm_flags & VM_SHARED)
				atomic_inc(&mapping->i_mmap_writable);
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_interval_tree_insert_after(tmp, mpnt,
					&mapping->i_mmap);
			flush_dcache_mmap_unlock(mapping);
			i_mmap_unlock_write(mapping);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		if (!(tmp->vm_flags & VM_WIPEONFORK))
			retval = copy_page_range(mm, oldmm, mpnt);

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	retval = arch_dup_mmap(oldmm, mm);
out:
	up_write(&mm->mmap_sem);
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);
	dup_userfaultfd_complete(&uf);
fail_uprobe_end:
	uprobe_end_dup_mmap();
	return retval;
fail_nomem_anon_vma_fork:
	mpol_put(vma_policy(tmp));
fail_nomem_policy:
	vm_area_free(tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

static inline void mm_free_pgd(struct mm_struct *mm)
{
	pgd_free(mm, mm->pgd);
}
#else
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	down_write(&oldmm->mmap_sem);
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
	up_write(&oldmm->mmap_sem);
	return 0;
}
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

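/*
 * Sanity checks performed when an mm is freed: all RSS counters and the
 * page table byte count should have dropped to zero by now.
 */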
static void check_mm(struct mm_struct *mm)
{
	int i;

	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
			 "Please make sure 'struct resident_page_types[]' is updated as well");

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = atomic_long_read(&mm->rss_stat.count[i]);

		if (unlikely(x))
			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
				 mm, resident_page_types[i], x);
	}

	if (mm_pgtables_bytes(mm))
		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
				mm_pgtables_bytes(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
void __mmdrop(struct mm_struct *mm)
{
	BUG_ON(mm == &init_mm);
	WARN_ON_ONCE(mm == current->mm);
	WARN_ON_ONCE(mm == current->active_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
	mmu_notifier_subscriptions_destroy(mm);
	check_mm(mm);
	put_user_ns(mm->user_ns);
	free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

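/*
 * mmdrop_async() drops an mm_count reference but defers the actual
 * __mmdrop() to a workqueue (mmdrop_async_fn), for callers that cannot
 * run __mmdrop() in their current context.
 */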
static void mmdrop_async_fn(struct work_struct *work)
{
	struct mm_struct *mm;

	mm = container_of(work, struct mm_struct, async_put_work);
	__mmdrop(mm);
}

static void mmdrop_async(struct mm_struct *mm)
{
	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
		schedule_work(&mm->async_put_work);
	}
}

static inline void free_signal_struct(struct signal_struct *sig)
{
	taskstats_tgid_free(sig);
	sched_autogroup_exit(sig);
	/*
	 * __mmdrop is not safe to call from softirq context on x86 due to
	 * pgd_dtor so postpone it to the async context
	 */
	if (sig->oom_mm)
		mmdrop_async(sig->oom_mm);
	kmem_cache_free(signal_cachep, sig);
}

static inline void put_signal_struct(struct signal_struct *sig)
{
	if (refcount_dec_and_test(&sig->sigcnt))
		free_signal_struct(sig);
}

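/*
 * Final cleanup of a task_struct once its last reference is gone:
 * release cgroup, NUMA, security, credential, delay-accounting and
 * signal state, then free the task itself.
 */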
void __put_task_struct(struct task_struct *tsk)
{
	WARN_ON(!tsk->exit_state);
	WARN_ON(refcount_read(&tsk->usage));
	WARN_ON(tsk == current);

	cgroup_free(tsk);
	task_numa_free(tsk, true);
	security_task_free(tsk);
	exit_creds(tsk);
	delayacct_tsk_free(tsk);
	put_signal_struct(tsk->signal);

	if (!profile_handoff_task(tsk))
		free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads
 */
static void set_max_threads(unsigned int max_threads_suggested)
{
	u64 threads;
	unsigned long nr_pages = totalram_pages();

	/*
	 * The number of threads shall be limited such that the thread
	 * structures may only consume a small part of the available memory.
	 */
	if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
		threads = MAX_THREADS;
	else
		threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
				    (u64) THREAD_SIZE * 8UL);

	if (threads > max_threads_suggested)
		threads = max_threads_suggested;

	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
	/* Fetch thread_struct whitelist for the architecture. */
	arch_thread_struct_whitelist(offset, size);

	/*
	 * Handle zero-sized whitelist or empty thread_struct, otherwise
	 * adjust offset to position of thread_struct in task_struct.
	 */
	if (unlikely(*size == 0))
		*offset = 0;
	else
		*offset += offsetof(struct task_struct, thread);
}
#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */

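/*
 * Boot-time initialization for fork: create the task_struct slab cache,
 * set the global thread limits and the corresponding RLIMIT_NPROC and
 * RLIMIT_SIGPENDING defaults, and register the vmap stack cache hotplug
 * callback when CONFIG_VMAP_STACK is enabled.
 */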
void __init fork_init(void)
{
	int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	0
#endif
	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
	unsigned long useroffset, usersize;

	/* create a slab on which task_structs can be allocated */
	task_struct_whitelist(&useroffset, &usersize);
	task_struct_cachep = kmem_cache_create_usercopy("task_struct",
			arch_task_struct_size, align,
			SLAB_PANIC|SLAB_ACCOUNT,
			useroffset, usersize, NULL);
#endif

	/* do the arch specific task caches init */
	arch_task_cache_init();

	set_max_threads(MAX_THREADS);

	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];

	for (i = 0; i < UCOUNT_COUNTS; i++) {
		init_user_ns.ucount_max[i] = max_threads/2;
	}

#ifdef CONFIG_VMAP_STACK
	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
			  NULL, free_vm_stack_cache);
#endif

	lockdep_init_task(&init_task);
	uprobes_init();
}

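/*
 * Default (weak) implementation: a plain structure copy.  Architectures
 * can override this to copy extra per-task state.
 */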
int __weak arch_dup_task_struct(struct task_struct *dst,
					       struct task_struct *src)
{
	*dst = *src;
	return 0;
}

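/*
 * Write STACK_END_MAGIC at the end of the task's stack so stack
 * overflows can be detected.
 */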
void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *stackend;

	stackend = end_of_stack(tsk);
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}

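/*
 * Allocate a new task_struct and kernel stack for the child and copy the
 * parent's task_struct into it, setting up stack accounting, reference
 * counts and the stack end magic.
 */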
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
	struct task_struct *tsk;
	unsigned long *stack;
	struct vm_struct *stack_vm_area __maybe_unused;
	int err;

	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
	tsk = alloc_task_struct_node(node);
	if (!tsk)
		return NULL;

	stack = alloc_thread_stack_node(tsk, node);
	if (!stack)
		goto free_tsk;

	if (memcg_charge_kernel_stack(tsk))
		goto free_stack;

	stack_vm_area = task_stack_vm_area(tsk);

	err = arch_dup_task_struct(tsk, orig);

	/*
	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
	 * sure they're properly initialized before using any stack-related
	 * functions again.
	 */
	tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
	refcount_set(&tsk->stack_refcount, 1);
#endif

	if (err)
		goto free_stack;

#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif

	setup_thread_stack(tsk, orig);
	clear_user_return_notifier(tsk);
	clear_tsk_need_resched(tsk);
	set_task_stack_end_magic(tsk);

#ifdef CONFIG_STACKPROTECTOR
	tsk->stack_canary = get_random_canary();
#endif
	if (orig->cpus_ptr == &orig->cpus_mask)
		tsk->cpus_ptr = &tsk->cpus_mask;

	/*
	 * One for the user space visible state that goes away when reaped.
	 * One for the scheduler.
	 */
	refcount_set(&tsk->rcu_users, 2);
	/* One for the rcu users */
	refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	tsk->splice_pipe = NULL;
	tsk->task_frag.page = NULL;
	tsk->wake_q.next = NULL;

	account_kernel_stack(tsk, 1);

	kcov_task_init(tsk);

#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
	tsk->throttle_queue = NULL;
	tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_MEMCG
	tsk->active_memcg = NULL;
#endif
	return tsk;

free_stack:
	free_thread_stack(tsk);
free_tsk:
	free_task_struct(tsk);
	return NULL;
}

__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

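/*
 * Default coredump filter bits for new mms; can be overridden at boot
 * with the "coredump_filter=" parameter.
 */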
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

static int __init coredump_filter_setup(char *s)
{
	default_dump_filter =
		(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
		MMF_DUMP_FILTER_MASK;
	return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
	spin_lock_init(&mm->ioctx_lock);
	mm->ioctx_table = NULL;
#endif
}

static __always_inline void mm_clear_owner(struct mm_struct *mm,
					   struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	if (mm->owner == p)
		WRITE_ONCE(mm->owner, NULL);
#endif
}

static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	mm->owner = p;
#endif
}

static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
	mm->uprobes_state.xol_area = NULL;
#endif
}

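/*
 * Initialize a freshly allocated mm_struct: reference counts, locks,
 * lists, RSS counters and the various helper state (AIO, owner, exe_file,
 * MMU notifiers, TLB flush tracking).
 */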
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	struct user_namespace *user_ns)
{
	mm->mmap = NULL;
	mm->mm_rb = RB_ROOT;
	mm->vmacache_seqnum = 0;
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
	INIT_LIST_HEAD(&mm->mmlist);
	mm->core_state = NULL;
	mm_pgtables_bytes_init(mm);
	mm->map_count = 0;
	mm->locked_vm = 0;
	atomic64_set(&mm->pinned_vm, 0);
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
	spin_lock_init(&mm->page_table_lock);
	spin_lock_init(&mm->arg_lock);
	mm_init_cpumask(mm);
	mm_init_aio(mm);
	mm_init_owner(mm, p);
	RCU_INIT_POINTER(mm->exe_file, NULL);
	mmu_notifier_subscriptions_init(mm);
	init_tlb_flush_pending(mm);