/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et.al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
unsigned int sysctl_mount_max __read_mostly = 100000;

static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);
static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}
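
/*
 * Illustrative sketch (not part of the original file): both helpers
 * above fold a pointer (or pointer pair) into a bucket index.  Dividing
 * by L1_CACHE_BYTES discards the low bits, which carry almost no
 * entropy for cache-line-aligned objects; adding (tmp >> shift) mixes
 * the high bits back in before masking.  A stand-alone model with a
 * hypothetical 256-bucket table (shift 8, mask 0xff):
 *
 *	static unsigned int toy_hash(const void *a, const void *b)
 *	{
 *		unsigned long tmp = (unsigned long)a / L1_CACHE_BYTES;
 *
 *		tmp += (unsigned long)b / L1_CACHE_BYTES;
 *		tmp = tmp + (tmp >> 8);
 *		return tmp & 0xff;
 *	}
 */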

static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}

static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

static void drop_mountpoint(struct fs_pin *p)
{
	struct mount *m = container_of(p, struct mount, mnt_umount);
	dput(m->mnt_ex_mountpoint);
	pin_remove(p);
	mntput(&m->mnt);
}

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
		INIT_LIST_HEAD(&mnt->mnt_umounting);
		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree_const(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This cannot and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (sb_rdonly(mnt->mnt_sb))
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
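
/*
 * Illustrative sketch (not part of the original file): __mnt_is_readonly()
 * answers "would a write be refused right now?" and can go stale the
 * moment it returns.  A hypothetical caller that only wants a snapshot,
 * e.g. for reporting, could do:
 *
 *	static const char *mnt_mode_string(struct vfsmount *mnt)
 *	{
 *		return __mnt_is_readonly(mnt) ? "ro" : "rw";
 *	}
 *
 * Anything that actually writes must instead pin the state with
 * mnt_want_write()/mnt_drop_write() below.
 */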

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, __mnt_drop_write() must
 * be called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
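
/*
 * Illustrative sketch (not part of the original file): the canonical
 * caller brackets its modification with the want/drop pair.  "example_op"
 * is a hypothetical helper, not a VFS function:
 *
 *	static int example_modify(struct path *path)
 *	{
 *		int err = mnt_want_write(path->mnt);
 *
 *		if (err)
 *			return err;
 *		err = example_op(path->dentry);
 *		mnt_drop_write(path->mnt);
 *		return err;
 *	}
 *
 * sb_start_write() supplies the freeze protection; __mnt_want_write()
 * supplies the read-only check plus the per-CPU writer count.
 */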

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * When finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount will be written to
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file_path - get write access to a file's mount
 * @file: the file whose mount will be written to
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 *
 * Called by the vfs for cases when we have an open file at hand, but will do an
 * inode operation on it (important distinction for files opened on overlayfs,
 * since the file operations will come from the real underlying file, while
 * inode operations come from the overlay).
 */
int mnt_want_write_file_path(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}

static inline int may_write_real(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct dentry *upperdentry;

	/* Writable file? */
	if (file->f_mode & FMODE_WRITER)
		return 0;

	/* Not overlayfs? */
	if (likely(!(dentry->d_flags & DCACHE_OP_REAL)))
		return 0;

	/* File refers to upper, writable layer? */
	upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
	if (upperdentry && file_inode(file) == d_inode(upperdentry))
		return 0;

	/* Lower layer: can't write to real file, sorry... */
	return -EPERM;
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount will be written to
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 *
 * Mostly called by filesystems from their ioctl operation before performing
 * modification.  On overlayfs this needs to check if the file is on a read-only
 * lower layer and deny access in that case.
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	ret = may_write_real(file);
	if (!ret) {
		sb_start_write(file_inode(file)->i_sb);
		ret = __mnt_want_write_file(file);
		if (ret)
			sb_end_write(file_inode(file)->i_sb);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
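
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * ioctl handler is the typical mnt_want_write_file() caller.  The
 * "example" names are hypothetical:
 *
 *	static long example_ioctl_setflags(struct file *file, unsigned int arg)
 *	{
 *		int err = mnt_want_write_file(file);
 *
 *		if (err)
 *			return err;
 *		err = example_apply_flags(file_inode(file), arg);
 *		mnt_drop_write_file(file);
 *		return err;
 *	}
 *
 * Note that freeze protection is taken on file_inode(file)->i_sb, so on
 * overlayfs it covers the superblock the inode actually lives on.
 */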

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file_path(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
	sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}
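
/*
 * Illustrative sketch (not part of the original file): the handshake
 * between __mnt_want_write() (fast path) and mnt_make_readonly() (slow
 * path) above, with the pairing barriers lined up:
 *
 *	fast path			slow path
 *	---------			---------
 *	mnt_inc_writers()		set MNT_WRITE_HOLD
 *	smp_mb()	   <pairs with>	smp_mb()
 *	spin while WRITE_HOLD		sum per-CPU writer counts
 *	smp_rmb()	   <pairs with>	smp_wmb()
 *	test MNT_READONLY		set MNT_READONLY if count was 0
 *					clear MNT_WRITE_HOLD
 *
 * Either the slow path observes the incremented count, or the fast path
 * observes MNT_WRITE_HOLD (and later MNT_READONLY); they cannot both
 * miss each other.
 */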

static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}

static void free_vfsmnt(struct mount *mnt)
{
	kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/* call under rcu_read_lock */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return 1;
	if (bastard == NULL)
		return 0;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	if (likely(!read_seqretry(&mount_lock, seq)))
		return 0;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return 1;
	}
	return -1;
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	int res = __legitimize_mnt(bastard, seq);
	if (likely(!res))
		return true;
	if (unlikely(res < 0)) {
		rcu_read_unlock();
		mntput(bastard);
		rcu_read_lock();
	}
	return false;
}
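
/*
 * Illustrative note (not part of the original file): __legitimize_mnt()
 * is tri-state.  0 means the reference was taken and the seqcount is
 * still valid; 1 means "retry, nothing to undo"; -1 means a count bump
 * landed on a mount already committed to dying, and legitimize_mnt()
 * must drop it with mntput() outside the RCU read section
 * (MNT_SYNC_UMOUNT is the case where the bump can be undone locally
 * instead).  lookup_mnt() below shows the intended retry loop.
 */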

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}
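
/*
 * Illustrative sketch (not part of the original file): a caller chasing
 * a whole mount stack could iterate lookup_mnt(), moving to each
 * mount's root.  "example_topmost" is hypothetical:
 *
 *	static struct vfsmount *example_topmost(const struct path *path)
 *	{
 *		struct path p = { .mnt = mntget(path->mnt),
 *				  .dentry = path->dentry };
 *		struct vfsmount *child;
 *
 *		while ((child = lookup_mnt(&p)) != NULL) {
 *			mntput(p.mnt);
 *			p.mnt = child;
 *			p.dentry = child->mnt_root;
 *		}
 *		return p.mnt;		(caller must mntput() the result)
 *	}
 */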

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in this context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}
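
/*
 * Illustrative note (not part of the original file): d_mountpoint()
 * answers "is something mounted here in *any* mount namespace?", which
 * is why a positive answer still has to be confirmed against the
 * current namespace's mount list above.  A dentry serving as a
 * mountpoint only in some other namespace is, locally, not covered.
 */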

static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}

static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
	struct mountpoint *mp, *new = NULL;
	int ret;

	if (d_mountpoint(dentry)) {
mountpoint:
		read_seqlock_excl(&mount_lock);
		mp = lookup_mountpoint(dentry);
		read_sequnlock_excl(&mount_lock);
		if (mp)
			goto done;
	}

	if (!new)
		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);


	/* Exactly one process may set d_mounted */
	ret = d_set_mounted(dentry);

	/* Someone else set d_mounted? */
	if (ret == -EBUSY)
		goto mountpoint;

	/* The dentry is not available as a mountpoint? */
	mp = ERR_PTR(ret);
	if (ret)
		goto done;

	/* Add the new mountpoint to the hash table */
	read_seqlock_excl(&mount_lock);
	new->m_dentry = dentry;
	new->m_count = 1;
	hlist_add_head(&new->m_hash, mp_hash(dentry));
	INIT_HLIST_HEAD(&new->m_list);
	read_sequnlock_excl(&mount_lock);

	mp = new;
	new = NULL;
done:
	kfree(new);
	return mp;
}

static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void unhash_mnt(struct mount *mnt)
{
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	unhash_mnt(mnt);
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
	/* old mountpoint will be dropped when we can do that */
	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
	unhash_mnt(mnt);
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
	hlist_add_head_rcu(&mnt->mnt_hash,
			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	__attach_mnt(mnt, parent);
}

void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
	struct mountpoint *old_mp = mnt->mnt_mp;
	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
	struct mount *old_parent = mnt->mnt_parent;

	list_del_init(&mnt->mnt_child);
	hlist_del_init(&mnt->mnt_mp_list);
	hlist_del_init_rcu(&mnt->mnt_hash);

	attach_mnt(mnt, parent, mp);

	put_mountpoint(old_mp);

	/*
	 * Safely avoid even the suggestion this code might sleep or
	 * lock the mount hash by taking advantage of the knowledge that
	 * mnt_change_mountpoint will not release the final reference
	 * to a mountpoint.
	 *
	 * During mounting, the mount passed in as the parent mount will
	 * continue to use the old mountpoint and during unmounting, the
	 * old mountpoint will continue to exist until namespace_unlock,
	 * which happens well after mnt_change_mountpoint.
	 */
	spin_lock(&old_mountpoint->d_lock);
	old_mountpoint->d_lockref.count--;
	spin_unlock(&old_mountpoint->d_lock);

	mnt_add_count(old_parent, -1);
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	n->mounts += n->pending_mounts;
	n->pending_mounts = 0;

	__attach_mnt(mnt, parent);
	touch_mnt_namespace(n);
}

static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}
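
/*
 * Illustrative sketch (not part of the original file): next_mnt() is a
 * pre-order, depth-first walk of the mount tree, so visiting every
 * mount under (and including) "root" -- with namespace_sem or the
 * mount lock held -- is simply:
 *
 *	struct mount *p;
 *
 *	for (p = root; p; p = next_mnt(p, root))
 *		example_visit(p);	(hypothetical callback)
 *
 * skip_mnt_tree() returns the last mount such a walk would visit inside
 * p's subtree, letting callers prune the whole subtree by resuming the
 * walk from that point.
 */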

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & SB_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
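
/*
 * Illustrative sketch (not part of the original file): in-kernel users
 * normally reach vfs_kern_mount() via kern_mount()-style wrappers.  A
 * hypothetical pseudo filesystem pinning an internal mount at init:
 *
 *	static struct vfsmount *example_mnt;
 *
 *	static int __init example_init(void)
 *	{
 *		example_mnt = vfs_kern_mount(&example_fs_type, SB_KERNMOUNT,
 *					     example_fs_type.name, NULL);
 *		return PTR_ERR_OR_ZERO(example_mnt);
 *	}
 */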

struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
	     const char *name, void *data)
{
	/* Until it is worked out how to pass the user namespace
	 * through from the parent mount to the submount don't support
	 * unprivileged mounts with submounts.
	 */
	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
		return ERR_PTR(-EPERM);

1064
	return vfs_kern_mount(type, SB_SUBMOUNT, n