// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
19
#include <linux/export.h>
20
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
21 22 23 24
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
Robert Love's avatar
Robert Love committed
25
#include <linux/fsnotify.h>
Linus Torvalds's avatar
Linus Torvalds committed
26 27
#include <linux/personality.h>
#include <linux/security.h>
Mimi Zohar's avatar
Mimi Zohar committed
28
#include <linux/ima.h>
Linus Torvalds's avatar
Linus Torvalds committed
29 30 31
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
32
#include <linux/capability.h>
33
#include <linux/file.h>
34
#include <linux/fcntl.h>
35
#include <linux/device_cgroup.h>
36
#include <linux/fs_struct.h>
37
#include <linux/posix_acl.h>
38
#include <linux/hash.h>
39
#include <linux/bitops.h>
40
#include <linux/init_task.h>
41
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
42

43
#include "internal.h"
44
#include "mount.h"
45

Linus Torvalds's avatar
Linus Torvalds committed
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
124

Al Viro's avatar
Al Viro committed
125
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
126

127
struct filename *
128 129
getname_flags(const char __user *filename, int flags, int *empty)
{
Al Viro's avatar
Al Viro committed
130
	struct filename *result;
131
	char *kname;
Al Viro's avatar
Al Viro committed
132
	int len;
133

134 135 136 137
	result = audit_reusename(filename);
	if (result)
		return result;

138
	result = __getname();
139
	if (unlikely(!result))
140 141
		return ERR_PTR(-ENOMEM);

142 143 144 145
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
Al Viro's avatar
Al Viro committed
146
	kname = (char *)result->iname;
147
	result->name = kname;
148

Al Viro's avatar
Al Viro committed
149
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
150
	if (unlikely(len < 0)) {
Al Viro's avatar
Al Viro committed
151 152
		__putname(result);
		return ERR_PTR(len);
153
	}
154

155 156 157 158 159 160
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
Al Viro's avatar
Al Viro committed
161
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
Al Viro's avatar
Al Viro committed
162
		const size_t size = offsetof(struct filename, iname[1]);
163 164
		kname = (char *)result;

Al Viro's avatar
Al Viro committed
165 166 167 168 169 170
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
Al Viro's avatar
Al Viro committed
171 172 173
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
174 175
		}
		result->name = kname;
Al Viro's avatar
Al Viro committed
176 177 178 179 180 181 182 183 184 185 186
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
187 188
	}

Al Viro's avatar
Al Viro committed
189
	result->refcnt = 1;
190 191 192
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
193
			*empty = 1;
Al Viro's avatar
Al Viro committed
194 195 196 197
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
Linus Torvalds's avatar
Linus Torvalds committed
198
	}
199

200
	result->uptr = filename;
201
	result->aname = NULL;
202 203
	audit_getname(result);
	return result;
Linus Torvalds's avatar
Linus Torvalds committed
204 205
}

206 207
/**
 * getname - copy a pathname from userspace
 * @filename: userspace pointer to the pathname
 *
 * Shorthand for getname_flags() with no flags and no "empty" report,
 * so an empty pathname yields ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user * filename)
{
	return getname_flags(filename, 0, NULL);
}

212 213 214 215
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
216
	int len = strlen(filename) + 1;
217 218 219 220 221

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

222
	if (len <= EMBEDDED_NAME_MAX) {
Al Viro's avatar
Al Viro committed
223
		result->name = (char *)result->iname;
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
	} else if (len <= PATH_MAX) {
		struct filename *tmp;

		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
239 240
	result->uptr = NULL;
	result->aname = NULL;
241
	result->refcnt = 1;
242
	audit_getname(result);
243 244 245 246

	return result;
}

247
void putname(struct filename *name)
Linus Torvalds's avatar
Linus Torvalds committed
248
{
249 250 251 252 253
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

Al Viro's avatar
Al Viro committed
254
	if (name->name != name->iname) {
255 256 257 258
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
Linus Torvalds's avatar
Linus Torvalds committed
259 260
}

261 262
static int check_acl(struct inode *inode, int mask)
{
263
#ifdef CONFIG_FS_POSIX_ACL
264 265 266
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
267 268
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
269
	                return -EAGAIN;
270
		/* no ->get_acl() calls in RCU mode... */
271
		if (is_uncached_acl(acl))
272
			return -ECHILD;
273
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
274 275
	}

Christoph Hellwig's avatar
Christoph Hellwig committed
276 277 278
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
279 280 281 282 283
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
284
#endif
285 286 287 288

	return -EAGAIN;
}

289
/*
290
 * This does the basic permission checking
Linus Torvalds's avatar
Linus Torvalds committed
291
 */
292
static int acl_permission_check(struct inode *inode, int mask)
Linus Torvalds's avatar
Linus Torvalds committed
293
{
294
	unsigned int mode = inode->i_mode;
Linus Torvalds's avatar
Linus Torvalds committed
295

296
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
Linus Torvalds's avatar
Linus Torvalds committed
297 298
		mode >>= 6;
	else {
299
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
300
			int error = check_acl(inode, mask);
301 302
			if (error != -EAGAIN)
				return error;
Linus Torvalds's avatar
Linus Torvalds committed
303 304 305 306 307 308 309 310 311
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
312
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
Linus Torvalds's avatar
Linus Torvalds committed
313
		return 0;
314 315 316 317
	return -EACCES;
}

/**
318
 * generic_permission -  check for access rights on a Posix-like filesystem
319
 * @inode:	inode to check access rights for
320
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
321 322 323 324
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
325 326 327 328 329
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
330
 */
331
int generic_permission(struct inode *inode, int mask)
332 333 334 335
{
	int ret;

	/*
336
	 * Do the basic permission checks.
337
	 */
338
	ret = acl_permission_check(inode, mask);
339 340
	if (ret != -EACCES)
		return ret;
Linus Torvalds's avatar
Linus Torvalds committed
341

342 343 344
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
345 346
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
347
				return 0;
348
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
Linus Torvalds's avatar
Linus Torvalds committed
349
			return 0;
350 351
		return -EACCES;
	}
Linus Torvalds's avatar
Linus Torvalds committed
352 353 354 355

	/*
	 * Searching includes executable on directories, else just read.
	 */
356
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
357
	if (mask == MAY_READ)
358
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
Linus Torvalds's avatar
Linus Torvalds committed
359
			return 0;
360 361 362 363 364 365 366 367
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
			return 0;
Linus Torvalds's avatar
Linus Torvalds committed
368 369 370

	return -EACCES;
}
371
EXPORT_SYMBOL(generic_permission);
Linus Torvalds's avatar
Linus Torvalds committed
372

373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	/* Fast case: already known to have no ->permission() method. */
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* No method: remember that; set once for the inode lifetime. */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

David Howells's avatar
David Howells committed
393 394 395
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
396
 * @inode: Inode to check permission on
David Howells's avatar
David Howells committed
397 398 399 400 401 402 403 404 405 406
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
407
		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
David Howells's avatar
David Howells committed
408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;
431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (HAS_UNMAPPED_ID(inode))
			return -EACCES;
	}

	retval = do_inode_permission(inode, mask);
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
David Howells's avatar
David Howells committed
457
}
458
EXPORT_SYMBOL(inode_permission);
David Howells's avatar
David Howells committed
459

Jan Blunck's avatar
Jan Blunck committed
460 461 462 463 464 465
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
466
void path_get(const struct path *path)
Jan Blunck's avatar
Jan Blunck committed
467 468 469 470 471 472
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

Jan Blunck's avatar
Jan Blunck committed
473 474 475 476 477 478
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
479
void path_put(const struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
480
{
Jan Blunck's avatar
Jan Blunck committed
481 482
	dput(path->dentry);
	mntput(path->mnt);
Linus Torvalds's avatar
Linus Torvalds committed
483
}
Jan Blunck's avatar
Jan Blunck committed
484
EXPORT_SYMBOL(path_put);
Linus Torvalds's avatar
Linus Torvalds committed
485

486
#define EMBEDDED_LEVELS 2
487 488
struct nameidata {
	struct path	path;
Al Viro's avatar
Al Viro committed
489
	struct qstr	last;
490 491 492
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;
493
	unsigned	seq, m_seq;
494 495
	int		last_type;
	unsigned	depth;
496
	int		total_link_count;
497 498
	struct saved {
		struct path link;
499
		struct delayed_call done;
500
		const char *name;
501
		unsigned seq;
502
	} *stack, internal[EMBEDDED_LEVELS];
503 504
	struct filename	*name;
	struct nameidata *saved;
505
	struct inode	*link_inode;
506 507
	unsigned	root_seq;
	int		dfd;
508
} __randomize_layout;
509

510
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
511
{
512 513
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
514 515
	p->dfd = dfd;
	p->name = name;
516
	p->total_link_count = old ? old->total_link_count : 0;
517
	p->saved = old;
518
	current->nameidata = p;
519 520
}

521
static void restore_nameidata(void)
522
{
523
	struct nameidata *now = current->nameidata, *old = now->saved;
524 525 526 527

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
528
	if (now->stack != now->internal)
529
		kfree(now->stack);
530 531 532 533
}

static int __nd_alloc_stack(struct nameidata *nd)
{
Al Viro's avatar
Al Viro committed
534 535 536 537 538 539 540 541 542
	struct saved *p;

	if (nd->flags & LOOKUP_RCU) {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
				  GFP_ATOMIC);
		if (unlikely(!p))
			return -ECHILD;
	} else {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
543
				  GFP_KERNEL);
Al Viro's avatar
Al Viro committed
544 545 546
		if (unlikely(!p))
			return -ENOMEM;
	}
547 548 549 550 551
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
	return 0;
}

552 553 554 555 556 557 558 559 560 561
/**
 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 * @path: nameidate to verify
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
static bool path_connected(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;
562
	struct super_block *sb = mnt->mnt_sb;
563

564 565
	/* Bind mounts and multi-root filesystems can have disconnected paths */
	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
566 567 568 569 570
		return true;

	return is_subdir(path->dentry, mnt->mnt_root);
}

571 572
static inline int nd_alloc_stack(struct nameidata *nd)
{
573
	if (likely(nd->depth != EMBEDDED_LEVELS))
574 575 576 577 578 579
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
	return __nd_alloc_stack(nd);
}

580 581 582 583 584
/*
 * Run and then clear the delayed call of every entry on the stack, from
 * the most recently pushed entry (depth - 1) down to entry 0.  nd->depth
 * itself is left unchanged.
 */
static void drop_links(struct nameidata *nd)
{
	int i = nd->depth;
	while (i--) {
		struct saved *last = nd->stack + i;
		do_delayed_call(&last->done);
		clear_delayed_call(&last->done);
	}
}

/*
 * Finish a walk and release what it holds.
 *
 * In ref-walk mode the references on nd->path, on every stacked link and
 * (unless LOOKUP_ROOT is set) on nd->root are dropped.  In rcu-walk mode
 * no references are dropped; LOOKUP_RCU is cleared, nd->root.mnt is
 * forgotten unless LOOKUP_ROOT is set, and the RCU read side is left.
 * Either way the link stack ends up empty.
 */
static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
			path_put(&nd->root);
			nd->root.mnt = NULL;
		}
	} else {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	}
	nd->depth = 0;
}

/*
 * Try to take references on path->mnt and path->dentry and check that the
 * dentry's d_seq still matches @seq.
 *
 * path_put is needed afterwards regardless of success or failure.  To keep
 * that safe, the members on which no reference was obtained are set to
 * NULL before returning false: path->dentry whenever __legitimize_mnt()
 * or the lockref grab fails, and additionally path->mnt when
 * __legitimize_mnt() returns a positive value.
 *
 * Returns true only if both references were taken and the sequence count
 * is unchanged.
 */
static bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	int res = __legitimize_mnt(path->mnt, nd->m_seq);
	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * Call legitimize_path() on every link saved on the stack, oldest first.
 *
 * On the first failure, drop_links() is run over the whole stack and
 * nd->depth is cut down to i + 1, i.e. to the entries legitimize_path()
 * has been called on (the failing one included), and false is returned.
 * Returns true if every entry was legitimized.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;
	for (i = 0; i < nd->depth; i++) {
		struct saved *last = nd->stack + i;
		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
			drop_links(nd);
			nd->depth = i + 1;
			return false;
		}
	}
	return true;
}

Al Viro's avatar
Al Viro committed
643
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
Al Viro's avatar
Al Viro committed
655 656
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
657
 * Returns: 0 on success, -ECHILD on failure
Nick Piggin's avatar
Nick Piggin committed
658
 *
Al Viro's avatar
Al Viro committed
659 660 661
 * unlazy_walk attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
662 663
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
Nick Piggin's avatar
Nick Piggin committed
664
 */
Al Viro's avatar
Al Viro committed
665
static int unlazy_walk(struct nameidata *nd)
Nick Piggin's avatar
Nick Piggin committed
666 667 668 669
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
670

Al Viro's avatar
Al Viro committed
671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711
	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out1;
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
			goto out;
	}
	rcu_read_unlock();
	BUG_ON(nd->inode != parent->d_inode);
	return 0;

out2:
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out1:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
out:
	rcu_read_unlock();
	return -ECHILD;
}

/**
 * unlazy_child - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry
 * @seq: seq number to check dentry against
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_child() failure and
 * terminate_walk().
 */
static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
	BUG_ON(!(nd->flags & LOOKUP_RCU));

712
	nd->flags &= ~LOOKUP_RCU;
713 714 715 716
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
Al Viro's avatar
Al Viro committed
717
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
718
		goto out1;
Al Viro's avatar
Al Viro committed
719

720
	/*
Al Viro's avatar
Al Viro committed
721 722 723 724 725
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
726
	 */
Al Viro's avatar
Al Viro committed
727 728 729 730 731 732
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
		rcu_read_unlock();
		dput(dentry);
		goto drop_root_mnt;
Al Viro's avatar
Al Viro committed
733
	}
734 735 736 737 738
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
739 740 741 742
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
			rcu_read_unlock();
			dput(dentry);
			return -ECHILD;
743
		}
Nick Piggin's avatar
Nick Piggin committed
744 745
	}

Al Viro's avatar
Al Viro committed
746
	rcu_read_unlock();
Nick Piggin's avatar
Nick Piggin committed
747
	return 0;
Al Viro's avatar
Al Viro committed
748

749 750 751 752
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
753
out:
Al Viro's avatar
Al Viro committed
754
	rcu_read_unlock();
755 756 757
drop_root_mnt:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
Nick Piggin's avatar
Nick Piggin committed
758 759 760
	return -ECHILD;
}

761
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
762
{
763 764 765 766
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dentry, flags);
	else
		return 1;
767 768
}

769 770 771
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
772
 *
773 774 775 776 777
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
778
 */
779
static int complete_walk(struct nameidata *nd)
780
{
Al Viro's avatar
Al Viro committed
781
	struct dentry *dentry = nd->path.dentry;
782 783
	int status;

784 785 786
	if (nd->flags & LOOKUP_RCU) {
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
Al Viro's avatar
Al Viro committed
787
		if (unlikely(unlazy_walk(nd)))
788 789 790
			return -ECHILD;
	}

Al Viro's avatar
Al Viro committed
791 792 793
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

794
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
795 796
		return 0;

797
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
798 799 800
	if (status > 0)
		return 0;

Al Viro's avatar
Al Viro committed
801
	if (!status)
802
		status = -ESTALE;
Al Viro's avatar
Al Viro committed
803

804 805 806
	return status;
}

Al Viro's avatar
Al Viro committed
807
static void set_root(struct nameidata *nd)
Nick Piggin's avatar
Nick Piggin committed
808
{
809
	struct fs_struct *fs = current->fs;
Nick Piggin's avatar
Nick Piggin committed
810

811 812 813 814 815 816 817 818 819 820 821
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
	}
Nick Piggin's avatar
Nick Piggin committed
822 823
}

Jan Blunck's avatar
Jan Blunck committed
824
static void path_put_conditional(struct path *path, struct nameidata *nd)
825 826
{
	dput(path->dentry);
827
	if (path->mnt != nd->path.mnt)
828 829 830
		mntput(path->mnt);
}

831 832
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
833
{
Nick Piggin's avatar
Nick Piggin committed
834 835 836 837
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
838
	}
Nick Piggin's avatar
Nick Piggin committed
839
	nd->path.mnt = path->mnt;
840
	nd->path.dentry = path->dentry;
841 842
}

843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862
/*
 * Move the walk to nd->root and mark it LOOKUP_JUMPED.
 *
 * In ref-walk mode the reference on the old position is dropped and one
 * is taken on the root.  In rcu-walk mode nothing is refcounted; nd->seq
 * is taken from nd->root_seq and -ECHILD is returned if the root dentry's
 * d_seq no longer matches it.  Returns 0 otherwise.
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	} else {
		struct dentry *root;

		nd->path = nd->root;
		root = nd->path.dentry;
		nd->inode = root->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&root->d_seq, nd->seq)))
			return -ECHILD;
	}

	nd->flags |= LOOKUP_JUMPED;
	return 0;
}

Christoph Hellwig's avatar
Christoph Hellwig committed
863
/*
864
 * Helper to directly jump to a known parsed path from ->get_link,
Christoph Hellwig's avatar
Christoph Hellwig committed
865 866
 * caller must have taken a reference to path beforehand.
 */
867
void nd_jump_link(struct path *path)
Christoph Hellwig's avatar
Christoph Hellwig committed
868
{
869
	struct nameidata *nd = current->nameidata;
Christoph Hellwig's avatar
Christoph Hellwig committed
870 871 872 873 874 875 876
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
}

877
static inline void put_link(struct nameidata *nd)
878
{
Al Viro's avatar
Al Viro committed
879
	struct saved *last = nd->stack + --nd->depth;
880
	do_delayed_call(&last->done);
Al Viro's avatar
Al Viro committed
881 882
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
883 884
}

885 886
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
Kees Cook's avatar
Kees Cook committed
887 888 889

/**
 * may_follow_link - Check symlink following for unsafe situations
890
 * @nd: nameidata pathwalk data
Kees Cook's avatar
Kees Cook committed
891 892 893 894 895 896 897 898 899 900 901 902
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
Al Viro's avatar
Al Viro committed
903
static inline int may_follow_link(struct nameidata *nd)
Kees Cook's avatar
Kees Cook committed
904 905 906
{
	const struct inode *inode;
	const struct inode *parent;
907
	kuid_t puid;
Kees Cook's avatar
Kees Cook committed
908 909 910 911 912

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
913
	inode = nd->link_inode;
914
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
915 916 917
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
918
	parent = nd->inode;
Kees Cook's avatar
Kees Cook committed
919 920 921 922
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
923 924
	puid = parent->i_uid;
	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
925 926
		return 0;

927 928 929
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

Al Viro's avatar
Al Viro committed
930
	audit_log_link_denied("follow_link", &nd->stack[0].link);
Kees Cook's avatar
Kees Cook committed
931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * A source is considered safe only if every one of these holds:
 *    - inode is a regular file
 *    - inode is not setuid
 *    - inode is not both setgid and group-executable
 *    - inode_permission() grants both read and write access
 *
 * Returns true for a safe source, false otherwise.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t sgid_exec = S_ISGID | S_IXGRP;
	umode_t mode = inode->i_mode;

	/*
	 * Special files, setuid files and executable setgid files should
	 * not get pinned to the filesystem.
	 */
	if (!S_ISREG(mode) || (mode & S_ISUID) ||
	    (mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return !inode_permission(inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
977
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
Kees Cook's avatar
Kees Cook committed
978 979 980 981 982 983 984 985 986 987 988 989 990 991 992
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
	struct inode *inode;

	if (!sysctl_protected_hardlinks)
		return 0;

	inode = link->dentry->d_inode;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
993
	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
Kees Cook's avatar
Kees Cook committed
994 995
		return 0;

996
	audit_log_link_denied("linkat", link);
Kees Cook's avatar
Kees Cook committed
997 998 999
	return -EPERM;
}

1000 1001
static __always_inline
const char *get_link(struct nameidata *nd)
Linus Torvalds's avatar
Linus Torvalds committed
1002
{
1003
	struct saved *last = nd->stack + nd->depth - 1;
Al Viro's avatar
Al Viro committed
1004
	struct dentry *dentry = last->link.dentry;
1005
	struct inode *inode = nd->link_inode;
1006
	int error;
1007
	const char *res;
Linus Torvalds's avatar
Linus Torvalds committed
1008

1009 1010 1011
	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
1012
	} else if (atime_needs_update_rcu(&last->link, inode)) {
Al Viro's avatar
Al Viro committed
1013
		if (unlikely(unlazy_walk(nd)))
Al Viro's avatar
Al Viro committed
1014
			return ERR_PTR(-ECHILD);
1015
		touch_atime(&last->link);
Al Viro's avatar
Al Viro committed
1016
	}
1017

1018 1019 1020
	error = security_inode_follow_link(dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
1021
		return ERR_PTR(error);
1022

1023
	nd->last_type = LAST_BIND;
1024 1025
	res = inode->i_link;
	if (!res) {
1026 1027 1028
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
1029
		if (nd->flags & LOOKUP_RCU) {
1030
			res = get(NULL, inode, &last->done);
1031
			if (res == ERR_PTR(-ECHILD)) {
Al Viro's avatar
Al Viro committed
1032
				if (unlikely(unlazy_walk(nd)))
1033
					return ERR_PTR(-ECHILD);
1034
				res = get(dentry, inode, &last->done);
1035 1036
			}
		} else {
1037
			res = get(dentry, inode, &last->done);
1038
		}
1039
		if (IS_ERR_OR_NULL(res))
1040 1041 1042
			return res;
	}
	if (*res == '/') {
1043 1044
		if (!nd->root.mnt)
			set_root(nd);
1045 1046
		if (unlikely(nd_jump_root(nd)))
			return ERR_PTR(-ECHILD);
1047 1048
		while (unlikely(*++res == '/'))
			;
Linus Torvalds's avatar
Linus Torvalds committed
1049
	}
1050 1051
	if (!*res)
		res = NULL;
1052 1053
	return res;
}
1054

1055 1056 1057 1058 1059 1060 1061 1062 1063 1064
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
Al Viro's avatar
Al Viro committed
1065
int follow_up(struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
1066
{
1067 1068
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
Linus Torvalds's avatar
Linus Torvalds committed
1069
	struct dentry *mountpoint;
Nick Piggin's avatar
Nick Piggin committed
1070

Al Viro's avatar
Al Viro committed
1071
	read_seqlock_excl(&mount_lock);
1072
	parent = mnt->mnt_parent;
Al Viro's avatar
Al Viro committed
1073
	if (parent == mnt) {
Al Viro's avatar
Al Viro committed
1074
		read_sequnlock_excl(&mount_lock);
Linus Torvalds's avatar
Linus Torvalds committed
1075 1076
		return 0;
	}
1077
	mntget(&parent->mnt);
1078
	mountpoint = dget(mnt->mnt_mountpoint);
Al Viro's avatar
Al Viro committed
1079
	read_sequnlock_excl(&mount_lock);
Al Viro's avatar
Al Viro committed
1080 1081 1082
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1083
	path->mnt = &parent->mnt;
Linus Torvalds's avatar
Linus Torvalds committed
1084 1085
	return 1;
}
1086
EXPORT_SYMBOL(follow_up);
Linus Torvalds's avatar
Linus Torvalds committed
1087

Nick Piggin's avatar
Nick Piggin committed
1088
/*
1089 1090 1091
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
Linus Torvalds's avatar
Linus Torvalds committed
1092
 */
1093
static int follow_automount(struct path *path, struct nameidata *nd,
1094
			    bool *need_mntput)
Nick Piggin's avatar
Nick Piggin committed
1095
{
1096
	struct vfsmount *mnt;
1097
	int err;
1098 1099 1100 1101

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;