// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>

DEFINE_PER_CPU(int, eventfd_wake_count);

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
	struct kref kref;
	wait_queue_head_t wqh;
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". A read(2) will return the "count"
	 * value to userspace, and will reset "count" to zero. The kernel
	 * side eventfd_signal() also adds to the "count" counter and
	 * issues a wakeup.
	 */
	__u64 count;
	unsigned int flags;
	int id;
};

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value of the counter to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning an EPOLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented.  This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
	unsigned long flags;

	/*
	 * Deadlock or stack overflow issues can happen if we recurse here
	 * through waitqueue wakeup handlers. If the caller uses potentially
	 * nested waitqueues with custom wakeup handlers, then it should
	 * check eventfd_signal_count() before calling this function. If
	 * it returns true, the eventfd_signal() call should be deferred to a
	 * safe context.
	 */
	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
		return 0;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	this_cpu_inc(eventfd_wake_count);
	if (ULLONG_MAX - ctx->count < n)
		n = ULLONG_MAX - ctx->count;
	ctx->count += n;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
	this_cpu_dec(eventfd_wake_count);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);
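
/*
 * Illustrative sketch, not part of this file: a driver that has obtained an
 * eventfd context (for instance via eventfd_ctx_fdget(), see below) could
 * notify userspace from a non-sleeping completion path roughly like this.
 * The "my_dev" structure and its "trigger" field are hypothetical.
 *
 *	static void my_dev_complete(struct my_dev *dev)
 *	{
 *		if (dev->trigger)
 *			eventfd_signal(dev->trigger, 1);
 *	}
 */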

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
	if (ctx->id >= 0)
		ida_simple_remove(&eventfd_ida, ctx->id);
	kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

	eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
	kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, EPOLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
	struct eventfd_ctx *ctx = file->private_data;
	__poll_t events = 0;
	u64 count;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
	 * can be done outside ctx->wqh.lock because we know that poll_wait
	 * takes that lock (through add_wait_queue) if our caller will sleep.
	 *
	 * The read _can_ therefore seep into add_wait_queue's critical
	 * section, but cannot move above it!  add_wait_queue's spin_lock acts
	 * as an acquire barrier and ensures that the read be ordered properly
	 * against the writes.  The following CAN happen and is safe:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     count = ctx->count
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        if (waitqueue_active)
	 *                                          wake_up_locked_poll
	 *                                        unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 *
	 * but the following, which would miss a wakeup, cannot happen:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     count = ctx->count (INVALID!)
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        **waitqueue_active is false**
	 *                                        **no wake_up_locked_poll!**
	 *                                        unlock ctx->wqh.lock
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 */
	count = READ_ONCE(ctx->count);
	if (count > 0)
		events |= EPOLLIN;
	if (count == ULLONG_MAX)
		events |= EPOLLERR;
	if (ULLONG_MAX - 1 > count)
		events |= EPOLLOUT;

	return events;
}

static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
	ctx->count -= *cnt;
}

/**
 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	eventfd_ctx_do_read(ctx, cnt);
	__remove_wait_queue(&ctx->wqh, wait);
	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
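
/*
 * Illustrative sketch, not part of this file: an irqfd-style consumer that
 * registered its own wait queue entry on the eventfd can detach it and drain
 * the pending count during teardown roughly like this. The "consumer"
 * structure and its fields are hypothetical.
 *
 *	static void consumer_shutdown(struct consumer *c)
 *	{
 *		__u64 cnt;
 *
 *		if (!eventfd_ctx_remove_wait_queue(c->eventfd, &c->wait, &cnt))
 *			pr_debug("drained %llu pending events\n",
 *				 (unsigned long long)cnt);
 *	}
 */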

static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt = 0;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;

	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ctx->count > 0)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		__add_wait_queue(&ctx->wqh, &wait);
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ctx->count > 0) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		eventfd_ctx_do_read(ctx, &ucnt);
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
		return -EFAULT;

	return res;
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		__add_wait_queue(&ctx->wqh, &wait);
		for (res = 0;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ULLONG_MAX - ctx->count > ucnt) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		ctx->count += ucnt;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}
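
/*
 * Illustrative sketch, not part of this file: seen from userspace, the
 * read/write semantics implemented above behave roughly like this (without
 * EFD_SEMAPHORE, and assuming a blocking descriptor).
 *
 *	int efd = eventfd(0, 0);
 *	uint64_t val = 3;
 *
 *	write(efd, &val, sizeof(val));	// counter becomes 3, readers are woken
 *	read(efd, &val, sizeof(val));	// returns 8 bytes, val == 3, counter resets to 0
 *	read(efd, &val, sizeof(val));	// counter is 0, so this blocks until a write
 */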

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;

	spin_lock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)ctx->count);
	spin_unlock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= eventfd_show_fdinfo,
#endif
	.release	= eventfd_release,
	.poll		= eventfd_poll,
	.read		= eventfd_read,
	.write		= eventfd_write,
	.llseek		= noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointer:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);
	if (file->f_op != &eventfd_fops) {
		fput(file);
		return ERR_PTR(-EINVAL);
	}

	return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointers returned by the following functions:
 *
 * eventfd_fget
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	struct eventfd_ctx *ctx;
	struct fd f = fdget(fd);
	if (!f.file)
		return ERR_PTR(-EBADF);
	ctx = eventfd_ctx_fileget(f.file);
	fdput(f);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
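
/*
 * Illustrative sketch, not part of this file: a typical in-kernel user takes
 * a reference when userspace hands it an eventfd file descriptor (here via a
 * hypothetical ioctl handler), keeps the context around for signalling, and
 * drops it later with eventfd_ctx_put(). The "my_dev" structure is made up.
 *
 *	static int my_dev_set_trigger(struct my_dev *dev, int fd)
 *	{
 *		struct eventfd_ctx *ctx = eventfd_ctx_fdget(fd);
 *
 *		if (IS_ERR(ctx))
 *			return PTR_ERR(ctx);
 *		dev->trigger = ctx;	// released via eventfd_ctx_put() on teardown
 *		return 0;
 *	}
 */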

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : The @file is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
	struct eventfd_ctx *ctx;

	if (file->f_op != &eventfd_fops)
		return ERR_PTR(-EINVAL);

	ctx = file->private_data;
	kref_get(&ctx->kref);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
	struct eventfd_ctx *ctx;
	int fd;

	/* Check the EFD_* constants for consistency.  */
	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~EFD_FLAGS_SET)
		return -EINVAL;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	kref_init(&ctx->kref);
	init_waitqueue_head(&ctx->wqh);
	ctx->count = count;
	ctx->flags = flags;
	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);

	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
			      O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
	if (fd < 0)
		eventfd_free_ctx(ctx);

	return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
	return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
	return do_eventfd(count, 0);
}
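
/*
 * Illustrative sketch, not part of this file: the EFD_SEMAPHORE flag, honoured
 * by eventfd_ctx_do_read() above, switches read(2) from "return and reset the
 * whole counter" to "report 1 and decrement by one".
 *
 *	int efd = eventfd(3, EFD_SEMAPHORE | EFD_NONBLOCK);
 *	uint64_t val;
 *
 *	read(efd, &val, sizeof(val));	// val == 1, counter is now 2
 *	read(efd, &val, sizeof(val));	// val == 1, counter is now 1
 */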