// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this.  We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory.  We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
#include <linux/mdev.h>
#include <linux/notifier.h>
#include <linux/dma-iommu.h>
#include <linux/irqdomain.h>

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

static unsigned int dma_entry_limit __read_mostly = U16_MAX;
module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
MODULE_PARM_DESC(dma_entry_limit,
		 "Maximum number of user DMA mappings per container (65535).");

struct vfio_iommu {
	struct list_head	domain_list;
	struct list_head	iova_list;
	struct vfio_domain	*external_domain; /* domain for external user */
	struct mutex		lock;
	struct rb_root		dma_list;
	struct blocking_notifier_head notifier;
	unsigned int		dma_avail;
	bool			v2;
	bool			nesting;
};

struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	int			prot;		/* IOMMU_CACHE */
	bool			fgsp;		/* Fine-grained super pages */
};

struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		vaddr;		/* Process virtual addr */
	size_t			size;		/* Map size (bytes) */
	int			prot;		/* IOMMU_READ/WRITE */
	bool			iommu_mapped;
	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
	struct task_struct	*task;
	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
};

struct vfio_group {
	struct iommu_group	*iommu_group;
	struct list_head	next;
	bool			mdev_group;	/* An mdev group */
};

struct vfio_iova {
	struct list_head	list;
	dma_addr_t		start;
	dma_addr_t		end;
};

/*
 * Guest RAM pinning working set or DMA target
 */
struct vfio_pfn {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		pfn;		/* Host pfn */
	atomic_t		ref_count;
};

struct vfio_regions {
	struct list_head list;
	dma_addr_t iova;
	phys_addr_t phys;
	size_t len;
};

#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
					(!list_empty(&iommu->domain_list))

static int put_pfn(unsigned long pfn, int prot);

/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

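/*
 * Look up a vfio_dma whose [iova, iova + size) range overlaps the requested
 * [start, start + size) window.  The dma_list rb-tree is ordered by iova and
 * holds non-overlapping ranges, so this is a simple binary search.
 */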
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + dma->size)
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}

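/*
 * Insert a new vfio_dma into the iova-ordered rb-tree.  Callers are expected
 * to have checked with vfio_find_dma() that the new range does not overlap
 * an existing mapping.
 */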
static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + new->size <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}

/*
 * Helper Functions for host iova-pfn list
 */
static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
{
	struct vfio_pfn *vpfn;
	struct rb_node *node = dma->pfn_list.rb_node;

	while (node) {
		vpfn = rb_entry(node, struct vfio_pfn, node);

		if (iova < vpfn->iova)
			node = node->rb_left;
		else if (iova > vpfn->iova)
			node = node->rb_right;
		else
			return vpfn;
	}
	return NULL;
}

static void vfio_link_pfn(struct vfio_dma *dma,
			  struct vfio_pfn *new)
{
	struct rb_node **link, *parent = NULL;
	struct vfio_pfn *vpfn;

	link = &dma->pfn_list.rb_node;
	while (*link) {
		parent = *link;
		vpfn = rb_entry(parent, struct vfio_pfn, node);

		if (new->iova < vpfn->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &dma->pfn_list);
}

static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
{
	rb_erase(&old->node, &dma->pfn_list);
}

static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
				unsigned long pfn)
{
	struct vfio_pfn *vpfn;

	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
	if (!vpfn)
		return -ENOMEM;

	vpfn->iova = iova;
	vpfn->pfn = pfn;
	atomic_set(&vpfn->ref_count, 1);
	vfio_link_pfn(dma, vpfn);
	return 0;
}

static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
				      struct vfio_pfn *vpfn)
{
	vfio_unlink_pfn(dma, vpfn);
	kfree(vpfn);
}

static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
					       unsigned long iova)
{
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);

	if (vpfn)
		atomic_inc(&vpfn->ref_count);
	return vpfn;
}

static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
{
	int ret = 0;

	if (atomic_dec_and_test(&vpfn->ref_count)) {
		ret = put_pfn(vpfn->pfn, dma->prot);
		vfio_remove_from_pfn_list(dma, vpfn);
	}
	return ret;
}

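/*
 * Adjust the locked page accounting of the task that created the mapping.
 * npage may be negative to undo a previous charge.  When async is set we may
 * not be running in that task's context (e.g. an external unpin path), so we
 * take a reference on its mm via get_task_mm() instead of using task->mm
 * directly.
 */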
static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
	struct mm_struct *mm;
	int ret;

	if (!npage)
		return 0;

	mm = async ? get_task_mm(dma->task) : dma->task->mm;
	if (!mm)
		return -ESRCH; /* process exited */

	ret = down_write_killable(&mm->mmap_sem);
	if (!ret) {
		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
					  dma->lock_cap);
		up_write(&mm->mmap_sem);
	}

	if (async)
		mmput(mm);

	return ret;
}

/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 * For compound pages, any driver that sets the reserved bit in head
 * page needs to set the reserved bit in all subpages to be safe.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn));

	return true;
}

static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);

		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
		return 1;
	}
	return 0;
}

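/*
 * Resolve a single user virtual address to a host pfn.  Normal memory is
 * pinned via pin_user_pages_remote() with FOLL_LONGTERM; if that fails we
 * fall back to walking the vma so that VM_PFNMAP ranges (e.g. mmap'd MMIO)
 * can still be translated, though such pfns are never pinned.
 */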
static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
			 int prot, unsigned long *pfn)
{
	struct page *page[1];
	struct vm_area_struct *vma;
	unsigned int flags = 0;
	int ret;

	if (prot & IOMMU_WRITE)
		flags |= FOLL_WRITE;

	down_read(&mm->mmap_sem);
	ret = pin_user_pages_remote(NULL, mm, vaddr, 1, flags | FOLL_LONGTERM,
				    page, NULL, NULL);
	if (ret == 1) {
		*pfn = page_to_pfn(page[0]);
		ret = 0;
		goto done;
	}

	vaddr = untagged_addr(vaddr);

	vma = find_vma_intersection(mm, vaddr, vaddr + 1);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		if (is_invalid_reserved_pfn(*pfn))
			ret = 0;
	}
done:
	up_read(&mm->mmap_sem);
	return ret;
}

/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
				  long npage, unsigned long *pfn_base,
				  unsigned long limit)
{
	unsigned long pfn = 0;
	long ret, pinned = 0, lock_acct = 0;
	bool rsvd;
	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

	/* This code path is only user initiated */
	if (!current->mm)
		return -ENODEV;

	ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
	if (ret)
		return ret;

	pinned++;
	rsvd = is_invalid_reserved_pfn(*pfn_base);

	/*
	 * Reserved pages aren't counted against the user, externally pinned
	 * pages are already counted against the user.
	 */
	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
			put_pfn(*pfn_base, dma->prot);
			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
					limit << PAGE_SHIFT);
			return -ENOMEM;
		}
		lock_acct++;
	}

	if (unlikely(disable_hugepages))
		goto out;

	/* Lock all the consecutive pages from pfn_base */
	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
		if (ret)
			break;

		if (pfn != *pfn_base + pinned ||
		    rsvd != is_invalid_reserved_pfn(pfn)) {
			put_pfn(pfn, dma->prot);
			break;
		}

		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
			if (!dma->lock_cap &&
			    current->mm->locked_vm + lock_acct + 1 > limit) {
				put_pfn(pfn, dma->prot);
				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
					__func__, limit << PAGE_SHIFT);
				ret = -ENOMEM;
				goto unpin_out;
			}
			lock_acct++;
		}
	}

out:
	ret = vfio_lock_acct(dma, lock_acct, false);

unpin_out:
	if (ret) {
		if (!rsvd) {
			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
				put_pfn(pfn, dma->prot);
		}

		return ret;
	}

	return pinned;
}

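/*
 * Unpin a run of npage contiguous host pfns previously pinned by
 * vfio_pin_pages_remote().  Pages still held in the external pfn_list stay
 * accounted, so only the difference is returned to the locked_vm accounting
 * when do_accounting is set.
 */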
static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
				    unsigned long pfn, long npage,
				    bool do_accounting)
{
	long unlocked = 0, locked = 0;
	long i;

	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
		if (put_pfn(pfn++, dma->prot)) {
			unlocked++;
			if (vfio_find_vpfn(dma, iova))
				locked++;
		}
	}

	if (do_accounting)
		vfio_lock_acct(dma, locked - unlocked, true);

	return unlocked;
}
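
/*
 * Pin a single page on behalf of an external caller (e.g. an mdev vendor
 * driver).  Unlike the _remote variants this may run outside the mapping
 * task's context, so the mm comes from dma->task and any accounting is
 * charged to that task.
 */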

static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
				  unsigned long *pfn_base, bool do_accounting)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(dma->task);
	if (!mm)
		return -ENODEV;

	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
	if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
		ret = vfio_lock_acct(dma, 1, true);
		if (ret) {
			put_pfn(*pfn_base, dma->prot);
			if (ret == -ENOMEM)
				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
					"(%ld) exceeded\n", __func__,
					dma->task->comm, task_pid_nr(dma->task),
					task_rlimit(dma->task, RLIMIT_MEMLOCK));
		}
	}

	mmput(mm);
	return ret;
}

static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
				    bool do_accounting)
{
	int unlocked;
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);

	if (!vpfn)
		return 0;

	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);

	if (do_accounting)
		vfio_lock_acct(dma, -unlocked, true);

	return unlocked;
}

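/*
 * Pin and translate an array of user pfns (iovas expressed in pages) on
 * behalf of a vfio bus driver, e.g. the mdev framework's vfio_pin_pages()
 * path.  Each successfully pinned page is tracked in the per-vfio_dma
 * pfn_list so that a later unpin or DMA unmap can find it.
 */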
static int vfio_iommu_type1_pin_pages(void *iommu_data,
				      unsigned long *user_pfn,
				      int npage, int prot,
				      unsigned long *phys_pfn)
{
	struct vfio_iommu *iommu = iommu_data;
	int i, j, ret;
	unsigned long remote_vaddr;
	struct vfio_dma *dma;
	bool do_accounting;

	if (!iommu || !user_pfn || !phys_pfn)
		return -EINVAL;

	/* Supported for v2 version only */
	if (!iommu->v2)
		return -EACCES;

	mutex_lock(&iommu->lock);

	/* Fail if notifier list is empty */
	if (!iommu->notifier.head) {
		ret = -EINVAL;
		goto pin_done;
	}

	/*
	 * If an iommu capable domain exists in the container then all pages are
	 * already pinned and accounted. Accounting should be done if there is no
	 * iommu capable domain in the container.
	 */
	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);

	for (i = 0; i < npage; i++) {
		dma_addr_t iova;
		struct vfio_pfn *vpfn;

		iova = user_pfn[i] << PAGE_SHIFT;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		if (!dma) {
			ret = -EINVAL;
			goto pin_unwind;
		}

		if ((dma->prot & prot) != prot) {
			ret = -EPERM;
			goto pin_unwind;
		}

		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
		if (vpfn) {
			phys_pfn[i] = vpfn->pfn;
			continue;
		}

		remote_vaddr = dma->vaddr + iova - dma->iova;
		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
					     do_accounting);
		if (ret)
			goto pin_unwind;

		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
		if (ret) {
			vfio_unpin_page_external(dma, iova, do_accounting);
			goto pin_unwind;
		}
	}

	ret = i;
	goto pin_done;

pin_unwind:
	phys_pfn[i] = 0;
	for (j = 0; j < i; j++) {
		dma_addr_t iova;

		iova = user_pfn[j] << PAGE_SHIFT;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		vfio_unpin_page_external(dma, iova, do_accounting);
		phys_pfn[j] = 0;
	}
pin_done:
	mutex_unlock(&iommu->lock);
	return ret;
}
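
/*
 * Reverse of vfio_iommu_type1_pin_pages(): drop the pfn_list reference for
 * each user pfn and return how many entries were processed.
 */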

static int vfio_iommu_type1_unpin_pages(void *iommu_data,
					unsigned long *user_pfn,
					int npage)
{
	struct vfio_iommu *iommu = iommu_data;
	bool do_accounting;
	int i;

	if (!iommu || !user_pfn)
		return -EINVAL;

	/* Supported for v2 version only */
	if (!iommu->v2)
		return -EACCES;

	mutex_lock(&iommu->lock);

	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
	for (i = 0; i < npage; i++) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		iova = user_pfn[i] << PAGE_SHIFT;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		if (!dma)
			goto unpin_exit;
		vfio_unpin_page_external(dma, iova, do_accounting);
	}

unpin_exit:
	mutex_unlock(&iommu->lock);
	return i > npage ? npage : (i > 0 ? i : -EINVAL);
}

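/*
 * Flush the IOTLB for a batch of deferred unmaps and only then unpin and
 * unaccount the pages backing each region; the pages must not be released
 * while the IOMMU may still hold stale translations to them.
 */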
static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
			    struct list_head *regions,
			    struct iommu_iotlb_gather *iotlb_gather)
{
	long unlocked = 0;
	struct vfio_regions *entry, *next;

	iommu_tlb_sync(domain->domain, iotlb_gather);

	list_for_each_entry_safe(entry, next, regions, list) {
		unlocked += vfio_unpin_pages_remote(dma,
						    entry->iova,
						    entry->phys >> PAGE_SHIFT,
						    entry->len >> PAGE_SHIFT,
						    false);
		list_del(&entry->list);
		kfree(entry);
	}

	cond_resched();

	return unlocked;
}

/*
 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 * Therefore, when using IOTLB flush sync interface, VFIO needs to keep track
 * of these regions (currently using a list).
 *
 * This value specifies maximum number of regions for each IOTLB flush sync.
 */
#define VFIO_IOMMU_TLB_SYNC_MAX		512

static size_t unmap_unpin_fast(struct vfio_domain *domain,
			       struct vfio_dma *dma, dma_addr_t *iova,
			       size_t len, phys_addr_t phys, long *unlocked,
			       struct list_head *unmapped_list,
			       int *unmapped_cnt,
			       struct iommu_iotlb_gather *iotlb_gather)
{
	size_t unmapped = 0;
	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

	if (entry) {
		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
					    iotlb_gather);

		if (!unmapped) {
			kfree(entry);
		} else {
			entry->iova = *iova;
			entry->phys = phys;
			entry->len  = unmapped;
			list_add_tail(&entry->list, unmapped_list);

			*iova += unmapped;
			(*unmapped_cnt)++;
		}
	}

	/*
	 * Sync if the number of fast-unmap regions hits the limit
	 * or in case of errors.
	 */
	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
					     iotlb_gather);
		*unmapped_cnt = 0;
	}

	return unmapped;
}

static size_t unmap_unpin_slow(struct vfio_domain *domain,
			       struct vfio_dma *dma, dma_addr_t *iova,
			       size_t len, phys_addr_t phys,
			       long *unlocked)
{
	size_t unmapped = iommu_unmap(domain->domain, *iova, len);

	if (unmapped) {
		*unlocked += vfio_unpin_pages_remote(dma, *iova,
						     phys >> PAGE_SHIFT,
						     unmapped >> PAGE_SHIFT,
						     false);
		*iova += unmapped;
		cond_resched();
	}
	return unmapped;
}

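/*
 * Tear down the IOMMU mappings for a vfio_dma and unpin its backing pages.
 * The first domain in the list is used to look up physical addresses, so it
 * is unmapped last; all other domains are unmapped up front.  Returns the
 * number of pages unpinned, or 0 when accounting has been applied here.
 */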
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
			     bool do_accounting)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	LIST_HEAD(unmapped_region_list);
	struct iommu_iotlb_gather iotlb_gather;
	int unmapped_region_cnt = 0;
	long unlocked = 0;

	if (!dma->size)
		return 0;

	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
		return 0;

	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system.  Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin.  The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, dma->iova, dma->size);
		cond_resched();
	}

	iommu_iotlb_gather_init(&iotlb_gather);
	while (iova < end) {
		size_t unmapped, len;
		phys_addr_t phys, next;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		/*
		 * To optimize for fewer iommu_unmap() calls, each of which
		 * may require hardware cache flushing, try to find the
		 * largest contiguous physical memory chunk to unmap.
		 */
		for (len = PAGE_SIZE;
		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
			next = iommu_iova_to_phys(domain->domain, iova + len);
			if (next != phys + len)
				break;
		}

		/*
		 * First, try to use fast unmap/unpin. In case of failure,
		 * switch to slow unmap/unpin path.
		 */
		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
					    &unlocked, &unmapped_region_list,
					    &unmapped_region_cnt,
					    &iotlb_gather);
		if (!unmapped) {
			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
						    phys, &unlocked);
			if (WARN_ON(!unmapped))
				break;
		}
	}

	dma->iommu_mapped = false;

	if (unmapped_region_cnt) {
		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
					    &iotlb_gather);
	}

	if (do_accounting) {
		vfio_lock_acct(dma, -unlocked, true);
		return 0;
	}
	return unlocked;
}

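/*
 * Fully remove a mapping: unmap and unpin it, drop it from the rb-tree,
 * release the task reference and return the entry to the dma_avail budget.
 */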
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	vfio_unmap_unpin(iommu, dma, true);
	vfio_unlink_dma(iommu, dma);
	put_task_struct(dma->task);
	kfree(dma);
	iommu->dma_avail++;
}

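/*
 * Compute the page sizes supported by every domain in the container: the
 * intersection of the per-domain pgsize bitmaps, with any sub-PAGE_SIZE
 * support rounded up to PAGE_SIZE (see below).
 */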
static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	unsigned long bitmap = ULONG_MAX;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next)
		bitmap &= domain->domain->pgsize_bitmap;
	mutex_unlock(&iommu->lock);

	/*
	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
	 * That way the user will be able to map/unmap buffers whose size/
	 * start address is aligned with PAGE_SIZE. Pinning code uses that
	 * granularity while iommu driver can use the sub-PAGE_SIZE size
	 * to map the buffer.
	 */
	if (bitmap & ~PAGE_MASK) {
		bitmap &= PAGE_MASK;
		bitmap |= PAGE_SIZE;
	}

	return bitmap;
}
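
/*
 * Handle the VFIO_IOMMU_UNMAP_DMA path.  The requested iova and size must be
 * aligned to the smallest IOMMU page size supported by the container; for
 * example, with 4K pages the mask computed below is 0xfff, so a 4K-aligned
 * iova/size pair is required.  See the long comment below for the v1 vs v2
 * unmap semantics.
 */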

static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap)
{
	uint64_t mask;
	struct vfio_dma *dma, *dma_last = NULL;
	size_t unmapped = 0;
	int ret = 0, retries = 0;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	if (unmap->iova & mask)
		return -EINVAL;
	if (!unmap->size || unmap->size & mask)
		return -EINVAL;
	if (unmap->iova + unmap->size - 1 < unmap->iova ||
	    unmap->size > SIZE_MAX)
		return -EINVAL;

	WARN_ON(mask & PAGE_MASK);
again:
	mutex_lock(&iommu->lock);

	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings.  This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range.  Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked.  We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings.  This
	 * resulted in a couple unusual behaviors.  First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap.  Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with zero sized unmap.  And an unmap request covering
	 * the first iova of mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings.  Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range.  An error
	 * will be returned if these conditions are not met.  The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
	if (iommu->v2) {
		dma = vfio_find_dma(iommu, unmap->iova, 1);
		if (dma && dma->iova != unmap->iova) {
			ret = -EINVAL;
			goto unlock;
		}
		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
			ret = -EINVAL;
			goto unlock;
		}
	}

	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
		if (!iommu->v2 && unmap->iova > dma->iova)
			break;
		/*
		 * Only a task sharing the address space of the one that mapped
		 * this iova range is allowed to unmap it.
		 */
		if (dma->task->mm != current->mm)
			break;

		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
			struct vfio_iommu_type1_dma_unmap nb_unmap;

			if (dma_last == dma) {
				BUG_ON(++retries > 10);
			} else {
				dma_last = dma;
				retries = 0;
			}

			nb_unmap.iova = dma->iova;
			nb_unmap.size = dma->size;

			/*
			 * Notify anyone (mdev vendor drivers) to invalidate and
			 * unmap iovas within the range we're about to unmap.
			 * Vendor drivers MUST unpin pages in response to an
			 * invalidation.
			 */
			mutex_unlock(&iommu->lock);
			blocking_notifier_call_chain(&iommu->notifier,
						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
						    &nb_unmap);
			goto again;
		}
		unmapped += dma->size;
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}

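/*
 * Mirror a pinned pfn range into every IOMMU domain in the container.  If
 * any domain fails, mappings already established in the preceding domains
 * are unwound so the container stays consistent.
 */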
static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | d->prot);
		if (ret)
			goto unwind;

		cond_resched();
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);

	return ret;
}

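/*
 * Pin and map an entire vfio_dma in chunks: vfio_pin_pages_remote() returns
 * the largest run of physically contiguous pages it could pin, which is then
 * mapped into all domains before moving on.  dma->size grows as each chunk
 * succeeds, so a failure can unwind exactly what was mapped.
 */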
static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
			    size_t map_size)
{
	dma_addr_t iova = dma->iova;
	unsigned long vaddr = dma->vaddr;
	size_t size = map_size;
	long npage;
	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	int ret = 0;

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
					      size >> PAGE_SHIFT, &pfn, limit);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
				     dma->prot);
		if (ret) {
			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
						npage, true);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	dma->iommu_mapped = true;

	if (ret)
		vfio_remove_dma(iommu, dma);

	return ret;
}

/*
 * Check dma map request is within a valid iova range
 */
static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
				      dma_addr_t start, dma_addr_t end)
{
	struct list_head *iova = &iommu->iova_list;
	struct vfio_iova *node;

	list_for_each_entry(node, iova, list) {
		if (start >= node->start && end <= node->end)
			return true;
	}

	/*
	 * Check for list_empty() as well since a container with
	 * a single mdev device will have an empty list.
	 */
	return list_empty(iova);
}

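/*
 * Handle the VFIO_IOMMU_MAP_DMA path.  Roughly, userspace drives this with a
 * sketch like the following (error handling omitted):
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000,
 *		.size  = buf_size,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 *
 * vaddr, iova and size must be aligned to the minimum IOMMU page size, the
 * range must not overlap an existing mapping, and the iova must fall within
 * the container's valid iova ranges.
 */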
static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
	int ret = 0, prot = 0;
	uint64_t mask;
	struct vfio_dma *dma;

	/* Verify that none of our __u64 fields overflow */
	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
		return -EINVAL;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	WARN_ON(mask & PAGE_MASK);

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if (!prot || !size || (size | iova | vaddr) & mask)
		return -EINVAL;

	/* Don't allow IOVA or virtual address wrap */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
		return -EINVAL;

	mutex_lock(&iommu->lock);

	if (vfio_find_dma(iommu, iova, size)) {
		ret = -EEXIST;
		goto out_unlock;
	}

	if (!iommu->dma_avail) {
		ret = -ENOSPC;
		goto out_unlock;
	}

	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	iommu->dma_avail--;
	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/*
	 * We need to be able to both add to a task's locked memory and test
	 * against the locked memory limit and we need to be able to do both
	 * outside of this call path as pinning can be asynchronous via the
	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
	 * task_struct and VM locked pages requires an mm_struct, however
	 * holding an indefinite mm reference is not recommended, therefore we
	 * only hold a reference to a task.  We could hold a reference to
	 * current, however QEMU uses this call path through vCPU threads,
	 * which can be killed resulting in a NULL mm and failure in the unmap
	 * path when called via a different thread.  Avoid this problem by
	 * using the group_leader as threads within the same group require
	 * both CLONE_THREAD and CLONE_VM and will therefore use the same
	 * mm_struct.