Commit aa32f116 authored by Linus Torvalds
Browse files

Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull hmm updates from Jason Gunthorpe:
 "This is another round of bug fixing and cleanup. This time the focus
  is on the driver pattern to use mmu notifiers to monitor a VA range.
  This code is lifted out of many drivers and hmm_mirror directly into
  the mmu_notifier core and written using the best ideas from all the
  driver implementations.

  This removes many bugs from the drivers and has a very pleasing
  diffstat. More drivers can still be converted, but that is for another
  cycle.

   - A shared branch with RDMA reworking the RDMA ODP implementation

   - New mmu_interval_notifier API. This is focused on the use case of
     monitoring a VA and simplifies the process for drivers

   - A common seq-count locking scheme built into the
     mmu_interval_notifier API usable by drivers that call
     get_user_pages() or hmm_range_fault() with the VA range

   - Conversion of mlx5 ODP, hfi1, radeon, nouveau, AMD GPU,...
parents d5bb349d 93f4e735
......@@ -147,49 +147,16 @@ Address space mirroring implementation and API
Address space mirroring's main objective is to allow duplication of a range of
CPU page table into a device page table; HMM helps keep both synchronized. A
device driver that wants to mirror a process address space must start with the
registration of an hmm_mirror struct::
int hmm_mirror_register(struct hmm_mirror *mirror,
struct mm_struct *mm);
The mirror struct has a set of callbacks that are used
to propagate CPU page tables::
struct hmm_mirror_ops {
/* release() - release hmm_mirror
*
* @mirror: pointer to struct hmm_mirror
*
* This is called when the mm_struct is being released. The callback
* must ensure that all access to any pages obtained from this mirror
* is halted before the callback returns. All future access should
* fault.
*/
void (*release)(struct hmm_mirror *mirror);
/* sync_cpu_device_pagetables() - synchronize page tables
*
* @mirror: pointer to struct hmm_mirror
* @update: update information (see struct mmu_notifier_range)
* Return: -EAGAIN if update.blockable is false and the callback needs to
* block, 0 otherwise.
*
* This callback ultimately originates from mmu_notifiers when the CPU
* page table is updated. The device driver must update its page table
* in response to this callback. The update argument tells what action
* to perform.
*
* The device driver must not return from this callback until the device
* page tables are completely updated (TLBs flushed, etc); this is a
* synchronous call.
*/
int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
const struct hmm_update *update);
};
The device driver must perform the update action to the range (mark range
read only, or fully unmap, etc.). The device must complete the update before
the driver callback returns.
registration of a mmu_interval_notifier::
mni->ops = &driver_ops;
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
unsigned long start, unsigned long length,
struct mm_struct *mm);
During the driver_ops->invalidate() callback the device driver must perform
the update action to the range (mark range read only, or fully unmap,
etc.). The device must complete the update before the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
use::
......@@ -216,70 +183,46 @@ The usage pattern is::
struct hmm_range range;
...
range.notifier = &mni;
range.start = ...;
range.end = ...;
range.pfns = ...;
range.flags = ...;
range.values = ...;
range.pfn_shift = ...;
hmm_range_register(&range, mirror);
/*
* Just wait for range to be valid, safe to ignore return value as we
* will use the return value of hmm_range_fault() below under the
* mmap_sem to ascertain the validity of the range.
*/
hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
if (!mmget_not_zero(mni->notifier.mm))
return -EFAULT;
again:
range.notifier_seq = mmu_interval_read_begin(&mni);
down_read(&mm->mmap_sem);
ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
if (ret) {
up_read(&mm->mmap_sem);
if (ret == -EBUSY) {
/*
* No need to check hmm_range_wait_until_valid() return value
* on retry we will get proper error with hmm_range_fault()
*/
hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
goto again;
}
hmm_range_unregister(&range);
if (ret == -EBUSY)
goto again;
return ret;
}
up_read(&mm->mmap_sem);
take_lock(driver->update);
if (!hmm_range_valid(&range)) {
if (mmu_interval_read_retry(&mni, range.notifier_seq)) {
release_lock(driver->update);
up_read(&mm->mmap_sem);
goto again;
}
// Use pfns array content to update device page table
/* Use pfns array content to update device page table,
* under the update lock */
hmm_range_unregister(&range);
release_lock(driver->update);
up_read(&mm->mmap_sem);
return 0;
}
The driver->update lock is the same lock that the driver takes inside its
sync_cpu_device_pagetables() callback. That lock must be held before calling
hmm_range_valid() to avoid any race with a concurrent CPU page table update.
HMM implements all this on top of the mmu_notifier API because we wanted a
simpler API and also to be able to perform optimizations later on like doing
concurrent device updates in multi-devices scenario.
HMM also serves as an impedance mismatch between how CPU page table updates
are done (by CPU write to the page table and TLB flushes) and how devices
update their own page table. Device updates are a multi-step process. First,
appropriate commands are written to a buffer, then this buffer is scheduled for
execution on the device. It is only once the device has executed commands in
the buffer that the update is done. Creating and scheduling the update command
buffer can happen concurrently for multiple devices. Waiting for each device to
report commands as executed is serialized (there is no point in doing this
concurrently).
invalidate() callback. That lock must be held before calling
mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
update.
Leverage default_flags and pfn_flags_mask
=========================================
......
......@@ -967,6 +967,8 @@ struct amdgpu_device {
struct mutex lock_reset;
struct amdgpu_doorbell_index doorbell_index;
struct mutex notifier_lock;
int asic_reset_res;
struct work_struct xgmi_reset_work;
......
......@@ -505,8 +505,7 @@ static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
*
* Returns 0 for success, negative errno for errors.
*/
static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
uint64_t user_addr)
static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
{
struct amdkfd_process_info *process_info = mem->process_info;
struct amdgpu_bo *bo = mem->bo;
......@@ -1199,7 +1198,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);
if (user_addr) {
ret = init_user_pages(*mem, current->mm, user_addr);
ret = init_user_pages(*mem, user_addr);
if (ret)
goto allocate_init_user_pages_failed;
}
......@@ -1744,6 +1743,10 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
return ret;
}
/*
* FIXME: Cannot ignore the return code, must hold
* notifier_lock
*/
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
/* Mark the BO as valid unless it was invalidated
......
......@@ -538,8 +538,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
e->tv.num_shared = 2;
amdgpu_bo_list_get_list(p->bo_list, &p->validated);
if (p->bo_list->first_userptr != p->bo_list->num_entries)
p->mn = amdgpu_mn_get(p->adev, AMDGPU_MN_TYPE_GFX);
INIT_LIST_HEAD(&duplicates);
amdgpu_vm_get_pd_bo(&fpriv->vm, &p->validated, &p->vm_pd);
......@@ -1219,11 +1217,11 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
if (r)
goto error_unlock;
/* No memory allocation is allowed while holding the mn lock.
* p->mn is held until amdgpu_cs_submit is finished and fence is added
* to BOs.
/* No memory allocation is allowed while holding the notifier lock.
* The lock is held until amdgpu_cs_submit is finished and fence is
* added to BOs.
*/
amdgpu_mn_lock(p->mn);
mutex_lock(&p->adev->notifier_lock);
/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
* -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
......@@ -1266,13 +1264,13 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
amdgpu_mn_unlock(p->mn);
mutex_unlock(&p->adev->notifier_lock);
return 0;
error_abort:
drm_sched_job_cleanup(&job->base);
amdgpu_mn_unlock(p->mn);
mutex_unlock(&p->adev->notifier_lock);
error_unlock:
amdgpu_job_free(job);
......
......@@ -2794,6 +2794,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
mutex_init(&adev->lock_reset);
mutex_init(&adev->notifier_lock);
mutex_init(&adev->virt.dpm_mutex);
mutex_init(&adev->psp.mutex);
......
......@@ -51,439 +51,107 @@
#include "amdgpu_amdkfd.h"
/**
* struct amdgpu_mn_node
* amdgpu_mn_invalidate_gfx - callback to notify about mm change
*
* @it: interval node defining start-last of the affected address range
* @bos: list of all BOs in the affected address range
*
* Manages all BOs which are affected by a certain range of address space.
*/
struct amdgpu_mn_node {
struct interval_tree_node it;
struct list_head bos;
};
/**
* amdgpu_mn_destroy - destroy the HMM mirror
*
* @work: previously scheduled work item
*
* Lazily destroys the notifier from a work item
*/
static void amdgpu_mn_destroy(struct work_struct *work)
{
struct amdgpu_mn *amn = container_of(work, struct amdgpu_mn, work);
struct amdgpu_device *adev = amn->adev;
struct amdgpu_mn_node *node, *next_node;
struct amdgpu_bo *bo, *next_bo;
mutex_lock(&adev->mn_lock);
down_write(&amn->lock);
hash_del(&amn->node);
rbtree_postorder_for_each_entry_safe(node, next_node,
&amn->objects.rb_root, it.rb) {
list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
bo->mn = NULL;
list_del_init(&bo->mn_list);
}
kfree(node);
}
up_write(&amn->lock);
mutex_unlock(&adev->mn_lock);
hmm_mirror_unregister(&amn->mirror);
kfree(amn);
}
/**
* amdgpu_hmm_mirror_release - callback to notify about mm destruction
*
* @mirror: the HMM mirror (mm) this callback is about
*
* Schedule a work item to lazily destroy the HMM mirror.
*/
static void amdgpu_hmm_mirror_release(struct hmm_mirror *mirror)
{
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
INIT_WORK(&amn->work, amdgpu_mn_destroy);
schedule_work(&amn->work);
}
/**
* amdgpu_mn_lock - take the write side lock for this notifier
*
* @mn: our notifier
*/
void amdgpu_mn_lock(struct amdgpu_mn *mn)
{
if (mn)
down_write(&mn->lock);
}
/**
* amdgpu_mn_unlock - drop the write side lock for this notifier
*
* @mn: our notifier
*/
void amdgpu_mn_unlock(struct amdgpu_mn *mn)
{
if (mn)
up_write(&mn->lock);
}
/**
* amdgpu_mn_read_lock - take the read side lock for this notifier
*
* @amn: our notifier
* @blockable: is the notifier blockable
*/
static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable)
{
if (blockable)
down_read(&amn->lock);
else if (!down_read_trylock(&amn->lock))
return -EAGAIN;
return 0;
}
/**
* amdgpu_mn_read_unlock - drop the read side lock for this notifier
*
* @amn: our notifier
*/
static void amdgpu_mn_read_unlock(struct amdgpu_mn *amn)
{
up_read(&amn->lock);
}
/**
* amdgpu_mn_invalidate_node - unmap all BOs of a node
*
* @node: the node with the BOs to unmap
* @start: start of address range affected
* @end: end of address range affected
* @mni: the range (mm) is about to update
* @range: details on the invalidation
* @cur_seq: Value to pass to mmu_interval_set_seq()
*
* Block for operations on BOs to finish and mark pages as accessed and
* potentially dirty.
*/
static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
unsigned long start,
unsigned long end)
static bool amdgpu_mn_invalidate_gfx(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
struct amdgpu_bo *bo;
struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier);
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
long r;
list_for_each_entry(bo, &node->bos, mn_list) {
if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end))
continue;
r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
true, false, MAX_SCHEDULE_TIMEOUT);
if (r <= 0)
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
}
}
/**
* amdgpu_mn_sync_pagetables_gfx - callback to notify about mm change
*
* @mirror: the hmm_mirror (mm) is about to update
* @update: the update start, end address
*
* Block for operations on BOs to finish and mark pages as accessed and
* potentially dirty.
*/
static int
amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
const struct mmu_notifier_range *update)
{
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
unsigned long start = update->start;
unsigned long end = update->end;
bool blockable = mmu_notifier_range_blockable(update);
struct interval_tree_node *it;
if (!mmu_notifier_range_blockable(range))
return false;
/* notification is exclusive, but interval is inclusive */
end -= 1;
mutex_lock(&adev->notifier_lock);
/* TODO we should be able to split locking for interval tree and
* amdgpu_mn_invalidate_node
*/
if (amdgpu_mn_read_lock(amn, blockable))
return -EAGAIN;
mmu_interval_set_seq(mni, cur_seq);
it = interval_tree_iter_first(&amn->objects, start, end);
while (it) {
struct amdgpu_mn_node *node;
if (!blockable) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
node = container_of(it, struct amdgpu_mn_node, it);
it = interval_tree_iter_next(it, start, end);
amdgpu_mn_invalidate_node(node, start, end);
}
amdgpu_mn_read_unlock(amn);
return 0;
}
/**
* amdgpu_mn_sync_pagetables_hsa - callback to notify about mm change
*
* @mirror: the hmm_mirror (mm) is about to update
* @update: the update start, end address
*
* We temporarily evict all BOs between start and end. This
* necessitates evicting all user-mode queues of the process. The BOs
* are restored in amdgpu_mn_invalidate_range_end_hsa.
*/
static int
amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
const struct mmu_notifier_range *update)
{
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
unsigned long start = update->start;
unsigned long end = update->end;
bool blockable = mmu_notifier_range_blockable(update);
struct interval_tree_node *it;
/* notification is exclusive, but interval is inclusive */
end -= 1;
if (amdgpu_mn_read_lock(amn, blockable))
return -EAGAIN;
it = interval_tree_iter_first(&amn->objects, start, end);
while (it) {
struct amdgpu_mn_node *node;
struct amdgpu_bo *bo;
if (!blockable) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
node = container_of(it, struct amdgpu_mn_node, it);
it = interval_tree_iter_next(it, start, end);
list_for_each_entry(bo, &node->bos, mn_list) {
struct kgd_mem *mem = bo->kfd_bo;
if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
start, end))
amdgpu_amdkfd_evict_userptr(mem, amn->mm);
}
}
amdgpu_mn_read_unlock(amn);
return 0;
r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, true, false,
MAX_SCHEDULE_TIMEOUT);
mutex_unlock(&adev->notifier_lock);
if (r <= 0)
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
return true;
}
/* Low bits of any reasonable mm pointer will be unused due to struct
* alignment. Use these bits to make a unique key from the mm pointer
* and notifier type.
*/
#define AMDGPU_MN_KEY(mm, type) ((unsigned long)(mm) + (type))
static struct hmm_mirror_ops amdgpu_hmm_mirror_ops[] = {
[AMDGPU_MN_TYPE_GFX] = {
.sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_gfx,
.release = amdgpu_hmm_mirror_release
},
[AMDGPU_MN_TYPE_HSA] = {
.sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_hsa,
.release = amdgpu_hmm_mirror_release
},
static const struct mmu_interval_notifier_ops amdgpu_mn_gfx_ops = {
.invalidate = amdgpu_mn_invalidate_gfx,
};
/**
* amdgpu_mn_get - create HMM mirror context
* amdgpu_mn_invalidate_hsa - callback to notify about mm change
*
* @adev: amdgpu device pointer
* @type: type of MMU notifier context
* @mni: the range (mm) is about to update
* @range: details on the invalidation
* @cur_seq: Value to pass to mmu_interval_set_seq()
*
* Creates a HMM mirror context for current->mm.
* We temporarily evict the BO attached to this range. This necessitates
* evicting all user-mode queues of the process.
*/
struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
enum amdgpu_mn_type type)
static bool amdgpu_mn_invalidate_hsa(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
struct mm_struct *mm = current->mm;
struct amdgpu_mn *amn;
unsigned long key = AMDGPU_MN_KEY(mm, type);
int r;
mutex_lock(&adev->mn_lock);
if (down_write_killable(&mm->mmap_sem)) {
mutex_unlock(&adev->mn_lock);
return ERR_PTR(-EINTR);
}
hash_for_each_possible(adev->mn_hash, amn, node, key)
if (AMDGPU_MN_KEY(amn->mm, amn->type) == key)
goto release_locks;
amn = kzalloc(sizeof(*amn), GFP_KERNEL);
if (!amn) {
amn = ERR_PTR(-ENOMEM);
goto release_locks;
}
amn->adev = adev;
amn->mm = mm;
init_rwsem(&amn->lock);
amn->type = type;
amn->objects = RB_ROOT_CACHED;
amn->mirror.ops = &amdgpu_hmm_mirror_ops[type];
r = hmm_mirror_register(&amn->mirror, mm);
if (r)
goto free_amn;
struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier);
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
hash_add(adev->mn_hash, &amn->node, AMDGPU_MN_KEY(mm, type));
if (!mmu_notifier_range_blockable(range))
return false;
release_locks:
up_write(&mm->mmap_sem);
mutex_unlock(&adev->mn_lock);
mutex_lock(&adev->notifier_lock);
return amn;
mmu_interval_set_seq(mni, cur_seq);
free_amn:
up_write(&mm->mmap_sem);
mutex_unlock(&adev->mn_lock);
kfree(amn);
amdgpu_amdkfd_evict_userptr(bo->kfd_bo, bo->notifier.mm);
mutex_unlock(&adev->notifier_lock);
return ERR_PTR(r);
return true;
}
static const struct mmu_interval_notifier_ops amdgpu_mn_hsa_ops = {
.invalidate = amdgpu_mn_invalidate_hsa,
};
/**
* amdgpu_mn_register - register a BO for notifier updates
*
* @bo: amdgpu buffer object
* @addr: userptr addr we should monitor
*
* Registers an HMM mirror for the given BO at the specified address.
* Registers a mmu_notifier for the given BO at the specified address.
* Returns 0 on success, -ERRNO if anything goes wrong.
*/
int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
{
unsigned long end = addr + amdgpu_bo_size(bo) - 1;
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
enum amdgpu_mn_type type =
bo->kfd_bo ? AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX;
struct amdgpu_mn *amn;