drm/amdkfd: Add user queue eviction restore SMI event
author    Philip Yang <Philip.Yang@amd.com>
Fri, 14 Jan 2022 02:24:20 +0000 (21:24 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 30 Jun 2022 19:31:14 +0000 (15:31 -0400)
Output user queue eviction and restore events. A user queue eviction may be
triggered by the SVM or userptr MMU notifier, TTM eviction, device suspend,
or CRIU checkpoint and restore.

User queue restore may be rescheduled if another eviction happens while the
restore is in progress (a consumer-side sketch of the new events follows the
changed-file list below).

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
drivers/gpu/drm/amd/amdkfd/kfd_svm.c

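Before the per-file diffs, a rough consumer-side sketch of the new events: it opens the existing KFD SMI event fd and prints the queue eviction/restore lines. This is only an illustration, not part of the change. It assumes the KFD_SMI_EVENT_QUEUE_EVICTION/RESTORE and eviction trigger values added to the uapi header earlier in this series, a gpu_id taken from the KFD topology (e.g. /sys/class/kfd/kfd/topology/nodes/<N>/gpu_id), and that the SMI fd accepts a raw 64-bit event mask on write, as kfd_smi_ev_write does today; the payload formats in the comments are taken from the format strings added in this patch.

/* Hedged sketch of a userspace listener for the new queue eviction/restore SMI events. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

int main(int argc, char **argv)
{
	struct kfd_ioctl_smi_events_args args = {0};
	uint64_t mask;
	char buf[1024];
	ssize_t n;
	int kfd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <gpu_id>\n", argv[0]);
		return 1;
	}
	args.gpuid = strtoul(argv[1], NULL, 0); /* gpu_id from KFD topology (assumed input) */

	kfd = open("/dev/kfd", O_RDWR | O_CLOEXEC);
	if (kfd < 0) {
		perror("open /dev/kfd");
		return 1;
	}

	/* Ask KFD for an anonymous SMI event fd for this GPU. */
	if (ioctl(kfd, AMDKFD_IOC_SMI_EVENTS, &args) < 0) {
		perror("AMDKFD_IOC_SMI_EVENTS");
		return 1;
	}

	/* Subscribe to the two new events; the SMI fd takes a raw 64-bit mask. */
	mask = KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_QUEUE_EVICTION) |
	       KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_QUEUE_RESTORE);
	if (write(args.anon_fd, &mask, sizeof(mask)) != sizeof(mask))
		perror("enable SMI events");

	/*
	 * Events arrive as text lines "<event_id_hex> <payload>\n". Per the
	 * format strings added in this patch, the payload is:
	 *   queue eviction:       "<ns_timestamp> -<pid> <gpu_id_hex> <trigger>"
	 *   queue restore:        "<ns_timestamp> -<pid> <gpu_id_hex>"
	 *   rescheduled restore:  "<ns_timestamp> -<pid> <gpu_id_hex> R"
	 */
	while ((n = read(args.anon_fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}

	close(args.anon_fd);
	close(kfd);
	return 0;
}

The trigger field in an eviction line corresponds to the KFD_QUEUE_EVICTION_TRIGGER_* / KFD_QUEUE_EVICTION_CRIU_* values used in the hunks below (SVM or userptr MMU notifier, TTM eviction, device suspend, CRIU checkpoint/restore).
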
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b25b41f5021389151260d15089c7367a83bf576b..73bf8b5f2aa9a4073fefdac97465e516cd26350a 100644
@@ -336,7 +336,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
 }
 #endif
 /* KGD2KFD callbacks */
-int kgd2kfd_quiesce_mm(struct mm_struct *mm);
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
 int kgd2kfd_resume_mm(struct mm_struct *mm);
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
                                                struct dma_fence *fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0036c9e405afdd2d39312de57d6295224122cb43..2fcc6e079769e5197bd1756c9188ba6a264e14ea 100644
@@ -32,6 +32,7 @@
 #include "amdgpu_dma_buf.h"
 #include <uapi/linux/kfd_ioctl.h>
 #include "amdgpu_xgmi.h"
+#include "kfd_smi_events.h"
 
 /* Userptr restore delay, just long enough to allow consecutive VM
  * changes to accumulate
@@ -2346,7 +2347,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
        evicted_bos = atomic_inc_return(&process_info->evicted_bos);
        if (evicted_bos == 1) {
                /* First eviction, stop the queues */
-               r = kgd2kfd_quiesce_mm(mm);
+               r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
                if (r)
                        pr_err("Failed to quiesce KFD\n");
                schedule_delayed_work(&process_info->restore_userptr_work,
@@ -2620,13 +2621,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
 
 unlock_out:
        mutex_unlock(&process_info->lock);
-       mmput(mm);
-       put_task_struct(usertask);
 
        /* If validation failed, reschedule another attempt */
-       if (evicted_bos)
+       if (evicted_bos) {
                schedule_delayed_work(&process_info->restore_userptr_work,
                        msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
+
+               kfd_smi_event_queue_restore_rescheduled(mm);
+       }
+       mmput(mm);
+       put_task_struct(usertask);
 }
 
 /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d07588230ed6013516b1adc597747f7113a1a2fb..2b3d8bc8f0aaeb2e243c43f75f40675fb3f039bb 100644
@@ -2434,7 +2434,7 @@ static int criu_restore(struct file *filep,
         * Set the process to evicted state to avoid running any new queues before all the memory
         * mappings are ready.
         */
-       ret = kfd_process_evict_queues(p);
+       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
        if (ret)
                goto exit_unlock;
 
@@ -2553,7 +2553,7 @@ static int criu_process_info(struct file *filep,
                goto err_unlock;
        }
 
-       ret = kfd_process_evict_queues(p);
+       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
        if (ret)
                goto err_unlock;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c8fee0dbfdcb9deeb48807982dc852f07dccb954..6ec0e9f0927d9e035a17ee6381f755d3cb6d92ea 100644
@@ -837,7 +837,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
        spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
 }
 
-int kgd2kfd_quiesce_mm(struct mm_struct *mm)
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
 {
        struct kfd_process *p;
        int r;
@@ -851,7 +851,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
                return -ESRCH;
 
        WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
-       r = kfd_process_evict_queues(p);
+       r = kfd_process_evict_queues(p, trigger);
 
        kfd_unref_process(p);
        return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 4c4bbd493caae58337bc11c4a6665379899cc7a3..d03a3b9c9c5d66cb532f4576e40c9ad7d1d5088b 100644
@@ -947,7 +947,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
 }
 
 void kfd_unref_process(struct kfd_process *p);
-int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
 int kfd_process_restore_queues(struct kfd_process *p);
 void kfd_suspend_all_processes(void);
 int kfd_resume_all_processes(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a13e60d48b735c9bb57bbe489a77935691d102dc..fc38a4d81420df9536033ba672baaa7e38333ce3 100644
@@ -43,6 +43,7 @@ struct mm_struct;
 #include "kfd_device_queue_manager.h"
 #include "kfd_iommu.h"
 #include "kfd_svm.h"
+#include "kfd_smi_events.h"
 
 /*
  * List of struct kfd_process (field kfd_process).
@@ -1736,7 +1737,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
  * Eviction is reference-counted per process-device. This means multiple
  * evictions from different sources can be nested safely.
  */
-int kfd_process_evict_queues(struct kfd_process *p)
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
 {
        int r = 0;
        int i;
@@ -1745,6 +1746,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
 
+               kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
+                                            trigger);
+
                r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
                                                            &pdd->qpd);
                /* evict return -EIO if HWS is hang or asic is resetting, in this case
@@ -1769,6 +1773,9 @@ fail:
 
                if (n_evicted == 0)
                        break;
+
+               kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
+
                if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
                                                              &pdd->qpd))
                        pr_err("Failed to restore queues\n");
@@ -1788,6 +1795,8 @@ int kfd_process_restore_queues(struct kfd_process *p)
        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
 
+               kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
+
                r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
                                                              &pdd->qpd);
                if (r) {
@@ -1849,7 +1858,7 @@ static void evict_process_worker(struct work_struct *work)
        flush_delayed_work(&p->restore_work);
 
        pr_debug("Started evicting pasid 0x%x\n", p->pasid);
-       ret = kfd_process_evict_queues(p);
+       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
        if (!ret) {
                dma_fence_signal(p->ef);
                dma_fence_put(p->ef);
@@ -1916,7 +1925,7 @@ void kfd_suspend_all_processes(void)
                cancel_delayed_work_sync(&p->eviction_work);
                cancel_delayed_work_sync(&p->restore_work);
 
-               if (kfd_process_evict_queues(p))
+               if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
                        pr_err("Failed to suspend process 0x%x\n", p->pasid);
                dma_fence_signal(p->ef);
                dma_fence_put(p->ef);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index ec4d278c2a471374bbbfece77318b05d948b29f8..3917c38204d099efa546facb592fb87dfdf5c528 100644
@@ -283,6 +283,41 @@ void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
                          from, to, trigger);
 }
 
+void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
+                                 uint32_t trigger)
+{
+       kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION,
+                         "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
+                         dev->id, trigger);
+}
+
+void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid)
+{
+       kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_RESTORE,
+                         "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
+                         dev->id);
+}
+
+void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
+{
+       struct kfd_process *p;
+       int i;
+
+       p = kfd_lookup_process_by_mm(mm);
+       if (!p)
+               return;
+
+       for (i = 0; i < p->n_pdds; i++) {
+               struct kfd_process_device *pdd = p->pdds[i];
+
+               kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
+                                 KFD_SMI_EVENT_QUEUE_RESTORE,
+                                 "%lld -%d %x %c\n", ktime_get_boottime_ns(),
+                                 p->lead_thread->pid, pdd->dev->id, 'R');
+       }
+       kfd_unref_process(p);
+}
+
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
 {
        struct kfd_smi_client *client;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index ec5d74a2fef4433187884a3730a9e18cbb981602..b23292637239a163f20c53d270d0b9eb1a743792 100644
@@ -42,4 +42,8 @@ void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid,
 void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
                             unsigned long start, unsigned long end,
                             uint32_t from, uint32_t to, uint32_t trigger);
+void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
+                                 uint32_t trigger);
+void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid);
+void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm);
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index e8ded7a02bcb1e0500e3affb343d25c26ee2f241..8bfb7b99e45d90acfc8993054ce064f4c544181b 100644
@@ -1730,14 +1730,16 @@ out_reschedule:
        mutex_unlock(&svms->lock);
        mmap_write_unlock(mm);
        mutex_unlock(&process_info->lock);
-       mmput(mm);
 
        /* If validation failed, reschedule another attempt */
        if (evicted_ranges) {
                pr_debug("reschedule to restore svm range\n");
                schedule_delayed_work(&svms->restore_work,
                        msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
+
+               kfd_smi_event_queue_restore_rescheduled(mm);
        }
+       mmput(mm);
 }
 
 /**
@@ -1793,7 +1795,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
                         prange->svms, prange->start, prange->last);
 
                /* First eviction, stop the queues */
-               r = kgd2kfd_quiesce_mm(mm);
+               r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
                if (r)
                        pr_debug("failed to quiesce KFD\n");