git.ipfire.org Git - thirdparty/linux.git/commitdiff
habanalabs: soft-reset device if context-switch fails
author: Oded Gabbay <oded.gabbay@gmail.com>
Thu, 28 Feb 2019 08:46:21 +0000 (10:46 +0200)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 28 Feb 2019 12:07:52 +0000 (13:07 +0100)
This patch fixes a bug in the driver where, if the TPC or MME remains
non-IDLE even after all the command submissions are done (due to a user bug
or a malicious user), future command submissions will fail in the
context-switch stage and the driver will remain in "stuck" mode.

The fix is to do a soft-reset of the device in case the context-switch
fails, because the device should be IDLE during context-switch. If it is
not IDLE, then something is wrong and we should reset the compute engines.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/misc/habanalabs/command_submission.c
drivers/misc/habanalabs/goya/goya.c

index 25ad9d805cfa03b8f610a02677f54e3ec332cab9..3525236ed8d9d702e25fac066926ba1933fe4edc 100644 (file)
@@ -622,13 +622,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
                                        "Failed to switch to context %d, rejecting CS! %d\n",
                                        ctx->asid, rc);
                                /*
-                                * If we timedout, we need to soft-reset because
-                                * QMAN is probably stuck. However, we can't
-                                * call to reset here directly because of
-                                * deadlock, so need to do it at the very end
-                                * of this function
+                                * If we timedout, or if the device is not IDLE
+                                * while we want to do context-switch (-EBUSY),
+                                * we need to soft-reset because QMAN is
+                                * probably stuck. However, we can't call to
+                                * reset here directly because of deadlock, so
+                                * need to do it at the very end of this
+                                * function
                                 */
-                               if (rc == -ETIMEDOUT)
+                               if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
                                        need_soft_reset = true;
                                mutex_unlock(&hpriv->restore_phase_mutex);
                                goto out;
@@ -706,7 +708,7 @@ out:
                args->out.seq = cs_seq;
        }
 
-       if ((rc == -ETIMEDOUT) && (need_soft_reset))
+       if (((rc == -ETIMEDOUT) || (rc == -EBUSY)) && (need_soft_reset))
                hl_device_reset(hdev, false, false);
 
        return rc;
index 39824214ce61ae370f50c6ed54e915160a8cc952..11597432f5195b982034cdad7e522049d53e8b66 100644 (file)
@@ -3138,7 +3138,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
        if (!hdev->asic_funcs->is_device_idle(hdev)) {
                dev_err_ratelimited(hdev->dev,
                        "Can't send KMD job on QMAN0 if device is not idle\n");
-               return -EFAULT;
+               return -EBUSY;
        }
 
        fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,