From 18b8413b25b7070fa2e55858a2c808e6909581d0 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ma=C3=ADra=20Canal?= Date: Thu, 30 Nov 2023 13:40:35 -0300 Subject: [PATCH] drm/v3d: Create a CPU job extension for an indirect CSD job MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit A CPU job is a type of job that performs operations that require CPU intervention. An indirect CSD job is a job that, when executed in the queue, will map the indirect buffer, read the dispatch parameters, and submit a regular dispatch. Therefore, it is a job that needs CPU intervention. So, create a user extension for the CPU job that enables the creation of an indirect CSD. This user extension will allow the creation of a CSD job linked to a CPU job. The CPU job will wait for the indirect CSD job dependencies and, once they are signaled, it will update the CSD job parameters. Co-developed-by: Melissa Wen Signed-off-by: Melissa Wen Signed-off-by: Maíra Canal Reviewed-by: Iago Toral Quiroga Link: https://patchwork.freedesktop.org/patch/msgid/20231130164420.932823-14-mcanal@igalia.com --- drivers/gpu/drm/v3d/v3d_drv.h | 31 ++++++++- drivers/gpu/drm/v3d/v3d_sched.c | 41 +++++++++++- drivers/gpu/drm/v3d/v3d_submit.c | 104 ++++++++++++++++++++++++++++++- include/uapi/drm/v3d_drm.h | 43 ++++++++++++- 4 files changed, 213 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index 39d62915cdd62..202c0d4b04a5f 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -316,12 +316,41 @@ struct v3d_csd_job { struct drm_v3d_submit_csd args; }; -enum v3d_cpu_job_type {}; +enum v3d_cpu_job_type { + V3D_CPU_JOB_TYPE_INDIRECT_CSD = 1, +}; + +struct v3d_indirect_csd_info { + /* Indirect CSD */ + struct v3d_csd_job *job; + + /* Clean cache job associated to the Indirect CSD job */ + struct v3d_job *clean_job; + + /* Offset within the BO where the workgroup counts are stored */ + u32 offset; + + /* 
Workgroups size */ + u32 wg_size; + + /* Indices of the uniforms with the workgroup dispatch counts + * in the uniform stream. + */ + u32 wg_uniform_offsets[3]; + + /* Indirect BO */ + struct drm_gem_object *indirect; + + /* Context of the Indirect CSD job */ + struct ww_acquire_ctx acquire_ctx; +}; struct v3d_cpu_job { struct v3d_job base; enum v3d_cpu_job_type job_type; + + struct v3d_indirect_csd_info indirect_csd; }; typedef void (*v3d_cpu_job_fn)(struct v3d_cpu_job *); diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index ebbd00840a73c..307257ab0e4a2 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -25,6 +25,8 @@ #include "v3d_regs.h" #include "v3d_trace.h" +#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 + static struct v3d_job * to_v3d_job(struct drm_sched_job *sched_job) { @@ -268,7 +270,44 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) return fence; } -static const v3d_cpu_job_fn cpu_job_function[] = { }; +static void +v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job) +{ + struct v3d_indirect_csd_info *indirect_csd = &job->indirect_csd; + struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]); + struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect); + struct drm_v3d_submit_csd *args = &indirect_csd->job->args; + u32 *wg_counts; + + v3d_get_bo_vaddr(bo); + v3d_get_bo_vaddr(indirect); + + wg_counts = (uint32_t *)(bo->vaddr + indirect_csd->offset); + + if (wg_counts[0] == 0 || wg_counts[1] == 0 || wg_counts[2] == 0) + return; + + args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; + args->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) * + (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; + + for (int i = 0; i < 3; i++) { + /* 0xffffffff indicates that the uniform rewrite is not needed */ + if (indirect_csd->wg_uniform_offsets[i] != 
0xffffffff) { + u32 uniform_idx = indirect_csd->wg_uniform_offsets[i]; + ((uint32_t *)indirect->vaddr)[uniform_idx] = wg_counts[i]; + } + } + + v3d_put_bo_vaddr(indirect); + v3d_put_bo_vaddr(bo); +} + +static const v3d_cpu_job_fn cpu_job_function[] = { + [V3D_CPU_JOB_TYPE_INDIRECT_CSD] = v3d_rewrite_csd_job_wg_counts_from_indirect, +}; static struct dma_fence * v3d_cpu_job_run(struct drm_sched_job *sched_job) diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c index eb26fe1e27e3a..0320695b941b7 100644 --- a/drivers/gpu/drm/v3d/v3d_submit.c +++ b/drivers/gpu/drm/v3d/v3d_submit.c @@ -391,6 +391,48 @@ v3d_get_multisync_submit_deps(struct drm_file *file_priv, return 0; } +/* Get data for the indirect CSD job submission. */ +static int +v3d_get_cpu_indirect_csd_params(struct drm_file *file_priv, + struct drm_v3d_extension __user *ext, + struct v3d_cpu_job *job) +{ + struct v3d_file_priv *v3d_priv = file_priv->driver_priv; + struct v3d_dev *v3d = v3d_priv->v3d; + struct drm_v3d_indirect_csd indirect_csd; + struct v3d_indirect_csd_info *info = &job->indirect_csd; + + if (!job) { + DRM_DEBUG("CPU job extension was attached to a GPU job.\n"); + return -EINVAL; + } + + if (job->job_type) { + DRM_DEBUG("Two CPU job extensions were added to the same CPU job.\n"); + return -EINVAL; + } + + if (copy_from_user(&indirect_csd, ext, sizeof(indirect_csd))) + return -EFAULT; + + if (!v3d_has_csd(v3d)) { + DRM_DEBUG("Attempting CSD submit on non-CSD hardware.\n"); + return -EINVAL; + } + + job->job_type = V3D_CPU_JOB_TYPE_INDIRECT_CSD; + info->offset = indirect_csd.offset; + info->wg_size = indirect_csd.wg_size; + memcpy(&info->wg_uniform_offsets, &indirect_csd.wg_uniform_offsets, + sizeof(indirect_csd.wg_uniform_offsets)); + + info->indirect = drm_gem_object_lookup(file_priv, indirect_csd.indirect); + + return v3d_setup_csd_jobs_and_bos(file_priv, v3d, &indirect_csd.submit, + &info->job, &info->clean_job, + NULL, &info->acquire_ctx); +} + /* Whenever 
userspace sets ioctl extensions, v3d_get_extensions parses data * according to the extension id (name). */ @@ -416,6 +458,9 @@ v3d_get_extensions(struct drm_file *file_priv, case DRM_V3D_EXT_ID_MULTI_SYNC: ret = v3d_get_multisync_submit_deps(file_priv, user_ext, se); break; + case DRM_V3D_EXT_ID_CPU_INDIRECT_CSD: + ret = v3d_get_cpu_indirect_csd_params(file_priv, user_ext, job); + break; default: DRM_DEBUG_DRIVER("Unknown extension id: %d\n", ext.id); return -EINVAL; @@ -790,7 +835,9 @@ fail: return ret; } -static const unsigned int cpu_job_bo_handle_count[] = { }; +static const unsigned int cpu_job_bo_handle_count[] = { + [V3D_CPU_JOB_TYPE_INDIRECT_CSD] = 1, +}; /** * v3d_submit_cpu_ioctl() - Submits a CPU job to the V3D. @@ -808,7 +855,10 @@ v3d_submit_cpu_ioctl(struct drm_device *dev, void *data, struct v3d_dev *v3d = to_v3d_dev(dev); struct drm_v3d_submit_cpu *args = data; struct v3d_submit_ext se = {0}; + struct v3d_submit_ext *out_se = NULL; struct v3d_cpu_job *cpu_job = NULL; + struct v3d_csd_job *csd_job = NULL; + struct v3d_job *clean_job = NULL; struct ww_acquire_ctx acquire_ctx; int ret; @@ -847,6 +897,9 @@ v3d_submit_cpu_ioctl(struct drm_device *dev, void *data, if (ret) goto fail; + clean_job = cpu_job->indirect_csd.clean_job; + csd_job = cpu_job->indirect_csd.job; + if (args->bo_handle_count) { ret = v3d_lookup_bos(dev, file_priv, &cpu_job->base, args->bo_handles, args->bo_handle_count); @@ -860,19 +913,66 @@ v3d_submit_cpu_ioctl(struct drm_device *dev, void *data, mutex_lock(&v3d->sched_lock); v3d_push_job(&cpu_job->base); + + switch (cpu_job->job_type) { + case V3D_CPU_JOB_TYPE_INDIRECT_CSD: + ret = drm_sched_job_add_dependency(&csd_job->base.base, + dma_fence_get(cpu_job->base.done_fence)); + if (ret) + goto fail_unreserve; + + v3d_push_job(&csd_job->base); + + ret = drm_sched_job_add_dependency(&clean_job->base, + dma_fence_get(csd_job->base.done_fence)); + if (ret) + goto fail_unreserve; + + v3d_push_job(clean_job); + + break; + default: + break; 
+ } mutex_unlock(&v3d->sched_lock); + out_se = (cpu_job->job_type == V3D_CPU_JOB_TYPE_INDIRECT_CSD) ? NULL : &se; + v3d_attach_fences_and_unlock_reservation(file_priv, &cpu_job->base, &acquire_ctx, 0, - NULL, cpu_job->base.done_fence); + out_se, cpu_job->base.done_fence); + + switch (cpu_job->job_type) { + case V3D_CPU_JOB_TYPE_INDIRECT_CSD: + v3d_attach_fences_and_unlock_reservation(file_priv, + clean_job, + &cpu_job->indirect_csd.acquire_ctx, + 0, &se, clean_job->done_fence); + break; + default: + break; + } v3d_job_put(&cpu_job->base); + v3d_job_put(&csd_job->base); + v3d_job_put(clean_job); return 0; +fail_unreserve: + mutex_unlock(&v3d->sched_lock); + + drm_gem_unlock_reservations(cpu_job->base.bo, cpu_job->base.bo_count, + &acquire_ctx); + + drm_gem_unlock_reservations(clean_job->bo, clean_job->bo_count, + &cpu_job->indirect_csd.acquire_ctx); + fail: v3d_job_cleanup((void *)cpu_job); + v3d_job_cleanup((void *)csd_job); + v3d_job_cleanup(clean_job); v3d_put_multisync_post_deps(&se); return ret; diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h index 00abef9d0db73..0c0f477825282 100644 --- a/include/uapi/drm/v3d_drm.h +++ b/include/uapi/drm/v3d_drm.h @@ -71,7 +71,8 @@ extern "C" { struct drm_v3d_extension { __u64 next; __u32 id; -#define DRM_V3D_EXT_ID_MULTI_SYNC 0x01 +#define DRM_V3D_EXT_ID_MULTI_SYNC 0x01 +#define DRM_V3D_EXT_ID_CPU_INDIRECT_CSD 0x02 __u32 flags; /* mbz */ }; @@ -365,8 +366,46 @@ struct drm_v3d_submit_csd { __u32 pad; }; +/** + * struct drm_v3d_indirect_csd - ioctl extension for the CPU job to create an + * indirect CSD + * + * When an extension of DRM_V3D_EXT_ID_CPU_INDIRECT_CSD id is defined, it + * points to this extension to define a indirect CSD submission. It creates a + * CPU job linked to a CSD job. The CPU job waits for the indirect CSD + * dependencies and, once they are signaled, it updates the CSD job config + * before allowing the CSD job execution. 
+ */ +struct drm_v3d_indirect_csd { + struct drm_v3d_extension base; + + /* Indirect CSD */ + struct drm_v3d_submit_csd submit; + + /* Handle of the indirect BO, that should be also attached to the + * indirect CSD. + */ + __u32 indirect; + + /* Offset within the BO where the workgroup counts are stored */ + __u32 offset; + + /* Workgroups size */ + __u32 wg_size; + + /* Indices of the uniforms with the workgroup dispatch counts + * in the uniform stream. If the uniform rewrite is not needed, + * the offset must be 0xffffffff. + */ + __u32 wg_uniform_offsets[3]; +}; + struct drm_v3d_submit_cpu { - /* Pointer to a u32 array of the BOs that are referenced by the job. */ + /* Pointer to a u32 array of the BOs that are referenced by the job. + * + * For DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, it must contain only one BO, + * that contains the workgroup counts. + */ __u64 bo_handles; /* Number of BO handles passed in (size is that times 4). */ -- 2.47.3