drm/sched: Add fair scheduling policy

author Tvrtko Ursulin <tvrtko.ursulin@igalia.com>

Fri, 17 Apr 2026 10:37:25 +0000 (11:37 +0100)

committer Philipp Stanner <phasta@kernel.org>

Fri, 17 Apr 2026 12:43:28 +0000 (14:43 +0200)
author Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Fri, 17 Apr 2026 10:37:25 +0000 (11:37 +0100)
committer Philipp Stanner <phasta@kernel.org>
Fri, 17 Apr 2026 12:43:28 +0000 (14:43 +0200)
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c

index d1571e2b51928c1a03eab496fe2d6179d553f66e..b57bafc42febd1a4690ce4024a9c691797f389d3 100644 (file)
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -126,6 +126,8 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
         entity->guilty = guilty;
         entity->priority = priority;
         entity->last_user = current->group_leader;
+       entity->rq_priority = drm_sched_policy == DRM_SCHED_POLICY_FAIR ?
+                             DRM_SCHED_PRIORITY_KERNEL : priority;
         entity->num_sched_list = num_sched_list;
         entity->sched_list = num_sched_list > 1 ? sched_list : NULL;
         RCU_INIT_POINTER(entity->last_scheduled, NULL);
@@ -138,17 +140,29 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
                  */
                 pr_warn("%s: called with uninitialized scheduler\n", __func__);
         } else {
-               /* The "priority" of an entity cannot exceed the number of run-queues of a
-                * scheduler. Protect against num_rqs being 0, by converting to signed. Choose
-                * the lowest priority available.
+               enum drm_sched_priority p = entity->priority;
+
+               /*
+                * The "priority" of an entity cannot exceed the number of
+                * run-queues of a scheduler. Protect against num_rqs being 0,
+                * by converting to signed. Choose the lowest priority
+                * available.
                  */
-               if (entity->priority >= sched_list[0]->num_rqs) {
-                       dev_err(sched_list[0]->dev, "entity has out-of-bounds priority: %u. num_rqs: %u\n",
-                               entity->priority, sched_list[0]->num_rqs);
-                       entity->priority = max_t(s32, (s32) sched_list[0]->num_rqs - 1,
-                                                (s32) DRM_SCHED_PRIORITY_KERNEL);
+               if (p >= sched_list[0]->num_user_rqs) {
+                       dev_err(sched_list[0]->dev, "entity with out-of-bounds priority:%u num_user_rqs:%u\n",
+                               p, sched_list[0]->num_user_rqs);
+                       p = max_t(s32,
+                                 (s32)sched_list[0]->num_user_rqs - 1,
+                                 (s32)DRM_SCHED_PRIORITY_KERNEL);
+                       entity->priority = p;
+                       /*
+                        * Outside of the FAIR policy the run-queue index
+                        * mirrors the priority, so clamp it as well.
+                        */
+                       if (drm_sched_policy != DRM_SCHED_POLICY_FAIR)
+                               entity->rq_priority = p;
                 }
-               entity->rq = sched_list[0]->sched_rq[entity->priority];
+               entity->rq = sched_list[0]->sched_rq[entity->rq_priority];
         }
  
         init_completion(&entity->entity_idle);
@@ -594,7 +602,7 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
  
         spin_lock(&entity->lock);
         sched = drm_sched_pick_best(entity->sched_list, entity->num_sched_list);
-       rq = sched ? sched->sched_rq[entity->priority] : NULL;
+       rq = sched ? sched->sched_rq[entity->rq_priority] : NULL;
         if (rq != entity->rq) {
                 drm_sched_rq_remove_entity(entity->rq, entity);
                 entity->rq = rq;
diff --git a/drivers/gpu/drm/scheduler/sched_internal.h b/drivers/gpu/drm/scheduler/sched_internal.h

index 743a2cc43702dde029ecb92e44241061c675ea91..64acd4b7a918f26693a686ef2d8ff977e3cd56ab 100644 (file)
--- a/drivers/gpu/drm/scheduler/sched_internal.h
+++ b/drivers/gpu/drm/scheduler/sched_internal.h
@@ -12,6 +12,8 @@
   * @kref: reference count for the object.
   * @lock: lock guarding the @runtime updates.
   * @runtime: time entity spent on the GPU.
+ * @prev_runtime: previous @runtime used to get the runtime delta.
+ * @vruntime: virtual runtime as accumulated by the fair algorithm.
   *
   * Because jobs and entities have decoupled lifetimes, ie. we cannot access the
   * entity once the job has been de-queued, and we do need know how much GPU time
@@ -22,6 +24,8 @@ struct drm_sched_entity_stats {
         struct kref     kref;
         spinlock_t      lock; /* Protects the below fields. */
         ktime_t         runtime;
+       ktime_t         prev_runtime;
+       ktime_t         vruntime;
  };
  
  /* Used to choose between FIFO and RR job-scheduling */
@@ -29,6 +33,7 @@ extern int drm_sched_policy;
  
  #define DRM_SCHED_POLICY_RR    0
  #define DRM_SCHED_POLICY_FIFO  1
+#define DRM_SCHED_POLICY_FAIR  2
  
  bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
                          struct drm_sched_entity *entity);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c

index 826ef4e0180a28442e6100e1f55cf2c15e849121..0aca41b4e334eb450629e7b14c04e516f2e1f637 100644 (file)
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -90,7 +90,7 @@ int drm_sched_policy = DRM_SCHED_POLICY_FIFO;
   * DOC: sched_policy (int)
   * Used to override default entities scheduling policy in a run queue.
   */
-MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
+MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default), " __stringify(DRM_SCHED_POLICY_FAIR) " = Fair.");
  module_param_named(sched_policy, drm_sched_policy, int, 0444);
  
  static u32 drm_sched_available_credits(struct drm_gpu_scheduler *sched)
@@ -1141,11 +1141,13 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
                 sched->own_submit_wq = true;
         }
  
-       sched->sched_rq = kmalloc_objs(*sched->sched_rq, args->num_rqs,
-                                      GFP_KERNEL | __GFP_ZERO);
+       sched->num_user_rqs = args->num_rqs;
+       sched->num_rqs = drm_sched_policy != DRM_SCHED_POLICY_FAIR ?
+                        args->num_rqs : 1;
+       sched->sched_rq = kzalloc_objs(*sched->sched_rq, args->num_rqs);
         if (!sched->sched_rq)
                 goto Out_check_own;
-       sched->num_rqs = args->num_rqs;
+
         for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
                 sched->sched_rq[i] = kzalloc_obj(*sched->sched_rq[i]);
                 if (!sched->sched_rq[i])
@@ -1258,7 +1260,7 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
         if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
                 atomic_inc(&bad->karma);
  
-               for (i = DRM_SCHED_PRIORITY_HIGH; i < sched->num_rqs; i++) {
+               for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
                         struct drm_sched_rq *rq = sched->sched_rq[i];
  
                         spin_lock(&rq->lock);
diff --git a/drivers/gpu/drm/scheduler/sched_rq.c b/drivers/gpu/drm/scheduler/sched_rq.c

index 067083a59d59f6f27d8528ae32ea6422866df94a..8fec2d0c5a439be29d794ec6dc5f2748381a86c4 100644 (file)
--- a/drivers/gpu/drm/scheduler/sched_rq.c
+++ b/drivers/gpu/drm/scheduler/sched_rq.c
@@ -20,6 +20,35 @@ drm_sched_entity_compare_before(struct rb_node *a, const struct rb_node *b)
         return ktime_before(ea->oldest_job_waiting, eb->oldest_job_waiting);
  }
  
+/* Cache the priority of the entity at the head of the run-queue tree. */
+static void drm_sched_rq_update_prio(struct drm_sched_rq *rq)
+{
+       struct drm_sched_entity *head;
+       struct rb_node *first;
+
+       lockdep_assert_held(&rq->lock);
+
+       first = rb_first_cached(&rq->rb_tree_root);
+       if (!first) {
+               /* Empty tree, so there is no head priority to cache. */
+               rq->head_prio = DRM_SCHED_PRIORITY_INVALID;
+               return;
+       }
+
+       head = rb_entry(first, typeof(*head), rb_tree_node);
+
+       /*
+        * Entities are locked before run-queues, so taking the lock of the
+        * head entity here would invert that order whenever the head is not
+        * the entity our caller already holds locked. Reading the priority
+        * without the lock is good enough: the worst case is observing a
+        * priority change slightly late, which is only a transient for the
+        * algorithm and only reachable by drivers which let userspace
+        * change priorities dynamically.
+        */
+       rq->head_prio = READ_ONCE(head->priority);
+}
+
  static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity,
                                             struct drm_sched_rq *rq)
  {
@@ -29,6 +58,7 @@ static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity,
         if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
                 rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
                 RB_CLEAR_NODE(&entity->rb_tree_node);
+               drm_sched_rq_update_prio(rq);
         }
  }
  
@@ -50,6 +80,7 @@ static void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
  
         rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
                       drm_sched_entity_compare_before);
+       drm_sched_rq_update_prio(rq);
  }
  
  /**
@@ -66,6 +97,133 @@ void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
         INIT_LIST_HEAD(&rq->entities);
         rq->rb_tree_root = RB_ROOT_CACHED;
         rq->sched = sched;
+       rq->head_prio = DRM_SCHED_PRIORITY_INVALID;
+}
+
+/*
+ * Core part of the CFS-like algorithm is that the virtual runtime of lower
+ * priority tasks should grow quicker than the higher priority ones, so that
+ * when we then schedule entities with the aim of keeping their accumulated
+ * virtual time balanced, we can approach fair distribution of GPU time.
+ *
+ * Real GPU time is converted into virtual time by left-shifting it by the
+ * per-priority amount below, chosen to give this GPU time distribution:
+ *
+ *  - Kernel priority gets roughly 2x GPU time compared to high.
+ *  - High gets ~4x relative to normal.
+ *  - Normal gets ~8x relative to low.
+ */
+static const unsigned int vruntime_shift[] = {
+       [DRM_SCHED_PRIORITY_KERNEL] = 1,
+       [DRM_SCHED_PRIORITY_HIGH]   = 2,
+       [DRM_SCHED_PRIORITY_NORMAL] = 4,
+       [DRM_SCHED_PRIORITY_LOW]    = 7,
+};
+
+/* Return the vruntime of the run-queue head entity, or zero if empty. */
+static ktime_t
+drm_sched_rq_get_min_vruntime(struct drm_sched_rq *rq)
+{
+       struct drm_sched_entity_stats *stats;
+       struct rb_node *first;
+       ktime_t min_vruntime;
+
+       lockdep_assert_held(&rq->lock);
+
+       first = rb_first_cached(&rq->rb_tree_root);
+       if (!first)
+               return 0;
+
+       stats = rb_entry(first, struct drm_sched_entity, rb_tree_node)->stats;
+       spin_lock(&stats->lock);
+       min_vruntime = stats->vruntime;
+       spin_unlock(&stats->lock);
+
+       return min_vruntime;
+}
+
+/* Turn the absolute vruntime of a departing entity into a relative one. */
+static void
+drm_sched_entity_save_vruntime(struct drm_sched_entity *entity,
+                              ktime_t min_vruntime)
+{
+       struct drm_sched_entity_stats *stats = entity->stats;
+       ktime_t rel = 0;
+
+       spin_lock(&stats->lock);
+       /* Zero when @min_vruntime is zero or the entity is not past it. */
+       if (min_vruntime && stats->vruntime > min_vruntime)
+               rel = ktime_sub(stats->vruntime, min_vruntime);
+
+       stats->vruntime = rel;
+       spin_unlock(&stats->lock);
+}
+
+/*
+ * Turn the relative vruntime saved when the entity left the run-queue back
+ * into an absolute one, based on @min_vruntime of the current queue head.
+ *
+ * The result is stored in the entity stats and also returned, for the caller
+ * to use as the entity timestamp.
+ */
+static ktime_t
+drm_sched_entity_restore_vruntime(struct drm_sched_entity *entity,
+                                 ktime_t min_vruntime,
+                                 enum drm_sched_priority rq_prio)
+{
+       struct drm_sched_entity_stats *stats = entity->stats;
+       enum drm_sched_priority prio = entity->priority;
+       ktime_t vruntime;
+
+       BUILD_BUG_ON(DRM_SCHED_PRIORITY_NORMAL < DRM_SCHED_PRIORITY_HIGH);
+
+       spin_lock(&stats->lock);
+       vruntime = stats->vruntime;
+
+       /*
+        * Special case for an entity without a saved offset, which is one
+        * that was picked from the top of the queue, joining while another
+        * entity is at the top. Offset it from that entity by the priority
+        * difference in nanoseconds: positive for a lower priority, which
+        * must not overtake a higher one when re-joining at the top, and
+        * negative for a higher priority, which can go first. For equal
+        * priorities the difference, and so the offset, stays zero.
+        */
+       if (!vruntime && rq_prio != DRM_SCHED_PRIORITY_INVALID)
+               vruntime = (s64)prio - (s64)rq_prio;
+
+       /*
+        * Re-base the relative value onto the current head of the queue.
+        */
+       vruntime = ktime_add(min_vruntime, vruntime);
+
+       stats->vruntime = vruntime;
+       spin_unlock(&stats->lock);
+
+       return vruntime;
+}
+
+/* Charge the GPU time used since the last call, scaled by priority. */
+static ktime_t drm_sched_entity_update_vruntime(struct drm_sched_entity *entity)
+{
+       struct drm_sched_entity_stats *stats = entity->stats;
+       ktime_t vruntime;
+       s64 delta_ns;
+
+       spin_lock(&stats->lock);
+       delta_ns = ktime_to_ns(ktime_sub(stats->runtime, stats->prev_runtime));
+       stats->prev_runtime = stats->runtime;
+       vruntime = ktime_add_ns(stats->vruntime,
+                               delta_ns << vruntime_shift[entity->priority]);
+       stats->vruntime = vruntime;
+       spin_unlock(&stats->lock);
+
+       return vruntime;
+}
+
+static ktime_t drm_sched_entity_get_job_ts(struct drm_sched_entity *entity)
+{
+       return drm_sched_entity_update_vruntime(entity);
  }
  
  /**
@@ -102,8 +260,14 @@ drm_sched_rq_add_entity(struct drm_sched_entity *entity, ktime_t ts)
                 list_add_tail(&entity->list, &rq->entities);
         }
  
-       if (drm_sched_policy == DRM_SCHED_POLICY_RR)
+       if (drm_sched_policy == DRM_SCHED_POLICY_FAIR) {
+               ts = drm_sched_rq_get_min_vruntime(rq);
+               ts = drm_sched_entity_restore_vruntime(entity, ts,
+                                                      rq->head_prio);
+       } else if (drm_sched_policy == DRM_SCHED_POLICY_RR) {
                 ts = entity->rr_ts;
+       }
+
         drm_sched_rq_update_fifo_locked(entity, rq, ts);
  
         spin_unlock(&rq->lock);
@@ -175,7 +339,9 @@ void drm_sched_rq_pop_entity(struct drm_sched_entity *entity)
         if (next_job) {
                 ktime_t ts;
  
-               if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
+               if (drm_sched_policy == DRM_SCHED_POLICY_FAIR)
+                       ts = drm_sched_entity_get_job_ts(entity);
+               else if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
                         ts = next_job->submit_ts;
                 else
                         ts = drm_sched_rq_next_rr_ts(rq, entity);
@@ -183,6 +349,13 @@ void drm_sched_rq_pop_entity(struct drm_sched_entity *entity)
                 drm_sched_rq_update_fifo_locked(entity, rq, ts);
         } else {
                 drm_sched_rq_remove_fifo_locked(entity, rq);
+
+               if (drm_sched_policy == DRM_SCHED_POLICY_FAIR) {
+                       ktime_t min_vruntime;
+
+                       min_vruntime = drm_sched_rq_get_min_vruntime(rq);
+                       drm_sched_entity_save_vruntime(entity, min_vruntime);
+               }
         }
         spin_unlock(&rq->lock);
         spin_unlock(&entity->lock);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h

index c41a97a2c1ee889e965b9bba42ce9283f60ac527..69c2097664fd003f4f664f9c76a859bcf3fd0090 100644 (file)
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -63,6 +63,7 @@ struct drm_file;
   * to an array, and as such should start at 0.
   */
  enum drm_sched_priority {
+       DRM_SCHED_PRIORITY_INVALID = -1, /* Internal marker - do not use. */
         DRM_SCHED_PRIORITY_KERNEL,
         DRM_SCHED_PRIORITY_HIGH,
         DRM_SCHED_PRIORITY_NORMAL,
@@ -150,6 +151,11 @@ struct drm_sched_entity {
          */
         enum drm_sched_priority         priority;
  
+       /**
+        * @rq_priority: Run-queue priority
+        */
+       enum drm_sched_priority         rq_priority;
+
         /**
          * @rr_ts:
          *
@@ -254,10 +260,11 @@ struct drm_sched_entity {
   * struct drm_sched_rq - queue of entities to be scheduled.
   *
   * @sched: the scheduler to which this rq belongs to.
- * @lock: protects @entities, @rb_tree_root and @rr_ts.
+ * @lock: protects @entities, @rb_tree_root, @rr_ts and @head_prio.
   * @rr_ts: monotonically incrementing fake timestamp for RR mode.
   * @entities: list of the entities to be scheduled.
   * @rb_tree_root: root of time based priority queue of entities for FIFO scheduling
+ * @head_prio: priority of the top tree element.
   *
   * Run queue is a set of entities scheduling command submissions for
   * one specific ring. It implements the scheduling policy that selects
@@ -271,6 +278,7 @@ struct drm_sched_rq {
         ktime_t                         rr_ts;
         struct list_head                entities;
         struct rb_root_cached           rb_tree_root;
+       enum drm_sched_priority         head_prio;
  };
  
  /**
@@ -563,8 +571,10 @@ struct drm_sched_backend_ops {
   * @credit_count: the current credit count of this scheduler
   * @timeout: the time after which a job is removed from the scheduler.
   * @name: name of the ring for which this scheduler is being used.
- * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
- *           as there's usually one run-queue per priority, but could be less.
+ * @num_user_rqs: Number of run-queues. This is at most
+ *                DRM_SCHED_PRIORITY_COUNT, as there's usually one run-queue per
+ *                priority, but could be less.
+ * @num_rqs: Equal to @num_user_rqs for FIFO and RR and 1 for the FAIR policy.
- * @sched_rq: An allocated array of run-queues of size @num_rqs;
+ * @sched_rq: Array of @num_user_rqs run-queue pointers, first @num_rqs valid;
   * @job_scheduled: once drm_sched_entity_flush() is called the scheduler
   *                 waits on this wait queue until all the scheduled jobs are
@@ -597,6 +607,7 @@ struct drm_gpu_scheduler {
         long                            timeout;
         const char                      *name;
         u32                             num_rqs;
+       u32                             num_user_rqs;
         struct drm_sched_rq             **sched_rq;
         wait_queue_head_t               job_scheduled;
         atomic64_t                      job_id_count;
author	Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
	Fri, 17 Apr 2026 10:37:25 +0000 (11:37 +0100)
committer	Philipp Stanner <phasta@kernel.org>
	Fri, 17 Apr 2026 12:43:28 +0000 (14:43 +0200)
drivers/gpu/drm/scheduler/sched_entity.c		patch \| blob \| blame \| history
drivers/gpu/drm/scheduler/sched_internal.h		patch \| blob \| blame \| history
drivers/gpu/drm/scheduler/sched_main.c		patch \| blob \| blame \| history
drivers/gpu/drm/scheduler/sched_rq.c		patch \| blob \| blame \| history
include/drm/gpu_scheduler.h		patch \| blob \| blame \| history