--- /dev/null
+From 457e67a728696c4f8e6423c64e93def50530db9a Mon Sep 17 00:00:00 2001
+From: Eric Anholt <eric@anholt.net>
+Date: Thu, 20 Oct 2016 16:48:12 -0700
+Subject: drm/vc4: Fix termination of the initial scan for branch targets.
+
+From: Eric Anholt <eric@anholt.net>
+
+commit 457e67a728696c4f8e6423c64e93def50530db9a upstream.
+
+The loop is scanning until the original max_ip (size of the BO), but
+we want to not examine any code after the PROG_END's delay slots.
+There was a block trying to do that, except that we had some early
+continue statements if the signal wasn't a PROG_END or a BRANCH.
+
+The failure mode would be that a valid shader is rejected because some
+undefined memory after the PROG_END slots is parsed as a branch and
+the rest of its setup is illegal. I haven't seen this in the wild,
+but valgrind was complaining when about this up in the userland
+simulator mode.
+
+Signed-off-by: Eric Anholt <eric@anholt.net>
+Cc: Amit Pundir <amit.pundir@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/vc4/vc4_validate_shaders.c | 19 ++++++++-----------
+ 1 file changed, 8 insertions(+), 11 deletions(-)
+
+--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
++++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+@@ -608,9 +608,7 @@ static bool
+ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
+ {
+ uint32_t max_branch_target = 0;
+- bool found_shader_end = false;
+ int ip;
+- int shader_end_ip = 0;
+ int last_branch = -2;
+
+ for (ip = 0; ip < validation_state->max_ip; ip++) {
+@@ -621,8 +619,13 @@ vc4_validate_branches(struct vc4_shader_
+ uint32_t branch_target_ip;
+
+ if (sig == QPU_SIG_PROG_END) {
+- shader_end_ip = ip;
+- found_shader_end = true;
++ /* There are two delay slots after program end is
++ * signaled that are still executed, then we're
++ * finished. validation_state->max_ip is the
++ * instruction after the last valid instruction in the
++ * program.
++ */
++ validation_state->max_ip = ip + 3;
+ continue;
+ }
+
+@@ -676,15 +679,9 @@ vc4_validate_branches(struct vc4_shader_
+ }
+ set_bit(after_delay_ip, validation_state->branch_targets);
+ max_branch_target = max(max_branch_target, after_delay_ip);
+-
+- /* There are two delay slots after program end is signaled
+- * that are still executed, then we're finished.
+- */
+- if (found_shader_end && ip == shader_end_ip + 2)
+- break;
+ }
+
+- if (max_branch_target > shader_end_ip) {
++ if (max_branch_target > validation_state->max_ip - 3) {
+ DRM_ERROR("Branch landed after QPU_SIG_PROG_END");
+ return false;
+ }
--- /dev/null
+From 3a62234680d86efa0239665ed8a0e908f1aef147 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <eric@anholt.net>
+Date: Fri, 4 Nov 2016 15:58:38 -0700
+Subject: drm/vc4: Use runtime autosuspend to avoid thrashing V3D power state.
+
+From: Eric Anholt <eric@anholt.net>
+
+commit 3a62234680d86efa0239665ed8a0e908f1aef147 upstream.
+
+The pm_runtime_put() we were using immediately released power on the
+device, which meant that we were generally turning the device off and
+on once per frame. In many profiles I've looked at, that added up to
+about 1% of CPU time, but this could get worse in the case of frequent
+rendering and readback (as may happen in X rendering). By keeping the
+device on until we've been idle for a couple of frames, we drop the
+overhead of runtime PM down to sub-.1%.
+
+Signed-off-by: Eric Anholt <eric@anholt.net>
+Cc: Amit Pundir <amit.pundir@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/vc4/vc4_drv.c | 9 ++++++---
+ drivers/gpu/drm/vc4/vc4_gem.c | 6 ++++--
+ drivers/gpu/drm/vc4/vc4_v3d.c | 2 ++
+ 3 files changed, 12 insertions(+), 5 deletions(-)
+
+--- a/drivers/gpu/drm/vc4/vc4_drv.c
++++ b/drivers/gpu/drm/vc4/vc4_drv.c
+@@ -61,21 +61,24 @@ static int vc4_get_param_ioctl(struct dr
+ if (ret < 0)
+ return ret;
+ args->value = V3D_READ(V3D_IDENT0);
+- pm_runtime_put(&vc4->v3d->pdev->dev);
++ pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
++ pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
+ break;
+ case DRM_VC4_PARAM_V3D_IDENT1:
+ ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+ if (ret < 0)
+ return ret;
+ args->value = V3D_READ(V3D_IDENT1);
+- pm_runtime_put(&vc4->v3d->pdev->dev);
++ pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
++ pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
+ break;
+ case DRM_VC4_PARAM_V3D_IDENT2:
+ ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+ if (ret < 0)
+ return ret;
+ args->value = V3D_READ(V3D_IDENT2);
+- pm_runtime_put(&vc4->v3d->pdev->dev);
++ pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
++ pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
+ break;
+ case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
+ args->value = true;
+--- a/drivers/gpu/drm/vc4/vc4_gem.c
++++ b/drivers/gpu/drm/vc4/vc4_gem.c
+@@ -711,8 +711,10 @@ vc4_complete_exec(struct drm_device *dev
+ }
+
+ mutex_lock(&vc4->power_lock);
+- if (--vc4->power_refcount == 0)
+- pm_runtime_put(&vc4->v3d->pdev->dev);
++ if (--vc4->power_refcount == 0) {
++ pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
++ pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
++ }
+ mutex_unlock(&vc4->power_lock);
+
+ kfree(exec);
+--- a/drivers/gpu/drm/vc4/vc4_v3d.c
++++ b/drivers/gpu/drm/vc4/vc4_v3d.c
+@@ -222,6 +222,8 @@ static int vc4_v3d_bind(struct device *d
+ return ret;
+ }
+
++ pm_runtime_use_autosuspend(dev);
++ pm_runtime_set_autosuspend_delay(dev, 40); /* a little over 2 frames. */
+ pm_runtime_enable(dev);
+
+ return 0;