--- /dev/null
+From d3abaf43bab8d5b0a3c6b982100d9e2be96de4ad Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Sat, 13 Oct 2018 20:32:17 -0700
+Subject: acpi, nfit: Fix Address Range Scrub completion tracking
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit d3abaf43bab8d5b0a3c6b982100d9e2be96de4ad upstream.
+
+The Address Range Scrub implementation tried to skip running scrubs
+against ranges that were already scrubbed by the BIOS. Unfortunately
+that support also resulted in early scrub completions as evidenced by
+this debug output from nfit_test:
+
+ nd_region region9: ARS: range 1 short complete
+ nd_region region3: ARS: range 1 short complete
+ nd_region region4: ARS: range 2 ARS start (0)
+ nd_region region4: ARS: range 2 short complete
+
+...i.e. completions without any indications that the scrub was started.
+
+This state of affairs was hard to see in the code due to the
+proliferation of state bits and mistakenly trying to track done state
+per-range when the completion is a global property of the bus.
+
+So, kill the four ARS state bits (ARS_REQ, ARS_REQ_REDO, ARS_DONE, and
+ARS_SHORT), and replace them with just 2 request flags ARS_REQ_SHORT and
+ARS_REQ_LONG. The implementation will still complete and reap the
+results of BIOS initiated ARS, but it will not attempt to use that
+information to affect the completion status of scrubbing the ranges from
+a Linux perspective.
+
+Instead, try to synchronously run a short ARS per range at init time and
+schedule a long scrub in the background. If ARS is busy with an ARS
+request, schedule both a short and a long scrub for when ARS returns to
+idle. This logic also satisfies the intent of what ARS_REQ_REDO was
+trying to achieve. The new rule is that the REQ flag stays set until the
+next successful ars_start() for that range.
+
+With the new policy that the REQ flags are not cleared until the next
+start, the implementation no longer loses requests as can be seen from
+the following log:
+
+ nd_region region3: ARS: range 1 ARS start short (0)
+ nd_region region9: ARS: range 1 ARS start short (0)
+ nd_region region3: ARS: range 1 complete
+ nd_region region4: ARS: range 2 ARS start short (0)
+ nd_region region9: ARS: range 1 complete
+ nd_region region9: ARS: range 1 ARS start long (0)
+ nd_region region4: ARS: range 2 complete
+ nd_region region3: ARS: range 1 ARS start long (0)
+ nd_region region9: ARS: range 1 complete
+ nd_region region3: ARS: range 1 complete
+ nd_region region4: ARS: range 2 ARS start long (0)
+ nd_region region4: ARS: range 2 complete
+
+...note that the nfit_test emulated driver provides 2 buses, that is why
+some of the range indices are duplicated. Notice that each range
+now successfully completes a short and long scrub.
+
+Cc: <stable@vger.kernel.org>
+Fixes: 14c73f997a5e ("nfit, address-range-scrub: introduce nfit_spa->ars_state")
+Fixes: cc3d3458d46f ("acpi/nfit: queue issuing of ars when an uc error...")
+Reported-by: Jacek Zloch <jacek.zloch@intel.com>
+Reported-by: Krzysztof Rusocki <krzysztof.rusocki@intel.com>
+Reviewed-by: Dave Jiang <dave.jiang@intel.com>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/acpi/nfit/core.c | 161 +++++++++++++++++++++++++++--------------------
+ drivers/acpi/nfit/nfit.h | 9 +-
+ 2 files changed, 100 insertions(+), 70 deletions(-)
+
+--- a/drivers/acpi/nfit/core.c
++++ b/drivers/acpi/nfit/core.c
+@@ -2456,7 +2456,8 @@ static int ars_get_cap(struct acpi_nfit_
+ return cmd_rc;
+ }
+
+-static int ars_start(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa)
++static int ars_start(struct acpi_nfit_desc *acpi_desc,
++ struct nfit_spa *nfit_spa, enum nfit_ars_state req_type)
+ {
+ int rc;
+ int cmd_rc;
+@@ -2467,7 +2468,7 @@ static int ars_start(struct acpi_nfit_de
+ memset(&ars_start, 0, sizeof(ars_start));
+ ars_start.address = spa->address;
+ ars_start.length = spa->length;
+- if (test_bit(ARS_SHORT, &nfit_spa->ars_state))
++ if (req_type == ARS_REQ_SHORT)
+ ars_start.flags = ND_ARS_RETURN_PREV_DATA;
+ if (nfit_spa_type(spa) == NFIT_SPA_PM)
+ ars_start.type = ND_ARS_PERSISTENT;
+@@ -2524,6 +2525,15 @@ static void ars_complete(struct acpi_nfi
+ struct nd_region *nd_region = nfit_spa->nd_region;
+ struct device *dev;
+
++ lockdep_assert_held(&acpi_desc->init_mutex);
++ /*
++ * Only advance the ARS state for ARS runs initiated by the
++ * kernel, ignore ARS results from BIOS initiated runs for scrub
++ * completion tracking.
++ */
++ if (acpi_desc->scrub_spa != nfit_spa)
++ return;
++
+ if ((ars_status->address >= spa->address && ars_status->address
+ < spa->address + spa->length)
+ || (ars_status->address < spa->address)) {
+@@ -2543,23 +2553,13 @@ static void ars_complete(struct acpi_nfi
+ } else
+ return;
+
+- if (test_bit(ARS_DONE, &nfit_spa->ars_state))
+- return;
+-
+- if (!test_and_clear_bit(ARS_REQ, &nfit_spa->ars_state))
+- return;
+-
++ acpi_desc->scrub_spa = NULL;
+ if (nd_region) {
+ dev = nd_region_dev(nd_region);
+ nvdimm_region_notify(nd_region, NVDIMM_REVALIDATE_POISON);
+ } else
+ dev = acpi_desc->dev;
+-
+- dev_dbg(dev, "ARS: range %d %s complete\n", spa->range_index,
+- test_bit(ARS_SHORT, &nfit_spa->ars_state)
+- ? "short" : "long");
+- clear_bit(ARS_SHORT, &nfit_spa->ars_state);
+- set_bit(ARS_DONE, &nfit_spa->ars_state);
++ dev_dbg(dev, "ARS: range %d complete\n", spa->range_index);
+ }
+
+ static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc)
+@@ -2840,46 +2840,55 @@ static int acpi_nfit_query_poison(struct
+ return 0;
+ }
+
+-static int ars_register(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa,
+- int *query_rc)
++static int ars_register(struct acpi_nfit_desc *acpi_desc,
++ struct nfit_spa *nfit_spa)
+ {
+- int rc = *query_rc;
++ int rc;
+
+- if (no_init_ars)
++ if (no_init_ars || test_bit(ARS_FAILED, &nfit_spa->ars_state))
+ return acpi_nfit_register_region(acpi_desc, nfit_spa);
+
+- set_bit(ARS_REQ, &nfit_spa->ars_state);
+- set_bit(ARS_SHORT, &nfit_spa->ars_state);
++ set_bit(ARS_REQ_SHORT, &nfit_spa->ars_state);
++ set_bit(ARS_REQ_LONG, &nfit_spa->ars_state);
+
+- switch (rc) {
++ switch (acpi_nfit_query_poison(acpi_desc)) {
+ case 0:
+ case -EAGAIN:
+- rc = ars_start(acpi_desc, nfit_spa);
+- if (rc == -EBUSY) {
+- *query_rc = rc;
++ rc = ars_start(acpi_desc, nfit_spa, ARS_REQ_SHORT);
++ /* shouldn't happen, try again later */
++ if (rc == -EBUSY)
+ break;
+- } else if (rc == 0) {
+- rc = acpi_nfit_query_poison(acpi_desc);
+- } else {
++ if (rc) {
+ set_bit(ARS_FAILED, &nfit_spa->ars_state);
+ break;
+ }
+- if (rc == -EAGAIN)
+- clear_bit(ARS_SHORT, &nfit_spa->ars_state);
+- else if (rc == 0)
+- ars_complete(acpi_desc, nfit_spa);
++ clear_bit(ARS_REQ_SHORT, &nfit_spa->ars_state);
++ rc = acpi_nfit_query_poison(acpi_desc);
++ if (rc)
++ break;
++ acpi_desc->scrub_spa = nfit_spa;
++ ars_complete(acpi_desc, nfit_spa);
++ /*
++ * If ars_complete() says we didn't complete the
++ * short scrub, we'll try again with a long
++ * request.
++ */
++ acpi_desc->scrub_spa = NULL;
+ break;
+ case -EBUSY:
++ case -ENOMEM:
+ case -ENOSPC:
++ /*
++ * BIOS was using ARS, wait for it to complete (or
++ * resources to become available) and then perform our
++ * own scrubs.
++ */
+ break;
+ default:
+ set_bit(ARS_FAILED, &nfit_spa->ars_state);
+ break;
+ }
+
+- if (test_and_clear_bit(ARS_DONE, &nfit_spa->ars_state))
+- set_bit(ARS_REQ, &nfit_spa->ars_state);
+-
+ return acpi_nfit_register_region(acpi_desc, nfit_spa);
+ }
+
+@@ -2901,6 +2910,8 @@ static unsigned int __acpi_nfit_scrub(st
+ struct device *dev = acpi_desc->dev;
+ struct nfit_spa *nfit_spa;
+
++ lockdep_assert_held(&acpi_desc->init_mutex);
++
+ if (acpi_desc->cancel)
+ return 0;
+
+@@ -2924,21 +2935,49 @@ static unsigned int __acpi_nfit_scrub(st
+
+ ars_complete_all(acpi_desc);
+ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
++ enum nfit_ars_state req_type;
++ int rc;
++
+ if (test_bit(ARS_FAILED, &nfit_spa->ars_state))
+ continue;
+- if (test_bit(ARS_REQ, &nfit_spa->ars_state)) {
+- int rc = ars_start(acpi_desc, nfit_spa);
+
+- clear_bit(ARS_DONE, &nfit_spa->ars_state);
+- dev = nd_region_dev(nfit_spa->nd_region);
+- dev_dbg(dev, "ARS: range %d ARS start (%d)\n",
+- nfit_spa->spa->range_index, rc);
+- if (rc == 0 || rc == -EBUSY)
+- return 1;
+- dev_err(dev, "ARS: range %d ARS failed (%d)\n",
+- nfit_spa->spa->range_index, rc);
+- set_bit(ARS_FAILED, &nfit_spa->ars_state);
++ /* prefer short ARS requests first */
++ if (test_bit(ARS_REQ_SHORT, &nfit_spa->ars_state))
++ req_type = ARS_REQ_SHORT;
++ else if (test_bit(ARS_REQ_LONG, &nfit_spa->ars_state))
++ req_type = ARS_REQ_LONG;
++ else
++ continue;
++ rc = ars_start(acpi_desc, nfit_spa, req_type);
++
++ dev = nd_region_dev(nfit_spa->nd_region);
++ dev_dbg(dev, "ARS: range %d ARS start %s (%d)\n",
++ nfit_spa->spa->range_index,
++ req_type == ARS_REQ_SHORT ? "short" : "long",
++ rc);
++ /*
++ * Hmm, we raced someone else starting ARS? Try again in
++ * a bit.
++ */
++ if (rc == -EBUSY)
++ return 1;
++ if (rc == 0) {
++ dev_WARN_ONCE(dev, acpi_desc->scrub_spa,
++ "scrub start while range %d active\n",
++ acpi_desc->scrub_spa->spa->range_index);
++ clear_bit(req_type, &nfit_spa->ars_state);
++ acpi_desc->scrub_spa = nfit_spa;
++ /*
++ * Consider this spa last for future scrub
++ * requests
++ */
++ list_move_tail(&nfit_spa->list, &acpi_desc->spas);
++ return 1;
+ }
++
++ dev_err(dev, "ARS: range %d ARS failed (%d)\n",
++ nfit_spa->spa->range_index, rc);
++ set_bit(ARS_FAILED, &nfit_spa->ars_state);
+ }
+ return 0;
+ }
+@@ -2994,6 +3033,7 @@ static void acpi_nfit_init_ars(struct ac
+ struct nd_cmd_ars_cap ars_cap;
+ int rc;
+
++ set_bit(ARS_FAILED, &nfit_spa->ars_state);
+ memset(&ars_cap, 0, sizeof(ars_cap));
+ rc = ars_get_cap(acpi_desc, &ars_cap, nfit_spa);
+ if (rc < 0)
+@@ -3010,16 +3050,14 @@ static void acpi_nfit_init_ars(struct ac
+ nfit_spa->clear_err_unit = ars_cap.clear_err_unit;
+ acpi_desc->max_ars = max(nfit_spa->max_ars, acpi_desc->max_ars);
+ clear_bit(ARS_FAILED, &nfit_spa->ars_state);
+- set_bit(ARS_REQ, &nfit_spa->ars_state);
+ }
+
+ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
+ {
+ struct nfit_spa *nfit_spa;
+- int rc, query_rc;
++ int rc;
+
+ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+- set_bit(ARS_FAILED, &nfit_spa->ars_state);
+ switch (nfit_spa_type(nfit_spa->spa)) {
+ case NFIT_SPA_VOLATILE:
+ case NFIT_SPA_PM:
+@@ -3028,20 +3066,12 @@ static int acpi_nfit_register_regions(st
+ }
+ }
+
+- /*
+- * Reap any results that might be pending before starting new
+- * short requests.
+- */
+- query_rc = acpi_nfit_query_poison(acpi_desc);
+- if (query_rc == 0)
+- ars_complete_all(acpi_desc);
+-
+ list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
+ switch (nfit_spa_type(nfit_spa->spa)) {
+ case NFIT_SPA_VOLATILE:
+ case NFIT_SPA_PM:
+ /* register regions and kick off initial ARS run */
+- rc = ars_register(acpi_desc, nfit_spa, &query_rc);
++ rc = ars_register(acpi_desc, nfit_spa);
+ if (rc)
+ return rc;
+ break;
+@@ -3236,7 +3266,8 @@ static int acpi_nfit_clear_to_send(struc
+ return 0;
+ }
+
+-int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, unsigned long flags)
++int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc,
++ enum nfit_ars_state req_type)
+ {
+ struct device *dev = acpi_desc->dev;
+ int scheduled = 0, busy = 0;
+@@ -3256,13 +3287,10 @@ int acpi_nfit_ars_rescan(struct acpi_nfi
+ if (test_bit(ARS_FAILED, &nfit_spa->ars_state))
+ continue;
+
+- if (test_and_set_bit(ARS_REQ, &nfit_spa->ars_state))
++ if (test_and_set_bit(req_type, &nfit_spa->ars_state))
+ busy++;
+- else {
+- if (test_bit(ARS_SHORT, &flags))
+- set_bit(ARS_SHORT, &nfit_spa->ars_state);
++ else
+ scheduled++;
+- }
+ }
+ if (scheduled) {
+ sched_ars(acpi_desc);
+@@ -3448,10 +3476,11 @@ static void acpi_nfit_update_notify(stru
+ static void acpi_nfit_uc_error_notify(struct device *dev, acpi_handle handle)
+ {
+ struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev);
+- unsigned long flags = (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) ?
+- 0 : 1 << ARS_SHORT;
+
+- acpi_nfit_ars_rescan(acpi_desc, flags);
++ if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON)
++ acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_LONG);
++ else
++ acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_SHORT);
+ }
+
+ void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event)
+--- a/drivers/acpi/nfit/nfit.h
++++ b/drivers/acpi/nfit/nfit.h
+@@ -118,9 +118,8 @@ enum nfit_dimm_notifiers {
+ };
+
+ enum nfit_ars_state {
+- ARS_REQ,
+- ARS_DONE,
+- ARS_SHORT,
++ ARS_REQ_SHORT,
++ ARS_REQ_LONG,
+ ARS_FAILED,
+ };
+
+@@ -197,6 +196,7 @@ struct acpi_nfit_desc {
+ struct device *dev;
+ u8 ars_start_flags;
+ struct nd_cmd_ars_status *ars_status;
++ struct nfit_spa *scrub_spa;
+ struct delayed_work dwork;
+ struct list_head list;
+ struct kernfs_node *scrub_count_state;
+@@ -251,7 +251,8 @@ struct nfit_blk {
+
+ extern struct list_head acpi_descs;
+ extern struct mutex acpi_desc_lock;
+-int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, unsigned long flags);
++int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc,
++ enum nfit_ars_state req_type);
+
+ #ifdef CONFIG_X86_MCE
+ void nfit_mce_register(void);