In rare cases, ath10k may lose connection with the PCIe bus due to
some unknown reasons, which could further lead to system crashes during
resuming due to watchdog timeout:
ath10k_pci 0000:01:00.0: wmi command 20486 timeout, restarting hardware
ath10k_pci 0000:01:00.0: already restarting
ath10k_pci 0000:01:00.0: failed to stop WMI vdev 0: -11
ath10k_pci 0000:01:00.0: failed to stop vdev 0: -11
ieee80211 phy0: PM: **** DPM device timeout ****
Call Trace:
panic+0x125/0x315
dpm_watchdog_set+0x54/0x54
dpm_watchdog_handler+0x57/0x57
call_timer_fn+0x31/0x13c
At this point, all WMI commands will timeout and attempt to restart
device. So set a threshold for consecutive restart failures. If the
threshold is exceeded, consider the hardware is unreliable and all
ath10k operations should be skipped to avoid system crash.
fail_cont_count and pending_recovery are atomic variables, and
do not involve complex conditional logic. Therefore, even if recovery
check and reconfig complete are executed concurrently, the recovery
mechanism will not be broken.
Tested-on: QCA6174 hw3.2 PCI WLAN.RM.4.4.1-00288-QCARMSWPZ-1
Signed-off-by: Kang Yang <kang.yang@oss.qualcomm.com>
Reviewed-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
Link: https://patch.msgid.link/20250623022731.509-1-kang.yang@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
return 0;
}
+static bool ath10k_core_needs_recovery(struct ath10k *ar)
+{
+ long time_left;
+
+ /* Sometimes the recovery will fail and then the next all recovery fail,
+ * so avoid infinite recovery.
+ */
+ if (atomic_read(&ar->fail_cont_count) >= ATH10K_RECOVERY_MAX_FAIL_COUNT) {
+ ath10k_err(ar, "consecutive fail %d times, will shutdown driver!",
+ atomic_read(&ar->fail_cont_count));
+ ar->state = ATH10K_STATE_WEDGED;
+ return false;
+ }
+
+ ath10k_dbg(ar, ATH10K_DBG_BOOT, "total recovery count: %d", ++ar->recovery_count);
+
+ if (atomic_read(&ar->pending_recovery)) {
+ /* Sometimes it happened another recovery work before the previous one
+ * completed, then the second recovery work will destroy the previous
+ * one, thus below is to avoid that.
+ */
+ time_left = wait_for_completion_timeout(&ar->driver_recovery,
+ ATH10K_RECOVERY_TIMEOUT_HZ);
+ if (time_left) {
+ ath10k_warn(ar, "previous recovery succeeded, skip this!\n");
+ return false;
+ }
+
+ /* Record the continuous recovery fail count when recovery failed. */
+ atomic_inc(&ar->fail_cont_count);
+
+ /* Avoid having multiple recoveries at the same time. */
+ return false;
+ }
+
+ atomic_inc(&ar->pending_recovery);
+
+ return true;
+}
+
void ath10k_core_start_recovery(struct ath10k *ar)
{
- if (test_and_set_bit(ATH10K_FLAG_RESTARTING, &ar->dev_flags)) {
- ath10k_warn(ar, "already restarting\n");
+ if (!ath10k_core_needs_recovery(ar))
return;
- }
queue_work(ar->workqueue, &ar->restart_work);
}
struct ath10k *ar = container_of(work, struct ath10k, restart_work);
int ret;
+ reinit_completion(&ar->driver_recovery);
+
set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags);
/* Place a barrier to make sure the compiler doesn't reorder
if (ret)
ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: %d",
ret);
-
- complete(&ar->driver_recovery);
}
static void ath10k_core_set_coverage_class_work(struct work_struct *work)
* Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
* Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
* Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
*/
#ifndef _CORE_H_
IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER)
#define ATH10K_ITER_RESUME_FLAGS (IEEE80211_IFACE_ITER_RESUME_ALL |\
IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER)
+#define ATH10K_RECOVERY_TIMEOUT_HZ (5 * HZ)
+#define ATH10K_RECOVERY_MAX_FAIL_COUNT 4
struct ath10k;
/* Per Station statistics service */
ATH10K_FLAG_PEER_STATS,
- /* Indicates that ath10k device is during recovery process and not complete */
- ATH10K_FLAG_RESTARTING,
-
/* protected by conf_mutex */
ATH10K_FLAG_NAPI_ENABLED,
};
struct work_struct bundle_tx_work;
struct work_struct tx_complete_work;
+ atomic_t pending_recovery;
+ unsigned int recovery_count;
+ /* continuous recovery fail count */
+ atomic_t fail_cont_count;
+
/* cycle count is reported twice for each visited channel during scan.
* access protected by data_lock
*/
ath10k_info(ar, "device successfully recovered\n");
ar->state = ATH10K_STATE_ON;
ieee80211_wake_queues(ar->hw);
- clear_bit(ATH10K_FLAG_RESTARTING, &ar->dev_flags);
+
+ /* Clear recovery state. */
+ complete(&ar->driver_recovery);
+ atomic_set(&ar->fail_cont_count, 0);
+ atomic_set(&ar->pending_recovery, 0);
+
if (ar->hw_params.hw_restart_disconnect) {
list_for_each_entry(arvif, &ar->arvifs, list) {
if (arvif->is_up && arvif->vdev_type == WMI_VDEV_TYPE_STA)
* Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
* Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
*/
#include <linux/skbuff.h>
}
wait_event_timeout(ar->wmi.tx_credits_wq, ({
+ if (ar->state == ATH10K_STATE_WEDGED) {
+ ret = -ESHUTDOWN;
+ ath10k_dbg(ar, ATH10K_DBG_WMI,
+ "drop wmi command %d, hardware is wedged\n", cmd_id);
+ }
/* try to send pending beacons first. they take priority */
ath10k_wmi_tx_beacons_nowait(ar);