]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
accel/habanalabs: no CPUCP prints on heartbeat failure
authorOhad Sharabi <osharabi@habana.ai>
Sun, 18 Feb 2024 09:54:29 +0000 (11:54 +0200)
committerOfir Bitton <obitton@habana.ai>
Sun, 23 Jun 2024 06:45:51 +0000 (09:45 +0300)
If we detected heartbet event while some daemon in the background send
(via driver interface) CPUCP messages the dmesg will be flooded.

Instead, a slight refactor in hl_fw_send_cpu_message() returns -EAGAIN
when CPU is disabled (i.e. heartbeat failure) and only then.

Later, all calling functions that may be invoked by user space can issue
prints only if the error code is not -EAGAIN.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Ofir Bitton <obitton@habana.ai>
Signed-off-by: Ofir Bitton <obitton@habana.ai>
drivers/accel/habanalabs/common/debugfs.c
drivers/accel/habanalabs/common/device.c
drivers/accel/habanalabs/common/firmware_if.c
drivers/accel/habanalabs/common/hwmon.c
drivers/accel/habanalabs/gaudi/gaudi.c
drivers/accel/habanalabs/gaudi2/gaudi2.c
drivers/accel/habanalabs/goya/goya.c

index b1c88d1837d97c7a12f56e9d29e461a19f7f936c..de3ae2e47ec48e70ef638a439a6ca0315107dff5 100644 (file)
@@ -42,9 +42,8 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
        pkt.i2c_reg = i2c_reg;
        pkt.i2c_len = i2c_len;
 
-       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               0, val);
-       if (rc)
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, val);
+       if (rc && rc != -EAGAIN)
                dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
 
        return rc;
@@ -75,10 +74,8 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
        pkt.i2c_len = i2c_len;
        pkt.value = cpu_to_le64(val);
 
-       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               0, NULL);
-
-       if (rc)
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+       if (rc && rc != -EAGAIN)
                dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc);
 
        return rc;
@@ -99,10 +96,8 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
        pkt.led_index = cpu_to_le32(led);
        pkt.value = cpu_to_le64(state);
 
-       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               0, NULL);
-
-       if (rc)
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+       if (rc && rc != -EAGAIN)
                dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc);
 }
 
index eee41c367bd1823188ac873da91fbe75b178322c..087bbb1778e5d23402f47c3226f6b0cfccf4b190 100644 (file)
@@ -1498,10 +1498,8 @@ static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
                 * of heartbeat, the device CPU is marked as disable
                 * so this message won't be sent
                 */
-               if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
-                       dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+               if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
                        return;
-               }
 
                /* verify that last EQs are handled before disabled is set */
                if (hdev->cpu_queues_enable)
index 348418643709dc5a9cb8af291b079809c70da42f..b5b960ce4ebdf86adc4e319d21227551f4a2d4ca 100644 (file)
@@ -370,43 +370,63 @@ int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
 int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value)
 {
        struct cpucp_packet pkt = {};
+       int rc;
 
        pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT);
        pkt.value = cpu_to_le64(value);
 
-       return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+       if (rc)
+               dev_err(hdev->dev, "Failed to disable FW's PCI access\n");
+
+       return rc;
 }
 
+/**
+ * hl_fw_send_cpu_message() - send CPU message to the device.
+ *
+ * @hdev: pointer to hl_device structure.
+ * @hw_queue_id: HW queue ID
+ * @msg: raw data of the message/packet
+ * @size: size of @msg in bytes
+ * @timeout_us: timeout in usec to wait for CPU reply on the message
+ * @result: return code reported by FW
+ *
+ * send message to the device CPU.
+ *
+ * Return: 0 on success, non-zero for failure.
+ *     -ENOMEM: memory allocation failure
+ *     -EAGAIN: CPU is disabled (try again when enabled)
+ *     -ETIMEDOUT: timeout waiting for FW response
+ *     -EIO: protocol error
+ */
 int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
-                               u16 len, u32 timeout, u64 *result)
+                               u16 size, u32 timeout_us, u64 *result)
 {
        struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
        struct asic_fixed_properties *prop = &hdev->asic_prop;
+       u32 tmp, expected_ack_val, pi, opcode;
        struct cpucp_packet *pkt;
        dma_addr_t pkt_dma_addr;
        struct hl_bd *sent_bd;
-       u32 tmp, expected_ack_val, pi, opcode;
-       int rc;
+       int rc = 0, fw_rc;
 
-       pkt = hl_cpu_accessible_dma_pool_alloc(hdev, len, &pkt_dma_addr);
+       pkt = hl_cpu_accessible_dma_pool_alloc(hdev, size, &pkt_dma_addr);
        if (!pkt) {
-               dev_err(hdev->dev,
-                       "Failed to allocate DMA memory for packet to CPU\n");
+               dev_err(hdev->dev, "Failed to allocate DMA memory for packet to CPU\n");
                return -ENOMEM;
        }
 
-       memcpy(pkt, msg, len);
+       memcpy(pkt, msg, size);
 
        mutex_lock(&hdev->send_cpu_message_lock);
 
        /* CPU-CP messages can be sent during soft-reset */
-       if (hdev->disabled && !hdev->reset_info.in_compute_reset) {
-               rc = 0;
+       if (hdev->disabled && !hdev->reset_info.in_compute_reset)
                goto out;
-       }
 
        if (hdev->device_cpu_disabled) {
-               rc = -EIO;
+               rc = -EAGAIN;
                goto out;
        }
 
@@ -422,7 +442,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
         * Which means that we don't need to lock the access to the entire H/W
         * queues module when submitting a JOB to the CPU queue.
         */
-       hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), len, pkt_dma_addr);
+       hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), size, pkt_dma_addr);
 
        if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
                expected_ack_val = queue->pi;
@@ -431,7 +451,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 
        rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
                                (tmp == expected_ack_val), 1000,
-                               timeout, true);
+                               timeout_us, true);
 
        hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
 
@@ -450,8 +470,8 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 
        tmp = le32_to_cpu(pkt->ctl);
 
-       rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
-       if (rc) {
+       fw_rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
+       if (fw_rc) {
                opcode = (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT;
 
                if (!prop->supports_advanced_cpucp_rc) {
@@ -460,7 +480,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
                        goto scrub_descriptor;
                }
 
-               switch (rc) {
+               switch (fw_rc) {
                case cpucp_packet_invalid:
                        dev_err(hdev->dev,
                                "CPU packet %d is not supported by F/W\n", opcode);
@@ -485,7 +505,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 
                /* propagate the return code from the f/w to the callers who want to check it */
                if (result)
-                       *result = rc;
+                       *result = fw_rc;
 
                rc = -EIO;
 
@@ -505,7 +525,7 @@ scrub_descriptor:
 out:
        mutex_unlock(&hdev->send_cpu_message_lock);
 
-       hl_cpu_accessible_dma_pool_free(hdev, len, pkt);
+       hl_cpu_accessible_dma_pool_free(hdev, size, pkt);
 
        return rc;
 }
@@ -575,7 +595,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
 int hl_fw_test_cpu_queue(struct hl_device *hdev)
 {
        struct cpucp_packet test_pkt = {};
-       u64 result;
+       u64 result = 0;
        int rc;
 
        test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
@@ -648,7 +668,7 @@ int hl_fw_send_device_activity(struct hl_device *hdev, bool open)
 int hl_fw_send_heartbeat(struct hl_device *hdev)
 {
        struct cpucp_packet hb_pkt;
-       u64 result;
+       u64 result = 0;
        int rc;
 
        memset(&hb_pkt, 0, sizeof(hb_pkt));
@@ -910,7 +930,7 @@ static int hl_fw_send_msi_info_msg(struct hl_device *hdev)
 {
        struct cpucp_array_data_packet *pkt;
        size_t total_pkt_size, data_size;
-       u64 result;
+       u64 result = 0;
        int rc;
 
        /* skip sending this info for unsupported ASICs */
@@ -1001,11 +1021,10 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                        HL_CPUCP_EEPROM_TIMEOUT_USEC, &result);
-
        if (rc) {
-               dev_err(hdev->dev,
-                       "Failed to handle CPU-CP EEPROM packet, error %d\n",
-                       rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to handle CPU-CP EEPROM packet, error %d\n", rc);
                goto out;
        }
 
@@ -1046,7 +1065,9 @@ int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data)
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                                        HL_CPUCP_MON_DUMP_TIMEOUT_USEC, &result);
        if (rc) {
-               dev_err(hdev->dev, "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc);
                goto out;
        }
 
@@ -1080,8 +1101,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                        HL_CPUCP_INFO_TIMEOUT_USEC, &result);
        if (rc) {
-               dev_err(hdev->dev,
-                       "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
                return rc;
        }
        counters->rx_throughput = result;
@@ -1095,8 +1117,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                        HL_CPUCP_INFO_TIMEOUT_USEC, &result);
        if (rc) {
-               dev_err(hdev->dev,
-                       "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
                return rc;
        }
        counters->tx_throughput = result;
@@ -1109,8 +1132,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                        HL_CPUCP_INFO_TIMEOUT_USEC, &result);
        if (rc) {
-               dev_err(hdev->dev,
-                       "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
                return rc;
        }
        counters->replay_cnt = (u32) result;
@@ -1130,9 +1154,9 @@ int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                        HL_CPUCP_INFO_TIMEOUT_USEC, &result);
        if (rc) {
-               dev_err(hdev->dev,
-                       "Failed to handle CpuCP total energy pkt, error %d\n",
-                               rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to handle CpuCP total energy pkt, error %d\n", rc);
                return rc;
        }
 
@@ -1208,7 +1232,8 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                        HL_CPUCP_INFO_TIMEOUT_USEC, &result);
        if (rc) {
-               dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc);
                return rc;
        }
 
@@ -1235,7 +1260,8 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                        HL_CPUCP_INFO_TIMEOUT_USEC, &result);
        if (rc) {
-               dev_err(hdev->dev, "Failed to read power, error %d\n", rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev, "Failed to read power, error %d\n", rc);
                return rc;
        }
 
@@ -1272,8 +1298,9 @@ int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                        HL_CPUCP_INFO_TIMEOUT_USEC, &result);
        if (rc) {
-               dev_err(hdev->dev,
-                       "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
                goto out;
        }
 
@@ -1298,7 +1325,8 @@ int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num)
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
        if (rc) {
-               dev_err(hdev->dev,
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
                                "Failed to handle CPU-CP pending rows info pkt, error %d\n", rc);
                goto out;
        }
@@ -3147,10 +3175,10 @@ long hl_fw_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
        pkt.pll_index = cpu_to_le32((u32)used_pll_idx);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
-
        if (rc) {
-               dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n",
-                       used_pll_idx, rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n",
+                               used_pll_idx, rc);
                return rc;
        }
 
@@ -3174,8 +3202,7 @@ void hl_fw_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
        pkt.value = cpu_to_le64(freq);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
-
-       if (rc)
+       if (rc && rc != -EAGAIN)
                dev_err(hdev->dev, "Failed to set frequency to PLL %d, error %d\n",
                        used_pll_idx, rc);
 }
@@ -3191,9 +3218,9 @@ long hl_fw_get_max_power(struct hl_device *hdev)
        pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET << CPUCP_PKT_CTL_OPCODE_SHIFT);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
-
        if (rc) {
-               dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
                return rc;
        }
 
@@ -3215,8 +3242,7 @@ void hl_fw_set_max_power(struct hl_device *hdev)
        pkt.value = cpu_to_le64(hdev->max_power);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
-
-       if (rc)
+       if (rc && rc != -EAGAIN)
                dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);
 }
 
@@ -3242,11 +3268,11 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void
        pkt.data_max_size = cpu_to_le32(size);
        pkt.nonce = cpu_to_le32(nonce);
 
-       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       timeout, NULL);
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), timeout, NULL);
        if (rc) {
-               dev_err(hdev->dev,
-                       "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc);
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc);
                goto out;
        }
 
@@ -3288,10 +3314,12 @@ int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *)&pkt, sizeof(pkt),
                                                HL_CPUCP_INFO_TIMEOUT_USEC, &result);
-       if (rc)
-               dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n");
-       else
+       if (rc) {
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n");
+       } else {
                dev_dbg(hdev->dev, "generic pkt was successful, result: 0x%llx\n", result);
+       }
 
        *size = (u32)result;
 
index 36b951b5f5039d6a86bd4c7779eccd7a79d5632e..52d1e6bf10dce4fdfb44c791ef3f8329650cd0e3 100644 (file)
@@ -585,9 +585,10 @@ int hl_get_temperature(struct hl_device *hdev,
        *value = (long) result;
 
        if (rc) {
-               dev_err_ratelimited(hdev->dev,
-                       "Failed to get temperature from sensor %d, error %d\n",
-                       sensor_index, rc);
+               if (rc != -EAGAIN)
+                       dev_err_ratelimited(hdev->dev,
+                               "Failed to get temperature from sensor %d, error %d\n",
+                               sensor_index, rc);
                *value = 0;
        }
 
@@ -610,8 +611,7 @@ int hl_set_temperature(struct hl_device *hdev,
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                                0, NULL);
-
-       if (rc)
+       if (rc && rc != -EAGAIN)
                dev_err_ratelimited(hdev->dev,
                        "Failed to set temperature of sensor %d, error %d\n",
                        sensor_index, rc);
@@ -639,9 +639,10 @@ int hl_get_voltage(struct hl_device *hdev,
        *value = (long) result;
 
        if (rc) {
-               dev_err_ratelimited(hdev->dev,
-                       "Failed to get voltage from sensor %d, error %d\n",
-                       sensor_index, rc);
+               if (rc != -EAGAIN)
+                       dev_err_ratelimited(hdev->dev,
+                               "Failed to get voltage from sensor %d, error %d\n",
+                               sensor_index, rc);
                *value = 0;
        }
 
@@ -668,9 +669,10 @@ int hl_get_current(struct hl_device *hdev,
        *value = (long) result;
 
        if (rc) {
-               dev_err_ratelimited(hdev->dev,
-                       "Failed to get current from sensor %d, error %d\n",
-                       sensor_index, rc);
+               if (rc != -EAGAIN)
+                       dev_err_ratelimited(hdev->dev,
+                               "Failed to get current from sensor %d, error %d\n",
+                               sensor_index, rc);
                *value = 0;
        }
 
@@ -697,9 +699,10 @@ int hl_get_fan_speed(struct hl_device *hdev,
        *value = (long) result;
 
        if (rc) {
-               dev_err_ratelimited(hdev->dev,
-                       "Failed to get fan speed from sensor %d, error %d\n",
-                       sensor_index, rc);
+               if (rc != -EAGAIN)
+                       dev_err_ratelimited(hdev->dev,
+                               "Failed to get fan speed from sensor %d, error %d\n",
+                               sensor_index, rc);
                *value = 0;
        }
 
@@ -726,9 +729,10 @@ int hl_get_pwm_info(struct hl_device *hdev,
        *value = (long) result;
 
        if (rc) {
-               dev_err_ratelimited(hdev->dev,
-                       "Failed to get pwm info from sensor %d, error %d\n",
-                       sensor_index, rc);
+               if (rc != -EAGAIN)
+                       dev_err_ratelimited(hdev->dev,
+                               "Failed to get pwm info from sensor %d, error %d\n",
+                               sensor_index, rc);
                *value = 0;
        }
 
@@ -751,8 +755,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                                0, NULL);
-
-       if (rc)
+       if (rc && rc != -EAGAIN)
                dev_err_ratelimited(hdev->dev,
                        "Failed to set pwm info to sensor %d, error %d\n",
                        sensor_index, rc);
@@ -774,8 +777,7 @@ int hl_set_voltage(struct hl_device *hdev,
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                                0, NULL);
-
-       if (rc)
+       if (rc && rc != -EAGAIN)
                dev_err_ratelimited(hdev->dev,
                        "Failed to set voltage of sensor %d, error %d\n",
                        sensor_index, rc);
@@ -797,10 +799,8 @@ int hl_set_current(struct hl_device *hdev,
        pkt.type = __cpu_to_le16(attr);
        pkt.value = __cpu_to_le64(value);
 
-       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               0, NULL);
-
-       if (rc)
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+       if (rc && rc != -EAGAIN)
                dev_err_ratelimited(hdev->dev,
                        "Failed to set current of sensor %d, error %d\n",
                        sensor_index, rc);
@@ -830,8 +830,7 @@ int hl_set_power(struct hl_device *hdev,
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
                                                0, NULL);
-
-       if (rc)
+       if (rc && rc != -EAGAIN)
                dev_err_ratelimited(hdev->dev,
                        "Failed to set power of sensor %d, error %d\n",
                        sensor_index, rc);
@@ -859,9 +858,10 @@ int hl_get_power(struct hl_device *hdev,
        *value = (long) result;
 
        if (rc) {
-               dev_err_ratelimited(hdev->dev,
-                       "Failed to get power of sensor %d, error %d\n",
-                       sensor_index, rc);
+               if (rc != -EAGAIN)
+                       dev_err_ratelimited(hdev->dev,
+                               "Failed to get power of sensor %d, error %d\n",
+                               sensor_index, rc);
                *value = 0;
        }
 
index f2b04ffb0ecb281f5a0b4f43d480e4b9b051ea55..fa893a9b826ec4d9e3fc3c867a969a2621ecaf42 100644 (file)
@@ -1639,10 +1639,8 @@ static int gaudi_late_init(struct hl_device *hdev)
        }
 
        rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, 0x0);
-       if (rc) {
-               dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
+       if (rc)
                return rc;
-       }
 
        /* Scrub both SRAM and DRAM */
        rc = hdev->asic_funcs->scrub_device_mem(hdev);
@@ -4154,13 +4152,7 @@ skip_reset:
 
 static int gaudi_suspend(struct hl_device *hdev)
 {
-       int rc;
-
-       rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
-       if (rc)
-               dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
-
-       return rc;
+       return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
 }
 
 static int gaudi_resume(struct hl_device *hdev)
index ba1518f2bf5c8581cb9039ed08382250aeafd80c..962b7fcd4318c55b5d5f7f66aedcac530bf982a4 100644 (file)
@@ -3312,10 +3312,8 @@ static int gaudi2_late_init(struct hl_device *hdev)
 
        rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS,
                                        gaudi2->virt_msix_db_dma_addr);
-       if (rc) {
-               dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
+       if (rc)
                return rc;
-       }
 
        rc = gaudi2_fetch_psoc_frequency(hdev);
        if (rc) {
@@ -6467,13 +6465,7 @@ skip_reset:
 
 static int gaudi2_suspend(struct hl_device *hdev)
 {
-       int rc;
-
-       rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
-       if (rc)
-               dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
-
-       return rc;
+       return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
 }
 
 static int gaudi2_resume(struct hl_device *hdev)
index 5a359c3bdc782e618b6d8d55fc7c635fec67bb11..84768e306269906d9b95324566100728d0251861 100644 (file)
@@ -893,11 +893,8 @@ int goya_late_init(struct hl_device *hdev)
        WREG32(mmMMU_LOG2_DDR_SIZE, ilog2(prop->dram_size));
 
        rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, 0x0);
-       if (rc) {
-               dev_err(hdev->dev,
-                       "Failed to enable PCI access from CPU %d\n", rc);
+       if (rc)
                return rc;
-       }
 
        /* force setting to low frequency */
        goya->curr_pll_profile = PLL_LOW;
@@ -2864,13 +2861,7 @@ static int goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 
 int goya_suspend(struct hl_device *hdev)
 {
-       int rc;
-
-       rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
-       if (rc)
-               dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
-
-       return rc;
+       return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
 }
 
 int goya_resume(struct hl_device *hdev)