habanalabs: set 4s timeout for message to device CPU
We see that sometimes the CPU in GOYA and GAUDI is occupied by the power/thermal loop and can't answer requests from the driver fast enough. Therefore, to avoid false notifications on timeouts, increase the timeout to 4 seconds on each message sent to the device CPU. Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> Reviewed-by: Tomer Tayar <ttayar@habana.ai>
This commit is contained in:
parent
e38bfd30e0
commit
788cacf308
|
@ -36,7 +36,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
|
|||
pkt.i2c_reg = i2c_reg;
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
HL_DEVICE_TIMEOUT_USEC, (long *) val);
|
||||
0, (long *) val);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
|
||||
|
@ -63,7 +63,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
|
|||
pkt.value = cpu_to_le64(val);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
HL_DEVICE_TIMEOUT_USEC, NULL);
|
||||
0, NULL);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc);
|
||||
|
@ -87,7 +87,7 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
|
|||
pkt.value = cpu_to_le64(state);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
HL_DEVICE_TIMEOUT_USEC, NULL);
|
||||
0, NULL);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc);
|
||||
|
|
|
@ -61,7 +61,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
|
|||
pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
|
||||
|
||||
return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
|
||||
sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
|
||||
sizeof(pkt), 0, NULL);
|
||||
}
|
||||
|
||||
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
|
||||
|
@ -144,7 +144,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
|
|||
pkt.value = cpu_to_le64(event_type);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
HL_DEVICE_TIMEOUT_USEC, &result);
|
||||
0, &result);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
|
||||
|
@ -183,7 +183,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
|
|||
ARMCP_PKT_CTL_OPCODE_SHIFT);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
|
||||
total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result);
|
||||
total_pkt_size, 0, &result);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev, "failed to unmask IRQ array\n");
|
||||
|
@ -204,7 +204,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
|
|||
test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
|
||||
sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
|
||||
sizeof(test_pkt), 0, &result);
|
||||
|
||||
if (!rc) {
|
||||
if (result != ARMCP_PACKET_FENCE_VAL)
|
||||
|
@ -248,7 +248,7 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
|
|||
hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
|
||||
sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
|
||||
sizeof(hb_pkt), 0, &result);
|
||||
|
||||
if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
|
||||
rc = -EIO;
|
||||
|
|
|
@ -80,6 +80,7 @@
|
|||
#define GAUDI_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
|
||||
#define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
|
||||
#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC 1000000 /* 1s */
|
||||
#define GAUDI_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */
|
||||
|
||||
#define GAUDI_QMAN0_FENCE_VAL 0x72E91AB9
|
||||
|
||||
|
@ -3479,6 +3480,9 @@ static int gaudi_send_cpu_message(struct hl_device *hdev, u32 *msg,
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (!timeout)
|
||||
timeout = GAUDI_MSG_TO_CPU_TIMEOUT_USEC;
|
||||
|
||||
return hl_fw_send_cpu_message(hdev, GAUDI_QUEUE_ID_CPU_PQ, msg, len,
|
||||
timeout, result);
|
||||
}
|
||||
|
|
|
@ -88,6 +88,7 @@
|
|||
#define GOYA_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100)
|
||||
#define GOYA_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
|
||||
#define GOYA_BOOT_FIT_REQ_TIMEOUT_USEC 1000000 /* 1s */
|
||||
#define GOYA_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */
|
||||
|
||||
#define GOYA_QMAN0_FENCE_VAL 0xD169B243
|
||||
|
||||
|
@ -2830,6 +2831,9 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (!timeout)
|
||||
timeout = GOYA_MSG_TO_CPU_TIMEOUT_USEC;
|
||||
|
||||
return hl_fw_send_cpu_message(hdev, GOYA_QUEUE_ID_CPU_PQ, msg, len,
|
||||
timeout, result);
|
||||
}
|
||||
|
@ -4431,8 +4435,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
|
|||
pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
|
||||
ARMCP_PKT_CTL_OPCODE_SHIFT);
|
||||
|
||||
rc = goya_send_cpu_message(hdev, (u32 *) pkt, total_pkt_size,
|
||||
HL_DEVICE_TIMEOUT_USEC, &result);
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
|
||||
total_pkt_size, 0, &result);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev, "failed to unmask IRQ array\n");
|
||||
|
@ -4464,8 +4468,8 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
|
|||
ARMCP_PKT_CTL_OPCODE_SHIFT);
|
||||
pkt.value = cpu_to_le64(event_type);
|
||||
|
||||
rc = goya_send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
HL_DEVICE_TIMEOUT_USEC, &result);
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
0, &result);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
|
||||
|
|
|
@ -588,7 +588,11 @@ enum hl_pll_frequency {
|
|||
* @hw_queues_unlock: release H/W queues lock.
|
||||
* @get_pci_id: retrieve PCI ID.
|
||||
* @get_eeprom_data: retrieve EEPROM data from F/W.
|
||||
* @send_cpu_message: send buffer to ArmCP.
|
||||
* @send_cpu_message: send message to F/W. If the message is timedout, the
|
||||
* driver will eventually reset the device. The timeout can
|
||||
* be determined by the calling function or it can be 0 and
|
||||
* then the timeout is the default timeout for the specific
|
||||
* ASIC
|
||||
* @get_hw_state: retrieve the H/W state
|
||||
* @pci_bars_map: Map PCI BARs.
|
||||
* @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
#include <linux/pci.h>
|
||||
#include <linux/hwmon.h>
|
||||
|
||||
#define SENSORS_PKT_TIMEOUT 1000000 /* 1s */
|
||||
#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1)
|
||||
|
||||
int hl_build_hwmon_channel_info(struct hl_device *hdev,
|
||||
|
@ -323,7 +322,7 @@ int hl_get_temperature(struct hl_device *hdev,
|
|||
pkt.type = __cpu_to_le16(attr);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, value);
|
||||
0, value);
|
||||
|
||||
if (rc) {
|
||||
dev_err(hdev->dev,
|
||||
|
@ -350,7 +349,7 @@ int hl_set_temperature(struct hl_device *hdev,
|
|||
pkt.value = __cpu_to_le64(value);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, NULL);
|
||||
0, NULL);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev,
|
||||
|
@ -374,7 +373,7 @@ int hl_get_voltage(struct hl_device *hdev,
|
|||
pkt.type = __cpu_to_le16(attr);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, value);
|
||||
0, value);
|
||||
|
||||
if (rc) {
|
||||
dev_err(hdev->dev,
|
||||
|
@ -400,7 +399,7 @@ int hl_get_current(struct hl_device *hdev,
|
|||
pkt.type = __cpu_to_le16(attr);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, value);
|
||||
0, value);
|
||||
|
||||
if (rc) {
|
||||
dev_err(hdev->dev,
|
||||
|
@ -426,7 +425,7 @@ int hl_get_fan_speed(struct hl_device *hdev,
|
|||
pkt.type = __cpu_to_le16(attr);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, value);
|
||||
0, value);
|
||||
|
||||
if (rc) {
|
||||
dev_err(hdev->dev,
|
||||
|
@ -452,7 +451,7 @@ int hl_get_pwm_info(struct hl_device *hdev,
|
|||
pkt.type = __cpu_to_le16(attr);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, value);
|
||||
0, value);
|
||||
|
||||
if (rc) {
|
||||
dev_err(hdev->dev,
|
||||
|
@ -479,7 +478,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
|
|||
pkt.value = cpu_to_le64(value);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, NULL);
|
||||
0, NULL);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev,
|
||||
|
@ -502,7 +501,7 @@ int hl_set_voltage(struct hl_device *hdev,
|
|||
pkt.value = __cpu_to_le64(value);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, NULL);
|
||||
0, NULL);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev,
|
||||
|
@ -527,7 +526,7 @@ int hl_set_current(struct hl_device *hdev,
|
|||
pkt.value = __cpu_to_le64(value);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SENSORS_PKT_TIMEOUT, NULL);
|
||||
0, NULL);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev,
|
||||
|
|
|
@ -9,9 +9,6 @@
|
|||
|
||||
#include <linux/pci.h>
|
||||
|
||||
#define SET_CLK_PKT_TIMEOUT 1000000 /* 1s */
|
||||
#define SET_PWR_PKT_TIMEOUT 1000000 /* 1s */
|
||||
|
||||
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
|
||||
{
|
||||
struct armcp_packet pkt;
|
||||
|
@ -29,7 +26,7 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
|
|||
pkt.pll_index = cpu_to_le32(pll_index);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SET_CLK_PKT_TIMEOUT, &result);
|
||||
0, &result);
|
||||
|
||||
if (rc) {
|
||||
dev_err(hdev->dev,
|
||||
|
@ -54,7 +51,7 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
|
|||
pkt.value = cpu_to_le64(freq);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SET_CLK_PKT_TIMEOUT, NULL);
|
||||
0, NULL);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev,
|
||||
|
@ -74,7 +71,7 @@ u64 hl_get_max_power(struct hl_device *hdev)
|
|||
ARMCP_PKT_CTL_OPCODE_SHIFT);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SET_PWR_PKT_TIMEOUT, &result);
|
||||
0, &result);
|
||||
|
||||
if (rc) {
|
||||
dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
|
||||
|
@ -96,7 +93,7 @@ void hl_set_max_power(struct hl_device *hdev, u64 value)
|
|||
pkt.value = cpu_to_le64(value);
|
||||
|
||||
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||
SET_PWR_PKT_TIMEOUT, NULL);
|
||||
0, NULL);
|
||||
|
||||
if (rc)
|
||||
dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);
|
||||
|
|
Loading…
Reference in New Issue