habanalabs: fix soft reset accounting

A reset upon device release is not a soft-reset from the user's/system's
point of view. As such, we shouldn't count that reset in the statistics we
gather and expose to the monitoring applications.

We also shouldn't print soft-reset messages when doing the reset upon
device release.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Oded Gabbay 2021-11-16 09:46:02 +02:00
parent d8eb50f31c
commit 60e0431f41
1 changed file with 25 additions and 25 deletions

View File

@ -962,13 +962,13 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
*/
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
reset_upon_device_release = false;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
int i, rc;
if (!hdev->init_done) {
dev_err(hdev->dev,
"Can't reset before initialization is done\n");
dev_err(hdev->dev, "Can't reset before initialization is done\n");
return 0;
}
@ -988,6 +988,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
return -EINVAL;
}
reset_upon_device_release = true;
goto do_reset;
}
@ -1024,12 +1026,10 @@ do_reset:
if (hard_reset)
dev_info(hdev->dev, "Going to reset device\n");
else if (flags & HL_DRV_RESET_DEV_RELEASE)
dev_info(hdev->dev,
"Going to reset device after it was released by user\n");
else if (reset_upon_device_release)
dev_info(hdev->dev, "Going to reset device after release by user\n");
else
dev_info(hdev->dev,
"Going to reset compute engines of inference device\n");
dev_info(hdev->dev, "Going to reset engines of inference device\n");
}
again:
@ -1174,16 +1174,14 @@ kill_processes:
rc = hdev->asic_funcs->hw_init(hdev);
if (rc) {
dev_err(hdev->dev,
"failed to initialize the H/W after reset\n");
dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
goto out_err;
}
/* If device is not idle fail the reset process */
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
dev_err(hdev->dev,
"device is not idle (mask 0x%llx_%llx) after reset\n",
dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
idle_mask[1], idle_mask[0]);
rc = -EIO;
goto out_err;
@ -1192,23 +1190,20 @@ kill_processes:
/* Check that the communication with the device is working */
rc = hdev->asic_funcs->test_queues(hdev);
if (rc) {
dev_err(hdev->dev,
"Failed to detect if device is alive after reset\n");
dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
goto out_err;
}
if (hard_reset) {
rc = device_late_init(hdev);
if (rc) {
dev_err(hdev->dev,
"Failed late init after hard reset\n");
dev_err(hdev->dev, "Failed late init after hard reset\n");
goto out_err;
}
rc = hl_vm_init(hdev);
if (rc) {
dev_err(hdev->dev,
"Failed to init memory module after hard reset\n");
dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
goto out_err;
}
@ -1216,8 +1211,11 @@ kill_processes:
} else {
rc = hdev->asic_funcs->soft_reset_late_init(hdev);
if (rc) {
dev_err(hdev->dev,
"Failed late init after soft reset\n");
if (reset_upon_device_release)
dev_err(hdev->dev,
"Failed late init in reset after device release\n");
else
dev_err(hdev->dev, "Failed late init after soft reset\n");
goto out_err;
}
}
@ -1236,7 +1234,7 @@ kill_processes:
* the device will be operational although it shouldn't be
*/
hdev->asic_funcs->enable_events_from_fw(hdev);
} else {
} else if (!reset_upon_device_release) {
hdev->soft_reset_cnt++;
}
@ -1246,12 +1244,14 @@ out_err:
hdev->disabled = true;
if (hard_reset) {
dev_err(hdev->dev,
"Failed to reset! Device is NOT usable\n");
dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
hdev->hard_reset_cnt++;
} else if (reset_upon_device_release) {
dev_err(hdev->dev, "Failed to reset device after user release\n");
hard_reset = true;
goto again;
} else {
dev_err(hdev->dev,
"Failed to do soft-reset, trying hard reset\n");
dev_err(hdev->dev, "Failed to do soft-reset\n");
hdev->soft_reset_cnt++;
hard_reset = true;
goto again;