habanalabs: improve IOCTLs behavior when disabled or reset
This patch improves how IOCTLs behave when the device is disabled or under reset. The new code checks, at the start of every IOCTL, whether the device is disabled or in reset. If so, it prints an appropriate kernel message and returns -EBUSY to user-space. In addition, the code changes where the hard_reset_pending flag is set and cleared: 1. It is now cleared immediately after the reset *tear-down* flow finishes, but before the re-initialization flow begins. 2. It is now set in the remove function of the device, to make the behavior the same as in the hard-reset flow. There are two exceptions to the disabled-or-in-reset check: 1. The HL_INFO_DEVICE_STATUS opcode in the INFO IOCTL. This opcode allows the user to inquire about the status of the device: whether it is operational, in reset, or malfunctioning (disabled). If the driver blocked this IOCTL, the user would not be able to retrieve the status in case of malfunction or reset. 2. The WAIT_FOR_CS IOCTL. This IOCTL allows the user to inquire about the status of a CS. We want to allow the user to continue to do so even after a soft-reset process has started, because it allows the user to get the correct error code for each CS they submitted. Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
This commit is contained in:
parent
caa3c8e525
commit
3f5398cfbf
|
@ -214,6 +214,13 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
|
||||||
u64 handle;
|
u64 handle;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
if (hl_device_disabled_or_in_reset(hdev)) {
|
||||||
|
dev_warn_ratelimited(hdev->dev,
|
||||||
|
"Device is %s. Can't execute CB IOCTL\n",
|
||||||
|
atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
|
||||||
|
return -EBUSY;
|
||||||
|
}
|
||||||
|
|
||||||
switch (args->in.op) {
|
switch (args->in.op) {
|
||||||
case HL_CB_OP_CREATE:
|
case HL_CB_OP_CREATE:
|
||||||
rc = hl_cb_create(hdev, &hpriv->cb_mgr, args->in.cb_size,
|
rc = hl_cb_create(hdev, &hpriv->cb_mgr, args->in.cb_size,
|
||||||
|
|
|
@ -641,6 +641,8 @@ again:
|
||||||
if ((hard_reset) && (!from_hard_reset_thread)) {
|
if ((hard_reset) && (!from_hard_reset_thread)) {
|
||||||
struct hl_device_reset_work *device_reset_work;
|
struct hl_device_reset_work *device_reset_work;
|
||||||
|
|
||||||
|
hdev->hard_reset_pending = true;
|
||||||
|
|
||||||
if (!hdev->pdev) {
|
if (!hdev->pdev) {
|
||||||
dev_err(hdev->dev,
|
dev_err(hdev->dev,
|
||||||
"Reset action is NOT supported in simulator\n");
|
"Reset action is NOT supported in simulator\n");
|
||||||
|
@ -648,8 +650,6 @@ again:
|
||||||
goto out_err;
|
goto out_err;
|
||||||
}
|
}
|
||||||
|
|
||||||
hdev->hard_reset_pending = true;
|
|
||||||
|
|
||||||
device_reset_work = kzalloc(sizeof(*device_reset_work),
|
device_reset_work = kzalloc(sizeof(*device_reset_work),
|
||||||
GFP_ATOMIC);
|
GFP_ATOMIC);
|
||||||
if (!device_reset_work) {
|
if (!device_reset_work) {
|
||||||
|
@ -718,6 +718,7 @@ again:
|
||||||
|
|
||||||
if (hard_reset) {
|
if (hard_reset) {
|
||||||
hdev->device_cpu_disabled = false;
|
hdev->device_cpu_disabled = false;
|
||||||
|
hdev->hard_reset_pending = false;
|
||||||
|
|
||||||
if (hdev->kernel_ctx) {
|
if (hdev->kernel_ctx) {
|
||||||
dev_crit(hdev->dev,
|
dev_crit(hdev->dev,
|
||||||
|
@ -779,8 +780,6 @@ again:
|
||||||
}
|
}
|
||||||
|
|
||||||
hl_set_max_power(hdev, hdev->max_power);
|
hl_set_max_power(hdev, hdev->max_power);
|
||||||
|
|
||||||
hdev->hard_reset_pending = false;
|
|
||||||
} else {
|
} else {
|
||||||
rc = hdev->asic_funcs->soft_reset_late_init(hdev);
|
rc = hdev->asic_funcs->soft_reset_late_init(hdev);
|
||||||
if (rc) {
|
if (rc) {
|
||||||
|
@ -1069,6 +1068,8 @@ void hl_device_fini(struct hl_device *hdev)
|
||||||
hdev->asic_funcs->hw_queues_lock(hdev);
|
hdev->asic_funcs->hw_queues_lock(hdev);
|
||||||
hdev->asic_funcs->hw_queues_unlock(hdev);
|
hdev->asic_funcs->hw_queues_unlock(hdev);
|
||||||
|
|
||||||
|
hdev->hard_reset_pending = true;
|
||||||
|
|
||||||
device_kill_open_processes(hdev);
|
device_kill_open_processes(hdev);
|
||||||
|
|
||||||
hl_hwmon_fini(hdev);
|
hl_hwmon_fini(hdev);
|
||||||
|
|
|
@ -202,7 +202,8 @@ static int hl_info_ioctl(struct hl_fpriv *hpriv, void *data)
|
||||||
|
|
||||||
if (hl_device_disabled_or_in_reset(hdev)) {
|
if (hl_device_disabled_or_in_reset(hdev)) {
|
||||||
dev_warn_ratelimited(hdev->dev,
|
dev_warn_ratelimited(hdev->dev,
|
||||||
"Device is disabled or in reset. Can't execute INFO IOCTL\n");
|
"Device is %s. Can't execute INFO IOCTL\n",
|
||||||
|
atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1159,7 +1159,8 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
|
||||||
|
|
||||||
if (hl_device_disabled_or_in_reset(hdev)) {
|
if (hl_device_disabled_or_in_reset(hdev)) {
|
||||||
dev_warn_ratelimited(hdev->dev,
|
dev_warn_ratelimited(hdev->dev,
|
||||||
"Device is disabled or in reset. Can't execute memory IOCTL\n");
|
"Device is %s. Can't execute MEMORY IOCTL\n",
|
||||||
|
atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue