habanalabs: define soft-reset as inference op
Soft-reset is the procedure where we reset only the compute/DMA engines of the device, without requiring the current user-space process to release the device. This type of reset can happen if a TDR event occurs (a workload got stuck) or by a root request through sysfs.

This is only relevant for inference ASICs, as there is no real-world use-case for it in training, because training runs on multiple devices.

In addition, we also do (in certain ASICs) a reset upon device release. That reset uses the same code as the soft-reset. Therefore, to better differentiate between the two resets, it is better to rename the soft-reset support to "inference soft-reset", to make the code more self-explanatory.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
dd08335fb9
commit
a00f1f571e
|
@ -954,7 +954,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
|
|||
goto do_reset;
|
||||
}
|
||||
|
||||
if (!hard_reset && !hdev->allow_external_soft_reset) {
|
||||
if (!hard_reset && !hdev->allow_inference_soft_reset) {
|
||||
hard_instead_soft = true;
|
||||
hard_reset = true;
|
||||
}
|
||||
|
|
|
@ -2440,8 +2440,12 @@ struct multi_cs_data {
|
|||
* @collective_mon_idx: helper index for collective initialization
|
||||
* @supports_coresight: is CoreSight supported.
|
||||
* @supports_soft_reset: is soft reset supported.
|
||||
* @allow_external_soft_reset: true if soft reset initiated by user or TDR is
|
||||
* allowed.
|
||||
* @allow_inference_soft_reset: true if the ASIC supports soft reset that is
|
||||
* initiated by user or TDR. This is only true
|
||||
* in inference ASICs, as there is no real-world
|
||||
* use-case of doing soft-reset in training (due
|
||||
* to the fact that training runs on multiple
|
||||
* devices)
|
||||
* @supports_cb_mapping: is mapping a CB to the device's MMU supported.
|
||||
* @needs_reset: true if reset_on_lockup is false and device should be reset
|
||||
* due to lockup.
|
||||
|
@ -2572,7 +2576,7 @@ struct hl_device {
|
|||
u8 collective_mon_idx;
|
||||
u8 supports_coresight;
|
||||
u8 supports_soft_reset;
|
||||
u8 allow_external_soft_reset;
|
||||
u8 allow_inference_soft_reset;
|
||||
u8 supports_cb_mapping;
|
||||
u8 needs_reset;
|
||||
u8 process_kill_trial_cnt;
|
||||
|
|
|
@ -206,12 +206,12 @@ static ssize_t soft_reset_store(struct device *dev,
|
|||
goto out;
|
||||
}
|
||||
|
||||
if (!hdev->allow_external_soft_reset) {
|
||||
dev_err(hdev->dev, "Device does not support soft-reset\n");
|
||||
if (!hdev->allow_inference_soft_reset) {
|
||||
dev_err(hdev->dev, "Device does not support inference soft-reset\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");
|
||||
dev_warn(hdev->dev, "Inference Soft-Reset requested through sysfs\n");
|
||||
|
||||
hl_device_reset(hdev, 0);
|
||||
|
||||
|
|
|
@ -959,7 +959,7 @@ static int goya_sw_init(struct hl_device *hdev)
|
|||
spin_lock_init(&goya->hw_queues_lock);
|
||||
hdev->supports_coresight = true;
|
||||
hdev->supports_soft_reset = true;
|
||||
hdev->allow_external_soft_reset = true;
|
||||
hdev->allow_inference_soft_reset = true;
|
||||
hdev->supports_wait_for_multi_cs = false;
|
||||
|
||||
hdev->asic_funcs->set_pci_memory_regions(hdev);
|
||||
|
|
Loading…
Reference in New Issue