debugfs: add skip_reset_on_timeout option
To be able to debug long-running CS better, without changing the userspace code, we are adding a new option through the debugfs interface to skip the reset of the device in case of CS timeout.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
38e19d0b87
commit
4d041216c8
|
@ -207,6 +207,14 @@ Contact: ogabbay@kernel.org
|
|||
Description: Sets the PCI power state. Valid values are "1" for D0 and "2"
|
||||
for D3Hot
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/skip_reset_on_timeout
|
||||
Date: Jun 2021
|
||||
KernelVersion: 5.13
|
||||
Contact: ynudelman@habana.ai
|
||||
Description: Sets the skip reset on timeout option for the device. Value of
|
||||
"0" means device will be reset in case some CS has timed out,
|
||||
any other (non-zero) value means it will not be reset.
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
|
||||
Date: Mar 2020
|
||||
KernelVersion: 5.6
|
||||
|
|
|
@ -663,6 +663,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
|
|||
cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
|
||||
cs->timeout_jiffies = timeout;
|
||||
cs->skip_reset_on_timeout =
|
||||
hdev->skip_reset_on_timeout ||
|
||||
!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
|
||||
cs->submission_time_jiffies = jiffies;
|
||||
INIT_LIST_HEAD(&cs->job_list);
|
||||
|
|
|
@ -1278,6 +1278,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
|
|||
dev_entry->root,
|
||||
&dev_entry->blob_desc);
|
||||
|
||||
debugfs_create_x8("skip_reset_on_timeout",
|
||||
0644,
|
||||
dev_entry->root,
|
||||
&hdev->skip_reset_on_timeout);
|
||||
|
||||
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
|
||||
debugfs_create_file(hl_debugfs_list[i].name,
|
||||
0444,
|
||||
|
|
|
@ -2191,6 +2191,8 @@ struct hl_mmu_funcs {
|
|||
* @supports_staged_submission: true if staged submissions are supported
|
||||
* @curr_reset_cause: saves an enumerated reset cause when a hard reset is
|
||||
* triggered, and cleared after it is shared with preboot.
|
||||
* @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
|
||||
* complete instead.
|
||||
*/
|
||||
struct hl_device {
|
||||
struct pci_dev *pdev;
|
||||
|
@ -2305,6 +2307,7 @@ struct hl_device {
|
|||
u8 device_fini_pending;
|
||||
u8 supports_staged_submission;
|
||||
u8 curr_reset_cause;
|
||||
u8 skip_reset_on_timeout;
|
||||
|
||||
/* Parameters for bring-up */
|
||||
u64 nic_ports_mask;
|
||||
|
|
Loading…
Reference in New Issue