habanalabs: allow reset upon device release

We introduce a new type of reset which is reset upon device release.
This reset is very similar to a soft reset, except that it is
performed only upon device release and not upon a user sysfs request
or TDR.

The purpose of this reset is to make sure the device is returned to
the IDLE state after the current user has finished working with the device.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Ofir Bitton 2021-06-08 17:24:52 +03:00 committed by Oded Gabbay
parent 4d041216c8
commit 23bace677a
4 changed files with 30 additions and 5 deletions

View File

@ -86,7 +86,7 @@ static void hpriv_release(struct kref *ref)
if ((hdev->reset_if_device_not_idle && !device_is_idle) if ((hdev->reset_if_device_not_idle && !device_is_idle)
|| hdev->reset_upon_device_release) || hdev->reset_upon_device_release)
hl_device_reset(hdev, 0); hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE);
} }
void hl_hpriv_get(struct hl_fpriv *hpriv) void hl_hpriv_get(struct hl_fpriv *hpriv)
@ -885,7 +885,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
int hl_device_reset(struct hl_device *hdev, u32 flags) int hl_device_reset(struct hl_device *hdev, u32 flags)
{ {
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
bool hard_reset, from_hard_reset_thread; bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
int i, rc; int i, rc;
if (!hdev->init_done) { if (!hdev->init_done) {
@ -897,11 +897,28 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hard_reset = (flags & HL_RESET_HARD) != 0; hard_reset = (flags & HL_RESET_HARD) != 0;
from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0; from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
if ((!hard_reset) && (!hdev->supports_soft_reset)) { if (!hard_reset && !hdev->supports_soft_reset) {
dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n"); hard_instead_soft = true;
hard_reset = true; hard_reset = true;
} }
if (hdev->reset_upon_device_release &&
(flags & HL_RESET_DEVICE_RELEASE)) {
dev_dbg(hdev->dev,
"Perform %s-reset upon device release\n",
hard_reset ? "hard" : "soft");
goto do_reset;
}
if (!hard_reset && !hdev->allow_external_soft_reset) {
hard_instead_soft = true;
hard_reset = true;
}
if (hard_instead_soft)
dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
do_reset:
/* Re-entry of reset thread */ /* Re-entry of reset thread */
if (from_hard_reset_thread && hdev->process_kill_trial_cnt) if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
goto kill_processes; goto kill_processes;

View File

@ -119,11 +119,15 @@ enum hl_mmu_page_table_location {
* *
* - HL_RESET_TDR * - HL_RESET_TDR
* Set if reset is due to TDR * Set if reset is due to TDR
*
* - HL_RESET_DEVICE_RELEASE
* Set if reset is due to device release
*/ */
#define HL_RESET_HARD (1 << 0) #define HL_RESET_HARD (1 << 0)
#define HL_RESET_FROM_RESET_THREAD (1 << 1) #define HL_RESET_FROM_RESET_THREAD (1 << 1)
#define HL_RESET_HEARTBEAT (1 << 2) #define HL_RESET_HEARTBEAT (1 << 2)
#define HL_RESET_TDR (1 << 3) #define HL_RESET_TDR (1 << 3)
#define HL_RESET_DEVICE_RELEASE (1 << 4)
#define HL_MAX_SOBS_PER_MONITOR 8 #define HL_MAX_SOBS_PER_MONITOR 8
@ -2181,6 +2185,8 @@ struct hl_mmu_funcs {
* @collective_mon_idx: helper index for collective initialization * @collective_mon_idx: helper index for collective initialization
* @supports_coresight: is CoreSight supported. * @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported. * @supports_soft_reset: is soft reset supported.
* @allow_external_soft_reset: true if soft reset initiated by user or TDR is
* allowed.
* @supports_cb_mapping: is mapping a CB to the device's MMU supported. * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
* @needs_reset: true if reset_on_lockup is false and device should be reset * @needs_reset: true if reset_on_lockup is false and device should be reset
* due to lockup. * due to lockup.
@ -2301,6 +2307,7 @@ struct hl_device {
u8 collective_mon_idx; u8 collective_mon_idx;
u8 supports_coresight; u8 supports_coresight;
u8 supports_soft_reset; u8 supports_soft_reset;
u8 allow_external_soft_reset;
u8 supports_cb_mapping; u8 supports_cb_mapping;
u8 needs_reset; u8 needs_reset;
u8 process_kill_trial_cnt; u8 process_kill_trial_cnt;

View File

@ -208,7 +208,7 @@ static ssize_t soft_reset_store(struct device *dev,
goto out; goto out;
} }
if (!hdev->supports_soft_reset) { if (!hdev->allow_external_soft_reset) {
dev_err(hdev->dev, "Device does not support soft-reset\n"); dev_err(hdev->dev, "Device does not support soft-reset\n");
goto out; goto out;
} }

View File

@ -954,6 +954,7 @@ static int goya_sw_init(struct hl_device *hdev)
spin_lock_init(&goya->hw_queues_lock); spin_lock_init(&goya->hw_queues_lock);
hdev->supports_coresight = true; hdev->supports_coresight = true;
hdev->supports_soft_reset = true; hdev->supports_soft_reset = true;
hdev->allow_external_soft_reset = true;
goya_set_pci_memory_regions(hdev); goya_set_pci_memory_regions(hdev);