habanalabs: add debug flag to prevent failure on timeout

Sometimes it is useful to allow the command to continue running despite
the timeout occurred, to differentiate between really stuck or just very
time consuming commands. This can be achieved by passing a new debug
flag alongside the cs, HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT.

Anyway, if the timeout occurred, a warning print shall be issued,
however this shall not fail the submission.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Yuri Nudelman 2021-05-25 14:49:52 +03:00 committed by Oded Gabbay
parent 254fac6d1a
commit 8e8125f192
3 changed files with 26 additions and 5 deletions

View File

@ -556,6 +556,13 @@ out:
else if (!cs->submitted)
cs->fence->error = -EBUSY;
if (unlikely(cs->skip_reset_on_timeout)) {
dev_err(hdev->dev,
"Command submission %llu completed after %llu (s)\n",
cs->sequence,
div_u64(jiffies - cs->submission_time_jiffies, HZ));
}
if (cs->timestamp)
cs->fence->timestamp = ktime_get();
complete_all(&cs->fence->completion);
@ -571,6 +578,8 @@ static void cs_timedout(struct work_struct *work)
int rc;
struct hl_cs *cs = container_of(work, struct hl_cs,
work_tdr.work);
bool skip_reset_on_timeout = cs->skip_reset_on_timeout;
rc = cs_get_unless_zero(cs);
if (!rc)
return;
@ -581,7 +590,8 @@ static void cs_timedout(struct work_struct *work)
}
/* Mark the CS is timed out so we won't try to cancel its TDR */
cs->timedout = true;
if (likely(!skip_reset_on_timeout))
cs->timedout = true;
hdev = cs->ctx->hdev;
@ -613,10 +623,12 @@ static void cs_timedout(struct work_struct *work)
cs_put(cs);
if (hdev->reset_on_lockup)
hl_device_reset(hdev, HL_RESET_TDR);
else
hdev->needs_reset = true;
if (likely(!skip_reset_on_timeout)) {
if (hdev->reset_on_lockup)
hl_device_reset(hdev, HL_RESET_TDR);
else
hdev->needs_reset = true;
}
}
static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
@ -650,6 +662,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
cs->type = cs_type;
cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
cs->timeout_jiffies = timeout;
cs->skip_reset_on_timeout =
!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
cs->submission_time_jiffies = jiffies;
INIT_LIST_HEAD(&cs->job_list);
INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
kref_init(&cs->refcount);

View File

@ -1421,6 +1421,7 @@ struct hl_userptr {
* @staged_sequence: the sequence of the staged submission this CS is part of,
* relevant only if staged_cs is set.
* @timeout_jiffies: cs timeout in jiffies.
* @submission_time_jiffies: submission time of the cs
* @type: CS_TYPE_*.
* @submitted: true if CS was submitted to H/W.
* @completed: true if CS was completed by device.
@ -1433,6 +1434,8 @@ struct hl_userptr {
* @staged_first: true if this is the first staged CS and we need to receive
* timeout for this CS.
* @staged_cs: true if this CS is part of a staged submission.
* @skip_reset_on_timeout: true if we shall not reset the device in case
* timeout occurs (debug scenario).
*/
struct hl_cs {
u16 *jobs_in_queue_cnt;
@ -1450,6 +1453,7 @@ struct hl_cs {
u64 sequence;
u64 staged_sequence;
u64 timeout_jiffies;
u64 submission_time_jiffies;
enum hl_cs_type type;
u8 submitted;
u8 completed;
@ -1460,6 +1464,7 @@ struct hl_cs {
u8 staged_last;
u8 staged_first;
u8 staged_cs;
u8 skip_reset_on_timeout;
};
/**

View File

@ -664,6 +664,7 @@ struct hl_cs_chunk {
#define HL_CS_FLAGS_STAGED_SUBMISSION_FIRST 0x80
#define HL_CS_FLAGS_STAGED_SUBMISSION_LAST 0x100
#define HL_CS_FLAGS_CUSTOM_TIMEOUT 0x200
#define HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT 0x400
#define HL_CS_STATUS_SUCCESS 0