diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 98cd52bb005f..b4fcad0e62f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -772,6 +772,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev) return adev->have_atomics_support; } +void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev) +{ + amdgpu_device_flush_hdp(adev, NULL); +} + void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset) { amdgpu_umc_poison_handler(adev, reset); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index dd740e64e6e1..2d0406bff84e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -322,6 +322,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, uint64_t *mmap_offset); int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem, struct dma_buf **dmabuf); +void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index a6570b124b2b..1fae97df7a1e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, pr_debug("Write ptr address == 0x%016llX\n", args->write_pointer_address); + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, NULL, 0); return 0; err_create_queue: @@ -2996,7 +2997,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v args->launch_mode.launch_mode); break; case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES: + r = suspend_queues(target, + args->suspend_queues.num_queues, + args->suspend_queues.grace_period, + args->suspend_queues.exception_mask, + (uint32_t *)args->suspend_queues.queue_array_ptr); + + break; case KFD_IOC_DBG_TRAP_RESUME_QUEUES: + r = resume_queues(target, args->resume_queues.num_queues, + (uint32_t *)args->resume_queues.queue_array_ptr); + break; case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH: case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH: case KFD_IOC_DBG_TRAP_SET_FLAGS: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index 53c3418562d4..f4d3dfb35cb3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -339,6 +339,13 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind } kfd_dbg_set_workaround(target, false); + + if (!unwind) { + int resume_count = resume_queues(target, 0, NULL); + + if (resume_count) + pr_debug("Resumed %d queues\n", resume_count); + } } static void kfd_dbg_clean_exception_status(struct kfd_process *target) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 44d87943e40a..bc9e81293165 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -952,6 +952,92 @@ out_unlock: return retval; } +/* suspend_single_queue does not lock the dqm like the + * evict_process_queues_cpsch or evict_process_queues_nocpsch. 
You should + * lock the dqm before calling, and unlock after calling. + * + * The reason we don't lock the dqm is because this function may be + * called on multiple queues in a loop, so rather than locking/unlocking + * multiple times, we will just keep the dqm locked for all of the calls. + */ +static int suspend_single_queue(struct device_queue_manager *dqm, + struct kfd_process_device *pdd, + struct queue *q) +{ + bool is_new; + + if (q->properties.is_suspended) + return 0; + + pr_debug("Suspending PASID %u queue [%i]\n", + pdd->process->pasid, + q->properties.queue_id); + + is_new = q->properties.exception_status & KFD_EC_MASK(EC_QUEUE_NEW); + + if (is_new || q->properties.is_being_destroyed) { + pr_debug("Suspend: skip %s queue id %i\n", + is_new ? "new" : "destroyed", + q->properties.queue_id); + return -EBUSY; + } + + q->properties.is_suspended = true; + if (q->properties.is_active) { + if (dqm->dev->kfd->shared_resources.enable_mes) { + int r = remove_queue_mes(dqm, q, &pdd->qpd); + + if (r) + return r; + } + + decrement_queue_count(dqm, &pdd->qpd, q); + q->properties.is_active = false; + } + + return 0; +} + +/* resume_single_queue does not lock the dqm like the functions + * restore_process_queues_cpsch or restore_process_queues_nocpsch. You should + * lock the dqm before calling, and unlock after calling. + * + * The reason we don't lock the dqm is because this function may be + * called on multiple queues in a loop, so rather than locking/unlocking + * multiple times, we will just keep the dqm locked for all of the calls. + */ +static int resume_single_queue(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + struct kfd_process_device *pdd; + + if (!q->properties.is_suspended) + return 0; + + pdd = qpd_to_pdd(qpd); + + pr_debug("Restoring from suspend PASID %u queue [%i]\n", + pdd->process->pasid, + q->properties.queue_id); + + q->properties.is_suspended = false; + + if (QUEUE_IS_ACTIVE(q->properties)) { + if (dqm->dev->kfd->shared_resources.enable_mes) { + int r = add_queue_mes(dqm, q, &pdd->qpd); + + if (r) + return r; + } + + q->properties.is_active = true; + increment_queue_count(dqm, qpd, q); + } + + return 0; +} + static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { @@ -1926,6 +2012,31 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, return map_queues_cpsch(dqm); } +static int wait_on_destroy_queue(struct device_queue_manager *dqm, + struct queue *q) +{ + struct kfd_process_device *pdd = kfd_get_process_device_data(q->device, + q->process); + int ret = 0; + + if (pdd->qpd.is_debug) + return ret; + + q->properties.is_being_destroyed = true; + + if (pdd->process->debug_trap_enabled && q->properties.is_suspended) { + dqm_unlock(dqm); + mutex_unlock(&q->process->mutex); + ret = wait_event_interruptible(dqm->destroy_wait, + !q->properties.is_suspended); + + mutex_lock(&q->process->mutex); + dqm_lock(dqm); + } + + return ret; +} + static int destroy_queue_cpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) @@ -1945,11 +2056,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, q->properties.queue_id); } - retval = 0; - /* remove queue from list to prevent rescheduling after preemption */ dqm_lock(dqm); + retval = wait_on_destroy_queue(dqm, q); + + if (retval) { + dqm_unlock(dqm); + return retval; + } + if (qpd->is_debug) { /* * error, currently we do not allow to destroy a queue @@ -1996,7 +2112,14 @@ static 
int destroy_queue_cpsch(struct device_queue_manager *dqm, dqm_unlock(dqm); - /* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */ + /* + * Do free_mqd and raise delete event after dqm_unlock(dqm) to avoid + * circular locking + */ + kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE), + qpd->pqm->process, q->device, + -1, false, NULL, 0); + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); return retval; @@ -2461,8 +2584,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) goto out_free; } - if (!dqm->ops.initialize(dqm)) + if (!dqm->ops.initialize(dqm)) { + init_waitqueue_head(&dqm->destroy_wait); return dqm; + } out_free: kfree(dqm); @@ -2602,6 +2727,320 @@ out_unlock: return r; } +#define QUEUE_NOT_FOUND -1 +/* invalidate queue operation in array */ +static void q_array_invalidate(uint32_t num_queues, uint32_t *queue_ids) +{ + int i; + + for (i = 0; i < num_queues; i++) + queue_ids[i] |= KFD_DBG_QUEUE_INVALID_MASK; +} + +/* find queue index in array */ +static int q_array_get_index(unsigned int queue_id, + uint32_t num_queues, + uint32_t *queue_ids) +{ + int i; + + for (i = 0; i < num_queues; i++) + if (queue_id == (queue_ids[i] & ~KFD_DBG_QUEUE_INVALID_MASK)) + return i; + + return QUEUE_NOT_FOUND; +} + +struct copy_context_work_handler_workarea { + struct work_struct copy_context_work; + struct kfd_process *p; +}; + +static void copy_context_work_handler (struct work_struct *work) +{ + struct copy_context_work_handler_workarea *workarea; + struct mqd_manager *mqd_mgr; + struct queue *q; + struct mm_struct *mm; + struct kfd_process *p; + uint32_t tmp_ctl_stack_used_size, tmp_save_area_used_size; + int i; + + workarea = container_of(work, + struct copy_context_work_handler_workarea, + copy_context_work); + + p = workarea->p; + mm = get_task_mm(p->lead_thread); + + if (!mm) + return; + + kthread_use_mm(mm); + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + struct device_queue_manager *dqm = pdd->dev->dqm; + struct qcm_process_device *qpd = &pdd->qpd; + + list_for_each_entry(q, &qpd->queues_list, list) { + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP]; + + /* We ignore the return value from get_wave_state + * because + * i) right now, it always returns 0, and + * ii) if we hit an error, we would continue to the + * next queue anyway. + */ + mqd_mgr->get_wave_state(mqd_mgr, + q->mqd, + &q->properties, + (void __user *) q->properties.ctx_save_restore_area_address, + &tmp_ctl_stack_used_size, + &tmp_save_area_used_size); + } + } + kthread_unuse_mm(mm); + mmput(mm); +} + +static uint32_t *get_queue_ids(uint32_t num_queues, uint32_t *usr_queue_id_array) +{ + size_t array_size = num_queues * sizeof(uint32_t); + uint32_t *queue_ids = NULL; + + if (!usr_queue_id_array) + return NULL; + + queue_ids = kzalloc(array_size, GFP_KERNEL); + if (!queue_ids) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(queue_ids, usr_queue_id_array, array_size)) + return ERR_PTR(-EFAULT); + + return queue_ids; +} + +int resume_queues(struct kfd_process *p, + uint32_t num_queues, + uint32_t *usr_queue_id_array) +{ + uint32_t *queue_ids = NULL; + int total_resumed = 0; + int i; + + if (usr_queue_id_array) { + queue_ids = get_queue_ids(num_queues, usr_queue_id_array); + + if (IS_ERR(queue_ids)) + return PTR_ERR(queue_ids); + + /* mask all queues as invalid. 
unmask per successful request */ + q_array_invalidate(num_queues, queue_ids); + } + + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + struct device_queue_manager *dqm = pdd->dev->dqm; + struct qcm_process_device *qpd = &pdd->qpd; + struct queue *q; + int r, per_device_resumed = 0; + + dqm_lock(dqm); + + /* unmask queues that resume or already resumed as valid */ + list_for_each_entry(q, &qpd->queues_list, list) { + int q_idx = QUEUE_NOT_FOUND; + + if (queue_ids) + q_idx = q_array_get_index( + q->properties.queue_id, + num_queues, + queue_ids); + + if (!queue_ids || q_idx != QUEUE_NOT_FOUND) { + int err = resume_single_queue(dqm, &pdd->qpd, q); + + if (queue_ids) { + if (!err) { + queue_ids[q_idx] &= + ~KFD_DBG_QUEUE_INVALID_MASK; + } else { + queue_ids[q_idx] |= + KFD_DBG_QUEUE_ERROR_MASK; + break; + } + } + + if (dqm->dev->kfd->shared_resources.enable_mes) { + wake_up_all(&dqm->destroy_wait); + if (!err) + total_resumed++; + } else { + per_device_resumed++; + } + } + } + + if (!per_device_resumed) { + dqm_unlock(dqm); + continue; + } + + r = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + 0, + USE_DEFAULT_GRACE_PERIOD); + if (r) { + pr_err("Failed to resume process queues\n"); + if (queue_ids) { + list_for_each_entry(q, &qpd->queues_list, list) { + int q_idx = q_array_get_index( + q->properties.queue_id, + num_queues, + queue_ids); + + /* mask queue as error on resume fail */ + if (q_idx != QUEUE_NOT_FOUND) + queue_ids[q_idx] |= + KFD_DBG_QUEUE_ERROR_MASK; + } + } + } else { + wake_up_all(&dqm->destroy_wait); + total_resumed += per_device_resumed; + } + + dqm_unlock(dqm); + } + + if (queue_ids) { + if (copy_to_user((void __user *)usr_queue_id_array, queue_ids, + num_queues * sizeof(uint32_t))) + pr_err("copy_to_user failed on queue resume\n"); + + kfree(queue_ids); + } + + return total_resumed; +} + +int suspend_queues(struct kfd_process *p, + uint32_t num_queues, + uint32_t grace_period, + uint64_t exception_clear_mask, + uint32_t *usr_queue_id_array) +{ + uint32_t *queue_ids = get_queue_ids(num_queues, usr_queue_id_array); + int total_suspended = 0; + int i; + + if (IS_ERR(queue_ids)) + return PTR_ERR(queue_ids); + + /* mask all queues as invalid. 
umask on successful request */ + q_array_invalidate(num_queues, queue_ids); + + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + struct device_queue_manager *dqm = pdd->dev->dqm; + struct qcm_process_device *qpd = &pdd->qpd; + struct queue *q; + int r, per_device_suspended = 0; + + mutex_lock(&p->event_mutex); + dqm_lock(dqm); + + /* unmask queues that suspend or already suspended */ + list_for_each_entry(q, &qpd->queues_list, list) { + int q_idx = q_array_get_index(q->properties.queue_id, + num_queues, + queue_ids); + + if (q_idx != QUEUE_NOT_FOUND) { + int err = suspend_single_queue(dqm, pdd, q); + bool is_mes = dqm->dev->kfd->shared_resources.enable_mes; + + if (!err) { + queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK; + if (exception_clear_mask && is_mes) + q->properties.exception_status &= + ~exception_clear_mask; + + if (is_mes) + total_suspended++; + else + per_device_suspended++; + } else if (err != -EBUSY) { + r = err; + queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; + break; + } + } + } + + if (!per_device_suspended) { + dqm_unlock(dqm); + mutex_unlock(&p->event_mutex); + if (total_suspended) + amdgpu_amdkfd_debug_mem_fence(dqm->dev->adev); + continue; + } + + r = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, + grace_period); + + if (r) + pr_err("Failed to suspend process queues.\n"); + else + total_suspended += per_device_suspended; + + list_for_each_entry(q, &qpd->queues_list, list) { + int q_idx = q_array_get_index(q->properties.queue_id, + num_queues, queue_ids); + + if (q_idx == QUEUE_NOT_FOUND) + continue; + + /* mask queue as error on suspend fail */ + if (r) + queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; + else if (exception_clear_mask) + q->properties.exception_status &= + ~exception_clear_mask; + } + + dqm_unlock(dqm); + mutex_unlock(&p->event_mutex); + amdgpu_device_flush_hdp(dqm->dev->adev, NULL); + } + + if (total_suspended) { + struct copy_context_work_handler_workarea copy_context_worker; + + INIT_WORK_ONSTACK( + ©_context_worker.copy_context_work, + copy_context_work_handler); + + copy_context_worker.p = p; + + schedule_work(©_context_worker.copy_context_work); + + + flush_work(©_context_worker.copy_context_work); + destroy_work_on_stack(©_context_worker.copy_context_work); + } + + if (copy_to_user((void __user *)usr_queue_id_array, queue_ids, + num_queues * sizeof(uint32_t))) + pr_err("copy_to_user failed on queue suspend\n"); + + kfree(queue_ids); + + return total_suspended; +} + int debug_lock_and_unmap(struct device_queue_manager *dqm) { int r; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index bb75d93712eb..d4e6dbffe8c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -263,6 +263,8 @@ struct device_queue_manager { uint32_t current_logical_xcc_start; uint32_t wait_times; + + wait_queue_head_t destroy_wait; }; void device_queue_manager_init_cik( @@ -290,6 +292,14 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd); int release_debug_trap_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd); +int suspend_queues(struct kfd_process *p, + uint32_t num_queues, + uint32_t grace_period, + uint64_t exception_clear_mask, + uint32_t *usr_queue_id_array); +int resume_queues(struct kfd_process *p, + uint32_t num_queues, + uint32_t *usr_queue_id_array); int debug_lock_and_unmap(struct 
device_queue_manager *dqm); int debug_map_and_unlock(struct device_queue_manager *dqm); int debug_refresh_runlist(struct device_queue_manager *dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c index a0ac4f2fe6b5..94c0fc2e57b7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -237,6 +237,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, u32 *save_area_used_size) { struct v10_compute_mqd *m; + struct kfd_context_save_area_header header; m = get_mqd(mqd); @@ -255,6 +256,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, * accessible to user mode */ + header.wave_state.control_stack_size = *ctl_stack_used_size; + header.wave_state.wave_state_size = *save_area_used_size; + + header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset; + header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset; + + if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state))) + return -EFAULT; + return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 9a9b4e853516..31fec5e70d13 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -291,7 +291,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, u32 *save_area_used_size) { struct v11_compute_mqd *m; - /*struct mqd_user_context_save_area_header header;*/ + struct kfd_context_save_area_header header; m = get_mqd(mqd); @@ -309,16 +309,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, * it's part of the context save area that is already * accessible to user mode */ -/* - header.control_stack_size = *ctl_stack_used_size; - header.wave_state_size = *save_area_used_size; + header.wave_state.control_stack_size = *ctl_stack_used_size; + header.wave_state.wave_state_size = *save_area_used_size; - header.wave_state_offset = m->cp_hqd_wg_state_offset; - header.control_stack_offset = m->cp_hqd_cntl_stack_offset; + header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset; + header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset; - if (copy_to_user(ctl_stack, &header, sizeof(header))) + if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state))) return -EFAULT; -*/ + return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 5b87c244e909..601bb9f68048 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -311,6 +311,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, u32 *save_area_used_size) { struct v9_mqd *m; + struct kfd_context_save_area_header header; /* Control stack is located one page after MQD. 
*/ void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); @@ -322,7 +323,18 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, *save_area_used_size = m->cp_hqd_wg_state_offset - m->cp_hqd_cntl_stack_size; - if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) + header.wave_state.control_stack_size = *ctl_stack_used_size; + header.wave_state.wave_state_size = *save_area_used_size; + + header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset; + header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset; + + if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state))) + return -EFAULT; + + if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset, + mqd_ctl_stack + m->cp_hqd_cntl_stack_offset, + *ctl_stack_used_size)) return -EFAULT; return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index cd2d56e5cdf0..05da43bf233a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -510,6 +510,8 @@ struct queue_properties { uint32_t doorbell_off; bool is_interop; bool is_evicted; + bool is_suspended; + bool is_being_destroyed; bool is_active; bool is_gws; uint32_t pm4_target_xcc; @@ -535,7 +537,8 @@ struct queue_properties { #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \ (q).queue_address != 0 && \ (q).queue_percent > 0 && \ - !(q).is_evicted) + !(q).is_evicted && \ + !(q).is_suspended) enum mqd_update_flag { UPDATE_FLAG_DBG_WA_ENABLE = 1, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 70852a200d8f..01ccab607a69 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -187,6 +187,7 @@ static int init_user_queue(struct process_queue_manager *pqm, /* Doorbell initialized in user space*/ q_properties->doorbell_ptr = NULL; + q_properties->exception_status = KFD_EC_MASK(EC_QUEUE_NEW); /* let DQM handle it*/ q_properties->vmid = 0;
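
Reviewer note (not part of the patch): the C sketch below illustrates, under stated assumptions, how a userspace debugger might drive the new KFD_IOC_DBG_TRAP_SUSPEND_QUEUES and KFD_IOC_DBG_TRAP_RESUME_QUEUES operations wired up in kfd_ioctl_set_debug_trap() above, and how it could interpret the per-queue status bits that suspend_queues()/resume_queues() write back into the caller's queue ID array. It assumes the uapi introduced elsewhere in this series (<linux/kfd_ioctl.h>) exposes AMDKFD_IOC_DBG_TRAP and struct kfd_ioctl_dbg_trap_args with .pid/.op plus the suspend_queues/resume_queues union members used on the kernel side; the helper names, the bit positions in the #ifndef fallback, and the error handling are illustrative assumptions, and the target process is assumed to already have debug trap enabled.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <linux/kfd_ioctl.h>

/* Assumption: the uapi header defines these status bits; the fallback values
 * (bits 30/31) mirror the kernel-side KFD_DBG_QUEUE_ERROR_MASK /
 * KFD_DBG_QUEUE_INVALID_MASK usage in suspend_queues()/resume_queues(). */
#ifndef KFD_DBG_QUEUE_ERROR_MASK
#define KFD_DBG_QUEUE_ERROR_MASK	(1u << 30)
#define KFD_DBG_QUEUE_INVALID_MASK	(1u << 31)
#endif

/* Suspend num_queues queue IDs of the target process; returns the number of
 * queues actually suspended (the ioctl return value) or -errno. On return,
 * the kernel has updated each queue_ids[] entry with INVALID/ERROR bits. */
static int dbg_suspend_queues(int kfd_fd, pid_t pid, uint32_t *queue_ids,
			      uint32_t num_queues, uint32_t grace_period,
			      uint64_t exception_clear_mask)
{
	struct kfd_ioctl_dbg_trap_args args;
	int n;

	memset(&args, 0, sizeof(args));
	args.pid = pid;
	args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
	args.suspend_queues.queue_array_ptr = (uint64_t)(uintptr_t)queue_ids;
	args.suspend_queues.num_queues = num_queues;
	args.suspend_queues.grace_period = grace_period;
	args.suspend_queues.exception_mask = exception_clear_mask;

	n = ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
	return n < 0 ? -errno : n;
}

/* Resume previously suspended queues; same return and status convention. */
static int dbg_resume_queues(int kfd_fd, pid_t pid, uint32_t *queue_ids,
			     uint32_t num_queues)
{
	struct kfd_ioctl_dbg_trap_args args;
	int n;

	memset(&args, 0, sizeof(args));
	args.pid = pid;
	args.op = KFD_IOC_DBG_TRAP_RESUME_QUEUES;
	args.resume_queues.queue_array_ptr = (uint64_t)(uintptr_t)queue_ids;
	args.resume_queues.num_queues = num_queues;

	n = ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
	return n < 0 ? -errno : n;
}

/* Decode the per-queue status written back into queue_ids[]: INVALID means
 * the queue was not acted on (unknown ID, newly created, or being destroyed);
 * ERROR means the suspend/resume attempt itself failed. */
static void report_queue_status(const uint32_t *queue_ids, uint32_t num_queues)
{
	uint32_t i;

	for (i = 0; i < num_queues; i++) {
		uint32_t qid = queue_ids[i] &
			~(KFD_DBG_QUEUE_ERROR_MASK | KFD_DBG_QUEUE_INVALID_MASK);

		if (queue_ids[i] & KFD_DBG_QUEUE_ERROR_MASK)
			fprintf(stderr, "queue %u: suspend/resume error\n", qid);
		else if (queue_ids[i] & KFD_DBG_QUEUE_INVALID_MASK)
			fprintf(stderr, "queue %u: skipped (not found, new, or being destroyed)\n",
				qid);
	}
}

One interaction worth calling out for callers of this interface: destroy_queue_cpsch() now blocks in wait_on_destroy_queue() while a queue is suspended under an active debug session, and resume_queues() wakes dqm->destroy_wait, so a debugger that suspends queues should eventually resume them; the wait is interruptible, and kfd_dbg_trap_deactivate() resumes all remaining queues as a fallback when the debug session ends.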