drm/amdkfd: Fix suspend/resume issue on Carrizo v2
When we do suspend/resume through "sudo pm-suspend" while there is HSA
activity running, we hit a HWS hang upon resume, caused by memory
read/write failures. The root cause is that on suspend we neglected to
unbind the pasid from the KFD device.

Another major change is that binding/unbinding is now performed on a
per-process basis, rather than depending on whether there are queues in
the DQM.

v2:
- free the IOMMU device if kfd_bind_processes_to_device fails in kfd_resume
- add comments to kfd_bind/unbind_processes_to/from_device
- minor cleanups

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent b8935a7c4b
commit 733fa1f742
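Before the hunks, a rough standalone sketch of the new per-process binding
state machine may help. The enum values mirror kfd_pdd_bound from the patch
below; the pdd_model struct, the model_suspend/model_resume helpers, and
main() are hypothetical illustrations only and do not touch the real AMD
IOMMU API:

#include <stdio.h>

/* Hypothetical standalone model of the new binding states. The enum values
 * mirror kfd_pdd_bound from the patch below; the struct and helpers are
 * illustrative only and do not call the real AMD IOMMU API.
 */
enum kfd_pdd_bound {
	PDD_UNBOUND = 0,
	PDD_BOUND,
	PDD_BOUND_SUSPENDED,
};

struct pdd_model {
	enum kfd_pdd_bound bound;
};

/* Suspend: a bound pasid is unbound from the IOMMU but remembered as
 * suspended, so resume knows it must be re-bound.
 */
static void model_suspend(struct pdd_model *pdd)
{
	if (pdd->bound == PDD_BOUND)
		pdd->bound = PDD_BOUND_SUSPENDED;
}

/* Resume: only pasids that suspend demoted are bound again. */
static void model_resume(struct pdd_model *pdd)
{
	if (pdd->bound == PDD_BOUND_SUSPENDED)
		pdd->bound = PDD_BOUND;
}

int main(void)
{
	struct pdd_model pdd = { .bound = PDD_BOUND };

	model_suspend(&pdd);
	printf("after suspend: %d (PDD_BOUND_SUSPENDED)\n", pdd.bound);
	model_resume(&pdd);
	printf("after resume:  %d (PDD_BOUND)\n", pdd.bound);
	return 0;
}

In the driver itself, kgd2kfd_suspend() does the demotion through
kfd_unbind_processes_from_device() after stopping the DQM, and kfd_resume()
restores the bindings through kfd_bind_processes_to_device() before
restarting it, as the hunks below show.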
@@ -184,7 +184,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 	struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
 
 	if (dev)
-		kfd_unbind_process_from_device(dev, pasid);
+		kfd_process_iommu_unbind_callback(dev, pasid);
 }
 
 /*
@@ -332,12 +332,16 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
-	if (kfd->init_complete) {
-		kfd->dqm->ops.stop(kfd->dqm);
-		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
-		amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
-		amd_iommu_free_device(kfd->pdev);
-	}
+	if (!kfd->init_complete)
+		return;
+
+	kfd->dqm->ops.stop(kfd->dqm);
+
+	kfd_unbind_processes_from_device(kfd);
+
+	amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
+	amd_iommu_free_device(kfd->pdev);
 }
 
 int kgd2kfd_resume(struct kfd_dev *kfd)
@@ -362,6 +366,10 @@ static int kfd_resume(struct kfd_dev *kfd)
 	amd_iommu_set_invalid_ppr_cb(kfd->pdev,
 				     iommu_invalid_ppr_cb);
 
+	err = kfd_bind_processes_to_device(kfd);
+	if (err)
+		goto processes_bind_error;
+
 	err = kfd->dqm->ops.start(kfd->dqm);
 	if (err) {
 		dev_err(kfd_device,
@@ -373,6 +381,7 @@ static int kfd_resume(struct kfd_dev *kfd)
 	return err;
 
 dqm_start_error:
+processes_bind_error:
 	amd_iommu_free_device(kfd->pdev);
 
 	return err;
@@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 
 static int start_cpsch(struct device_queue_manager *dqm)
 {
-	struct device_process_node *node;
 	int retval;
 
 	retval = 0;
@@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 	init_interrupts(dqm);
 
-	list_for_each_entry(node, &dqm->queues, list)
-		if (node->qpd->pqm->process && dqm->dev)
-			kfd_bind_process_to_device(dqm->dev,
-					node->qpd->pqm->process);
-
 	execute_queues_cpsch(dqm, true);
 
 	return 0;
@@ -714,15 +708,8 @@ fail_packet_manager_init:
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
-	struct device_process_node *node;
-	struct kfd_process_device *pdd;
-
 	destroy_queues_cpsch(dqm, true, true);
 
-	list_for_each_entry(node, &dqm->queues, list) {
-		pdd = qpd_to_pdd(node->qpd);
-		pdd->bound = false;
-	}
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
 	pm_uninit(&dqm->packets);
 
@@ -432,6 +432,13 @@ struct qcm_process_device {
 	uint32_t sh_hidden_private_base;
 };
 
+
+enum kfd_pdd_bound {
+	PDD_UNBOUND = 0,
+	PDD_BOUND,
+	PDD_BOUND_SUSPENDED,
+};
+
 /* Data that is per-process-per device. */
 struct kfd_process_device {
 	/*
@@ -456,7 +463,7 @@ struct kfd_process_device {
 	uint64_t scratch_limit;
 
 	/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
-	bool bound;
+	enum kfd_pdd_bound bound;
 
 	/* This flag tells if we should reset all
 	 * wavefronts on process termination
@@ -547,8 +554,10 @@ struct kfd_process *kfd_get_process(const struct task_struct *);
 struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
 
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
-						struct kfd_process *p);
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
+							struct kfd_process *p);
+int kfd_bind_processes_to_device(struct kfd_dev *dev);
+void kfd_unbind_processes_from_device(struct kfd_dev *dev);
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid);
 struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p);
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
@@ -174,9 +174,10 @@ static void kfd_process_wq_release(struct work_struct *work)
 		if (pdd->reset_wavefronts)
 			dbgdev_wave_reset_wavefronts(pdd->dev, p);
 
-		amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
-
-		list_del(&pdd->per_device_list);
+		if (pdd->bound == PDD_BOUND)
+			amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
+
+		list_del(&pdd->per_device_list);
 		kfree(pdd);
 	}
 
@@ -351,9 +352,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 
 	list_for_each_entry(pdd, &p->per_device_data, per_device_list)
 		if (pdd->dev == dev)
-			break;
+			return pdd;
 
-	return pdd;
+	return NULL;
 }
 
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
@@ -368,6 +369,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 		INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
 		pdd->qpd.dqm = dev->dqm;
 		pdd->reset_wavefronts = false;
+		pdd->bound = PDD_UNBOUND;
 		list_add(&pdd->per_device_list, &p->per_device_data);
 	}
 
@@ -393,19 +395,91 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (pdd->bound)
+	if (pdd->bound == PDD_BOUND) {
 		return pdd;
+	} else if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) {
+		pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n");
+		return ERR_PTR(-EINVAL);
+	}
 
 	err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
 	if (err < 0)
 		return ERR_PTR(err);
 
-	pdd->bound = true;
+	pdd->bound = PDD_BOUND;
 
 	return pdd;
 }
 
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
+/*
+ * Bind processes to the device that have been temporarily unbound
+ * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device.
+ */
+int kfd_bind_processes_to_device(struct kfd_dev *dev)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	unsigned int temp;
+	int err = 0;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		mutex_lock(&p->mutex);
+		pdd = kfd_get_process_device_data(dev, p);
+		if (pdd->bound != PDD_BOUND_SUSPENDED) {
+			mutex_unlock(&p->mutex);
+			continue;
+		}
+
+		err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
+				p->lead_thread);
+		if (err < 0) {
+			pr_err("unexpected pasid %d binding failure\n",
+					p->pasid);
+			mutex_unlock(&p->mutex);
+			break;
+		}
+
+		pdd->bound = PDD_BOUND;
+		mutex_unlock(&p->mutex);
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+
+	return err;
+}
+
+/*
+ * Temporarily unbind currently bound processes from the device and
+ * mark them as PDD_BOUND_SUSPENDED. These processes will be restored
+ * to PDD_BOUND state in kfd_bind_processes_to_device.
+ */
+void kfd_unbind_processes_from_device(struct kfd_dev *dev)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	unsigned int temp, temp_bound, temp_pasid;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		mutex_lock(&p->mutex);
+		pdd = kfd_get_process_device_data(dev, p);
+		temp_bound = pdd->bound;
+		temp_pasid = p->pasid;
+		if (pdd->bound == PDD_BOUND)
+			pdd->bound = PDD_BOUND_SUSPENDED;
+		mutex_unlock(&p->mutex);
+
+		if (temp_bound == PDD_BOUND)
+			amd_iommu_unbind_pasid(dev->pdev, temp_pasid);
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+}
+
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
 {
 	struct kfd_process *p;
 	struct kfd_process_device *pdd;
@@ -438,15 +512,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
 		pdd->reset_wavefronts = false;
 	}
 
-	/*
-	 * Just mark pdd as unbound, because we still need it
-	 * to call amd_iommu_unbind_pasid() in when the
-	 * process exits.
-	 * We don't call amd_iommu_unbind_pasid() here
-	 * because the IOMMU called us.
-	 */
-	pdd->bound = false;
-
 	mutex_unlock(&p->mutex);
 }
 