drm/amdkfd: Handle VM faults in KFD
1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address per-vmid. amdkfd needs to get the information from amdgpu through the new get_vm_fault_info interface. On GFX9 and later, all the required information is in the IH ring. 2. amdkfd unmaps all queues from the faulting process and creates a new run-list without the guilty process. 3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY. Signed-off-by: shaoyun liu <shaoyun.liu@amd.com> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
This commit is contained in:
parent
b97dfa27ef
commit
2640c3facb
|
@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
|
|||
return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
|
||||
ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
|
||||
ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
|
||||
ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
|
||||
ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
|
||||
ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
|
||||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
|
||||
}
|
||||
|
||||
static void cik_event_interrupt_wq(struct kfd_dev *dev,
|
||||
const uint32_t *ih_ring_entry)
|
||||
{
|
||||
unsigned int pasid;
|
||||
const struct cik_ih_ring_entry *ihre =
|
||||
(const struct cik_ih_ring_entry *)ih_ring_entry;
|
||||
uint32_t context_id = ihre->data & 0xfffffff;
|
||||
|
||||
pasid = (ihre->ring_id & 0xffff0000) >> 16;
|
||||
unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
|
||||
unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;
|
||||
|
||||
if (pasid == 0)
|
||||
return;
|
||||
|
@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
|
|||
kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
|
||||
else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
|
||||
kfd_signal_hw_exception_event(pasid);
|
||||
else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
|
||||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
|
||||
struct kfd_vm_fault_info info;
|
||||
|
||||
kfd_process_vm_fault(dev->dqm, pasid);
|
||||
|
||||
memset(&info, 0, sizeof(info));
|
||||
dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
|
||||
if (!info.page_addr && !info.status)
|
||||
return;
|
||||
|
||||
if (info.vmid == vmid)
|
||||
kfd_signal_vm_fault_event(dev, pasid, &info);
|
||||
else
|
||||
kfd_signal_vm_fault_event(dev, pasid, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
const struct kfd_event_interrupt_class event_interrupt_class_cik = {
|
||||
|
|
|
@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
|
|||
#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
|
||||
#define CIK_INTSRC_SDMA_TRAP 0xE0
|
||||
#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
|
||||
#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
|
||||
#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
|
|||
kfree(dqm);
|
||||
}
|
||||
|
||||
int kfd_process_vm_fault(struct device_queue_manager *dqm,
|
||||
unsigned int pasid)
|
||||
{
|
||||
struct kfd_process_device *pdd;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
int ret = 0;
|
||||
|
||||
if (!p)
|
||||
return -EINVAL;
|
||||
pdd = kfd_get_process_device_data(dqm->dev, p);
|
||||
if (pdd)
|
||||
ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
|
||||
kfd_unref_process(p);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
|
||||
static void seq_reg_dump(struct seq_file *m,
|
||||
|
|
|
@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
|
|||
mutex_unlock(&p->event_mutex);
|
||||
kfd_unref_process(p);
|
||||
}
|
||||
|
||||
void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
|
||||
struct kfd_vm_fault_info *info)
|
||||
{
|
||||
struct kfd_event *ev;
|
||||
uint32_t id;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
struct kfd_hsa_memory_exception_data memory_exception_data;
|
||||
|
||||
if (!p)
|
||||
return; /* Presumably process exited. */
|
||||
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
|
||||
memory_exception_data.gpu_id = dev->id;
|
||||
memory_exception_data.failure.imprecise = 1;
|
||||
/* Set failure reason */
|
||||
if (info) {
|
||||
memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
|
||||
memory_exception_data.failure.NotPresent =
|
||||
info->prot_valid ? 1 : 0;
|
||||
memory_exception_data.failure.NoExecute =
|
||||
info->prot_exec ? 1 : 0;
|
||||
memory_exception_data.failure.ReadOnly =
|
||||
info->prot_write ? 1 : 0;
|
||||
memory_exception_data.failure.imprecise = 0;
|
||||
}
|
||||
mutex_lock(&p->event_mutex);
|
||||
|
||||
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
|
||||
idr_for_each_entry_continue(&p->event_idr, ev, id)
|
||||
if (ev->type == KFD_EVENT_TYPE_MEMORY) {
|
||||
ev->memory_exception_data = memory_exception_data;
|
||||
set_event(ev);
|
||||
}
|
||||
|
||||
mutex_unlock(&p->event_mutex);
|
||||
kfd_unref_process(p);
|
||||
}
|
||||
|
|
|
@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
|
|||
return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
|
||||
source_id == SOC15_INTSRC_SDMA_TRAP ||
|
||||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
|
||||
source_id == SOC15_INTSRC_CP_BAD_OPCODE;
|
||||
source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
|
||||
client_id == SOC15_IH_CLIENTID_VMC ||
|
||||
client_id == SOC15_IH_CLIENTID_UTCL2;
|
||||
}
|
||||
|
||||
static void event_interrupt_wq_v9(struct kfd_dev *dev,
|
||||
|
@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
|
|||
kfd_signal_hw_exception_event(pasid);
|
||||
else if (client_id == SOC15_IH_CLIENTID_VMC ||
|
||||
client_id == SOC15_IH_CLIENTID_UTCL2) {
|
||||
/* TODO */
|
||||
struct kfd_vm_fault_info info = {0};
|
||||
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
|
||||
|
||||
info.vmid = vmid;
|
||||
info.mc_id = client_id;
|
||||
info.page_addr = ih_ring_entry[4] |
|
||||
(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
|
||||
info.prot_valid = ring_id & 0x08;
|
||||
info.prot_read = ring_id & 0x10;
|
||||
info.prot_write = ring_id & 0x20;
|
||||
|
||||
kfd_process_vm_fault(dev->dqm, pasid);
|
||||
kfd_signal_vm_fault_event(dev, pasid, &info);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
|
|||
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
|
||||
enum kfd_queue_type type);
|
||||
void kernel_queue_uninit(struct kernel_queue *kq);
|
||||
int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
|
||||
|
||||
/* Process Queue Manager */
|
||||
struct process_queue_node {
|
||||
|
@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
|
|||
uint64_t *event_page_offset, uint32_t *event_slot_index);
|
||||
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
|
||||
|
||||
void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
|
||||
struct kfd_vm_fault_info *info);
|
||||
|
||||
void kfd_flush_tlb(struct kfd_process_device *pdd);
|
||||
|
||||
int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
|
||||
|
|
|
@ -219,7 +219,7 @@ struct kfd_memory_exception_failure {
|
|||
__u32 NotPresent; /* Page not present or supervisor privilege */
|
||||
__u32 ReadOnly; /* Write access to a read-only page */
|
||||
__u32 NoExecute; /* Execute access to a page marked NX */
|
||||
__u32 pad;
|
||||
__u32 imprecise; /* Can't determine the exact fault address */
|
||||
};
|
||||
|
||||
/* memory exception data*/
|
||||
|
|
Loading…
Reference in New Issue