From 66b783b4465de05ce6b370b1ae97e95b0fc14a34 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Fri, 27 Oct 2017 19:35:18 -0400 Subject: [PATCH 01/26] drm/amdkfd: Add SDMA trap src id to the KFD isr wanted list This enables SDMA signalling with event interrupt. Signed-off-by: Besar Wicaksono Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 3 +++ drivers/gpu/drm/amd/amdkfd/cik_int.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 211fc48697fa..66164aa67c66 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -36,6 +36,7 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, /* Do not process in ISR, just request it to be forwarded to WQ. */ return (pasid != 0) && (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || + ihre->source_id == CIK_INTSRC_SDMA_TRAP || ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); } @@ -54,6 +55,8 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) kfd_signal_event_interrupt(pasid, 0, 0); + else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) + kfd_signal_event_interrupt(pasid, 0, 0); else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h index 79a16d24c1b8..109298b9d507 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_int.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h @@ -32,9 +32,10 @@ struct cik_ih_ring_entry { uint32_t reserved; }; -#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 +#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 +#define CIK_INTSRC_SDMA_TRAP 0xE0 #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF #endif From 9b56bb115460cee92a80bf85232b4b7da2f080e6 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:19 -0400 Subject: [PATCH 02/26] drm/amdkfd: Don't dereference kfd_process.mm The kfd_process doesn't own a reference to the mm_struct, so it can disappear without warning even while the kfd_process still exists. Therefore, avoid dereferencing the kfd_process.mm pointer and make it opaque. Use get_task_mm to get a temporary reference to the mm when it's needed. v2: removed unnecessary WARN_ON Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 19 +++++++++++++++---- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 7 ++++++- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 - 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 944abfad39c1..61ce5476c055 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -24,8 +24,8 @@ #include #include #include +#include #include -#include #include #include #include "kfd_priv.h" @@ -904,14 +904,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, * running so the lookup function returns a locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct mm_struct *mm; if (!p) return; /* Presumably process exited. */ + /* Take a safe reference to the mm_struct, which may otherwise + * disappear even while the kfd_process is still referenced. + */ + mm = get_task_mm(p->lead_thread); + if (!mm) { + mutex_unlock(&p->mutex); + return; /* Process is exiting */ + } + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); - down_read(&p->mm->mmap_sem); - vma = find_vma(p->mm, address); + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); memory_exception_data.gpu_id = dev->id; memory_exception_data.va = address; @@ -937,7 +947,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, } } - up_read(&p->mm->mmap_sem); + up_read(&mm->mmap_sem); + mmput(mm); mutex_lock(&p->event_mutex); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 7d86ec9790d3..1a483a7ecd4e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -494,7 +494,12 @@ struct kfd_process { */ struct hlist_node kfd_processes; - struct mm_struct *mm; + /* + * Opaque pointer to mm_struct. We don't hold a reference to + * it so it should never be dereferenced from here. This is + * only used for looking up processes by their mm. + */ + void *mm; struct mutex mutex; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 3ccb3b53216e..695fa2ae8e5b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -200,7 +200,6 @@ static void kfd_process_destroy_delayed(struct rcu_head *rcu) struct kfd_process *p; p = container_of(rcu, struct kfd_process, rcu); - WARN_ON(atomic_read(&p->mm->mm_count) <= 0); mmdrop(p->mm); From 1f9d09becb9aa6ddc67a44391f05cb96bfab80df Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Fri, 27 Oct 2017 19:35:20 -0400 Subject: [PATCH 03/26] drm/amdkfd: Short cut for kfd_wait_on_events without waiting If kfd_wait_on_events can return immediately, we don't need to populate the wait list and don't need to enter the sleep-loop. Signed-off-by: Sean Keely Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 43 ++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 61ce5476c055..f3d88c833135 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -617,7 +617,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) return event_waiters; } -static int init_event_waiter(struct kfd_process *p, +static int init_event_waiter_get_status(struct kfd_process *p, struct kfd_event_waiter *waiter, uint32_t event_id, uint32_t input_index) @@ -632,11 +632,20 @@ static int init_event_waiter(struct kfd_process *p, waiter->activated = ev->signaled; ev->signaled = ev->signaled && !ev->auto_reset; - list_add(&waiter->waiters, &ev->waiters); - return 0; } +static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) +{ + struct kfd_event *ev = waiter->event; + + /* Only add to the wait list if we actually need to + * wait on this event. + */ + if (!waiter->activated) + list_add(&waiter->waiters, &ev->waiters); +} + static bool test_event_condition(bool all, uint32_t num_events, struct kfd_event_waiter *event_waiters) { @@ -724,11 +733,17 @@ int kfd_wait_on_events(struct kfd_process *p, (struct kfd_event_data __user *) data; uint32_t i; int ret = 0; + struct kfd_event_waiter *event_waiters = NULL; long timeout = user_timeout_to_jiffies(user_timeout_ms); mutex_lock(&p->event_mutex); + /* Set to something unreasonable - this is really + * just a bool for now. + */ + *wait_result = KFD_WAIT_TIMEOUT; + event_waiters = alloc_event_waiters(num_events); if (!event_waiters) { ret = -ENOMEM; @@ -744,14 +759,34 @@ int kfd_wait_on_events(struct kfd_process *p, goto fail; } - ret = init_event_waiter(p, &event_waiters[i], + ret = init_event_waiter_get_status(p, &event_waiters[i], event_data.event_id, i); if (ret) goto fail; } + /* Check condition once. */ + if (test_event_condition(all, num_events, event_waiters)) { + if (copy_signaled_event_data(num_events, + event_waiters, events)) + *wait_result = KFD_WAIT_COMPLETE; + else + *wait_result = KFD_WAIT_ERROR; + free_waiters(num_events, event_waiters); + } else { + /* Add to wait lists if we need to wait. */ + for (i = 0; i < num_events; i++) + init_event_waiter_add_to_waitlist(&event_waiters[i]); + } + mutex_unlock(&p->event_mutex); + /* Return if all waits were already satisfied. */ + if (*wait_result != KFD_WAIT_TIMEOUT) { + __set_current_state(TASK_RUNNING); + return ret; + } + while (true) { if (fatal_signal_pending(current)) { ret = -EINTR; From d9aeec4cbb58599008e6dd4cc23f5bfbdbd0f4ff Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Fri, 27 Oct 2017 19:35:21 -0400 Subject: [PATCH 04/26] drm/amdkfd: Fix scheduler race in kfd_wait_on_events sleep loop Signed-off-by: Sean Keely Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index f3d88c833135..1efd6a8c2a40 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -806,6 +806,17 @@ int kfd_wait_on_events(struct kfd_process *p, break; } + /* Set task state to interruptible sleep before + * checking wake-up conditions. A concurrent wake-up + * will put the task back into runnable state. In that + * case schedule_timeout will not put the task to + * sleep and we'll get a chance to re-check the + * updated conditions almost immediately. Otherwise, + * this race condition would lead to a soft hang or a + * very long sleep. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (test_event_condition(all, num_events, event_waiters)) { if (copy_signaled_event_data(num_events, event_waiters, events)) @@ -820,7 +831,7 @@ int kfd_wait_on_events(struct kfd_process *p, break; } - timeout = schedule_timeout_interruptible(timeout); + timeout = schedule_timeout(timeout); } __set_current_state(TASK_RUNNING); From fdf0c8332a0309ac619e22e82b6014c77b2a3518 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:22 -0400 Subject: [PATCH 05/26] drm/amdkfd: Clean up kfd_wait_on_events Cleaned up the code while resolving some potential bugs and inconsistencies in the process. Clean-ups: * Remove enum kfd_event_wait_result, which duplicates KFD_IOC_EVENT_RESULT definitions * alloc_event_waiters can be called without holding p->event_mutex * Return an error code from copy_signaled_event_data instead of bool * Clean up error handling code paths to minimize duplication in kfd_wait_on_events Fixes: * Consistently return an error code from kfd_wait_on_events and set wait_result to KFD_IOC_WAIT_RESULT_FAIL in all failure cases. * Always call free_waiters while holding p->event_mutex * copy_signaled_event_data might sleep. Don't call it while the task state is TASK_INTERRUPTIBLE. Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 71 ++++++++++-------------- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 8 +-- 3 files changed, 32 insertions(+), 52 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 0ef82b229754..a25321ff448f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -835,15 +835,12 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_wait_events_args *args = data; - enum kfd_event_wait_result wait_result; int err; err = kfd_wait_on_events(p, args->num_events, (void __user *)args->events_ptr, (args->wait_for_all != 0), - args->timeout, &wait_result); - - args->wait_result = wait_result; + args->timeout, &args->wait_result); return err; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 1efd6a8c2a40..33cafbb96520 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -668,7 +668,7 @@ static bool test_event_condition(bool all, uint32_t num_events, * Copy event specific data, if defined. * Currently only memory exception events have additional data to copy to user */ -static bool copy_signaled_event_data(uint32_t num_events, +static int copy_signaled_event_data(uint32_t num_events, struct kfd_event_waiter *event_waiters, struct kfd_event_data __user *data) { @@ -686,11 +686,11 @@ static bool copy_signaled_event_data(uint32_t num_events, src = &event->memory_exception_data; if (copy_to_user(dst, src, sizeof(struct kfd_hsa_memory_exception_data))) - return false; + return -EFAULT; } } - return true; + return 0; } @@ -727,7 +727,7 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters) int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, - enum kfd_event_wait_result *wait_result) + uint32_t *wait_result) { struct kfd_event_data __user *events = (struct kfd_event_data __user *) data; @@ -737,18 +737,18 @@ int kfd_wait_on_events(struct kfd_process *p, struct kfd_event_waiter *event_waiters = NULL; long timeout = user_timeout_to_jiffies(user_timeout_ms); + event_waiters = alloc_event_waiters(num_events); + if (!event_waiters) { + ret = -ENOMEM; + goto out; + } + mutex_lock(&p->event_mutex); /* Set to something unreasonable - this is really * just a bool for now. */ - *wait_result = KFD_WAIT_TIMEOUT; - - event_waiters = alloc_event_waiters(num_events); - if (!event_waiters) { - ret = -ENOMEM; - goto fail; - } + *wait_result = KFD_IOC_WAIT_RESULT_TIMEOUT; for (i = 0; i < num_events; i++) { struct kfd_event_data event_data; @@ -756,23 +756,21 @@ int kfd_wait_on_events(struct kfd_process *p, if (copy_from_user(&event_data, &events[i], sizeof(struct kfd_event_data))) { ret = -EFAULT; - goto fail; + goto out_unlock; } ret = init_event_waiter_get_status(p, &event_waiters[i], event_data.event_id, i); if (ret) - goto fail; + goto out_unlock; } /* Check condition once. */ if (test_event_condition(all, num_events, event_waiters)) { - if (copy_signaled_event_data(num_events, - event_waiters, events)) - *wait_result = KFD_WAIT_COMPLETE; - else - *wait_result = KFD_WAIT_ERROR; - free_waiters(num_events, event_waiters); + *wait_result = KFD_IOC_WAIT_RESULT_COMPLETE; + ret = copy_signaled_event_data(num_events, + event_waiters, events); + goto out_unlock; } else { /* Add to wait lists if we need to wait. */ for (i = 0; i < num_events; i++) @@ -781,12 +779,6 @@ int kfd_wait_on_events(struct kfd_process *p, mutex_unlock(&p->event_mutex); - /* Return if all waits were already satisfied. */ - if (*wait_result != KFD_WAIT_TIMEOUT) { - __set_current_state(TASK_RUNNING); - return ret; - } - while (true) { if (fatal_signal_pending(current)) { ret = -EINTR; @@ -818,16 +810,12 @@ int kfd_wait_on_events(struct kfd_process *p, set_current_state(TASK_INTERRUPTIBLE); if (test_event_condition(all, num_events, event_waiters)) { - if (copy_signaled_event_data(num_events, - event_waiters, events)) - *wait_result = KFD_WAIT_COMPLETE; - else - *wait_result = KFD_WAIT_ERROR; + *wait_result = KFD_IOC_WAIT_RESULT_COMPLETE; break; } if (timeout <= 0) { - *wait_result = KFD_WAIT_TIMEOUT; + *wait_result = KFD_IOC_WAIT_RESULT_TIMEOUT; break; } @@ -835,19 +823,20 @@ int kfd_wait_on_events(struct kfd_process *p, } __set_current_state(TASK_RUNNING); + /* copy_signaled_event_data may sleep. So this has to happen + * after the task state is set back to RUNNING. + */ + if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) + ret = copy_signaled_event_data(num_events, + event_waiters, events); + mutex_lock(&p->event_mutex); +out_unlock: free_waiters(num_events, event_waiters); mutex_unlock(&p->event_mutex); - - return ret; - -fail: - if (event_waiters) - free_waiters(num_events, event_waiters); - - mutex_unlock(&p->event_mutex); - - *wait_result = KFD_WAIT_ERROR; +out: + if (ret) + *wait_result = KFD_IOC_WAIT_RESULT_FAIL; return ret; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 1a483a7ecd4e..d3cf53a50ced 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -726,19 +726,13 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd); extern const struct kfd_event_interrupt_class event_interrupt_class_cik; extern const struct kfd_device_global_init_class device_global_init_class_cik; -enum kfd_event_wait_result { - KFD_WAIT_COMPLETE, - KFD_WAIT_TIMEOUT, - KFD_WAIT_ERROR -}; - void kfd_event_init_process(struct kfd_process *p); void kfd_event_free_process(struct kfd_process *p); int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma); int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, - enum kfd_event_wait_result *wait_result); + uint32_t *wait_result); void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits); void kfd_signal_iommu_event(struct kfd_dev *dev, From fe528c13acc764965929b7fcb5fadf2c15b57373 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:23 -0400 Subject: [PATCH 06/26] drm/amdkfd: Fix event destruction with pending waiters When an event with pending waiters is destroyed, those waiters may end up sleeping forever unless they are notified and woken up. Implement the notification by clearing the waiter->event pointer, which becomes invalid anyway, when the event is freed, and waking up the waiting tasks. Waiters on an event that's destroyed return failure. Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 72 ++++++++++++++++--------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 33cafbb96520..2f0fe124ce08 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -345,18 +345,24 @@ void kfd_event_init_process(struct kfd_process *p) static void destroy_event(struct kfd_process *p, struct kfd_event *ev) { + /* Wake up pending waiters. They will return failure */ + while (!list_empty(&ev->waiters)) { + struct kfd_event_waiter *waiter = + list_first_entry(&ev->waiters, struct kfd_event_waiter, + waiters); + + waiter->event = NULL; + /* _init because free_waiters will call list_del */ + list_del_init(&waiter->waiters); + wake_up_process(waiter->sleeping_task); + } + if (ev->signal_page) { release_event_notification_slot(ev->signal_page, ev->signal_slot_index); p->signal_event_count--; } - /* - * Abandon the list of waiters. Individual waiting threads will - * clean up their own data. - */ - list_del(&ev->waiters); - hash_del(&ev->events); kfree(ev); } @@ -646,22 +652,36 @@ static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) list_add(&waiter->waiters, &ev->waiters); } -static bool test_event_condition(bool all, uint32_t num_events, +/* test_event_condition - Test condition of events being waited for + * @all: Return completion only if all events have signaled + * @num_events: Number of events to wait for + * @event_waiters: Array of event waiters, one per event + * + * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have + * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all) + * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of + * the events have been destroyed. + */ +static uint32_t test_event_condition(bool all, uint32_t num_events, struct kfd_event_waiter *event_waiters) { uint32_t i; uint32_t activated_count = 0; for (i = 0; i < num_events; i++) { + if (!event_waiters[i].event) + return KFD_IOC_WAIT_RESULT_FAIL; + if (event_waiters[i].activated) { if (!all) - return true; + return KFD_IOC_WAIT_RESULT_COMPLETE; activated_count++; } } - return activated_count == num_events; + return activated_count == num_events ? + KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT; } /* @@ -745,11 +765,6 @@ int kfd_wait_on_events(struct kfd_process *p, mutex_lock(&p->event_mutex); - /* Set to something unreasonable - this is really - * just a bool for now. - */ - *wait_result = KFD_IOC_WAIT_RESULT_TIMEOUT; - for (i = 0; i < num_events; i++) { struct kfd_event_data event_data; @@ -766,17 +781,22 @@ int kfd_wait_on_events(struct kfd_process *p, } /* Check condition once. */ - if (test_event_condition(all, num_events, event_waiters)) { - *wait_result = KFD_IOC_WAIT_RESULT_COMPLETE; + *wait_result = test_event_condition(all, num_events, event_waiters); + if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) { ret = copy_signaled_event_data(num_events, event_waiters, events); goto out_unlock; - } else { - /* Add to wait lists if we need to wait. */ - for (i = 0; i < num_events; i++) - init_event_waiter_add_to_waitlist(&event_waiters[i]); + } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) { + /* This should not happen. Events shouldn't be + * destroyed while we're holding the event_mutex + */ + goto out_unlock; } + /* Add to wait lists if we need to wait. */ + for (i = 0; i < num_events; i++) + init_event_waiter_add_to_waitlist(&event_waiters[i]); + mutex_unlock(&p->event_mutex); while (true) { @@ -809,15 +829,13 @@ int kfd_wait_on_events(struct kfd_process *p, */ set_current_state(TASK_INTERRUPTIBLE); - if (test_event_condition(all, num_events, event_waiters)) { - *wait_result = KFD_IOC_WAIT_RESULT_COMPLETE; + *wait_result = test_event_condition(all, num_events, + event_waiters); + if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT) break; - } - if (timeout <= 0) { - *wait_result = KFD_IOC_WAIT_RESULT_TIMEOUT; + if (timeout <= 0) break; - } timeout = schedule_timeout(timeout); } @@ -837,6 +855,8 @@ out_unlock: out: if (ret) *wait_result = KFD_IOC_WAIT_RESULT_FAIL; + else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL) + ret = -EIO; return ret; } From ebf947fe93dd3627774f6bb4daa57b4e2897929d Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:24 -0400 Subject: [PATCH 07/26] drm/amdkfd: remove redundant kfd_event_waiter.input_index This always identical with the index of the event_waiter in the array. No need to store it in the waiter record. Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 2f0fe124ce08..b4cda9256225 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -49,7 +49,6 @@ struct kfd_event_waiter { /* Event */ struct kfd_event *event; - uint32_t input_index; }; /* @@ -625,8 +624,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) static int init_event_waiter_get_status(struct kfd_process *p, struct kfd_event_waiter *waiter, - uint32_t event_id, - uint32_t input_index) + uint32_t event_id) { struct kfd_event *ev = lookup_event_by_id(p, event_id); @@ -634,7 +632,6 @@ static int init_event_waiter_get_status(struct kfd_process *p, return -EINVAL; waiter->event = ev; - waiter->input_index = input_index; waiter->activated = ev->signaled; ev->signaled = ev->signaled && !ev->auto_reset; @@ -702,7 +699,7 @@ static int copy_signaled_event_data(uint32_t num_events, waiter = &event_waiters[i]; event = waiter->event; if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { - dst = &data[waiter->input_index].memory_exception_data; + dst = &data[i].memory_exception_data; src = &event->memory_exception_data; if (copy_to_user(dst, src, sizeof(struct kfd_hsa_memory_exception_data))) @@ -775,7 +772,7 @@ int kfd_wait_on_events(struct kfd_process *p, } ret = init_event_waiter_get_status(p, &event_waiters[i], - event_data.event_id, i); + event_data.event_id); if (ret) goto out_unlock; } From 74e4071665da243e0ff1b7a0e1b8ac7d89cdb3ca Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:25 -0400 Subject: [PATCH 08/26] drm/amdkfd: Use wait_queue_t to implement event waiting Use standard wait queues for waiting and waking up waiting threads instead of inventing our own. We still have our own wait loop because the HSA event semantics require the ability to have one thread waiting on multiple wait queues (events) at the same time. Signed-off-by: Kent Russell Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 59 +++++++++---------------- drivers/gpu/drm/amd/amdkfd/kfd_events.h | 3 +- 2 files changed, 24 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index b4cda9256225..7dae26f0e11a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -33,22 +33,12 @@ #include /* - * A task can only be on a single wait_queue at a time, but we need to support - * waiting on multiple events (any/all). - * Instead of each event simply having a wait_queue with sleeping tasks, it - * has a singly-linked list of tasks. - * A thread that wants to sleep creates an array of these, one for each event - * and adds one to each event's waiter chain. + * Wrapper around wait_queue_entry_t */ struct kfd_event_waiter { - struct list_head waiters; - struct task_struct *sleeping_task; - - /* Transitions to true when the event this belongs to is signaled. */ - bool activated; - - /* Event */ - struct kfd_event *event; + wait_queue_entry_t wait; + struct kfd_event *event; /* Event to wait for */ + bool activated; /* Becomes true when event is signaled */ }; /* @@ -344,17 +334,12 @@ void kfd_event_init_process(struct kfd_process *p) static void destroy_event(struct kfd_process *p, struct kfd_event *ev) { - /* Wake up pending waiters. They will return failure */ - while (!list_empty(&ev->waiters)) { - struct kfd_event_waiter *waiter = - list_first_entry(&ev->waiters, struct kfd_event_waiter, - waiters); + struct kfd_event_waiter *waiter; + /* Wake up pending waiters. They will return failure */ + list_for_each_entry(waiter, &ev->wq.head, wait.entry) waiter->event = NULL; - /* _init because free_waiters will call list_del */ - list_del_init(&waiter->waiters); - wake_up_process(waiter->sleeping_task); - } + wake_up_all(&ev->wq); if (ev->signal_page) { release_event_notification_slot(ev->signal_page, @@ -424,7 +409,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, ev->auto_reset = auto_reset; ev->signaled = false; - INIT_LIST_HEAD(&ev->waiters); + init_waitqueue_head(&ev->wq); *event_page_offset = 0; @@ -482,19 +467,18 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id) static void set_event(struct kfd_event *ev) { struct kfd_event_waiter *waiter; - struct kfd_event_waiter *next; - /* Auto reset if the list is non-empty and we're waking someone. */ - ev->signaled = !ev->auto_reset || list_empty(&ev->waiters); + /* Auto reset if the list is non-empty and we're waking + * someone. waitqueue_active is safe here because we're + * protected by the p->event_mutex, which is also held when + * updating the wait queues in kfd_wait_on_events. + */ + ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq); - list_for_each_entry_safe(waiter, next, &ev->waiters, waiters) { + list_for_each_entry(waiter, &ev->wq.head, wait.entry) waiter->activated = true; - /* _init because free_waiters will call list_del */ - list_del_init(&waiter->waiters); - - wake_up_process(waiter->sleeping_task); - } + wake_up_all(&ev->wq); } /* Assumes that p is current. */ @@ -614,8 +598,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) GFP_KERNEL); for (i = 0; (event_waiters) && (i < num_events) ; i++) { - INIT_LIST_HEAD(&event_waiters[i].waiters); - event_waiters[i].sleeping_task = current; + init_wait(&event_waiters[i].wait); event_waiters[i].activated = false; } @@ -646,7 +629,7 @@ static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) * wait on this event. */ if (!waiter->activated) - list_add(&waiter->waiters, &ev->waiters); + add_wait_queue(&ev->wq, &waiter->wait); } /* test_event_condition - Test condition of events being waited for @@ -736,7 +719,9 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters) uint32_t i; for (i = 0; i < num_events; i++) - list_del(&waiters[i].waiters); + if (waiters[i].event) + remove_wait_queue(&waiters[i].event->wq, + &waiters[i].wait); kfree(waiters); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index 28f6838b1f4c..96f9122805fa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "kfd_priv.h" #include @@ -56,7 +57,7 @@ struct kfd_event { int type; - struct list_head waiters; /* List of kfd_event_waiter by waiters. */ + wait_queue_head_t wq; /* List of event waiters. */ /* Only for signal events. */ struct signal_page *signal_page; From 50cb7dd94cb43a6204813376e1be1d21780b71fb Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:26 -0400 Subject: [PATCH 09/26] drm/amdkfd: Simplify events page allocator The first event page is always big enough to handle all events. Handling of multiple events pages is not supported by user mode, and not necessary. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 203 +++++++++--------------- drivers/gpu/drm/amd/amdkfd/kfd_events.h | 1 - drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 +- 3 files changed, 73 insertions(+), 135 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 7dae26f0e11a..7cc17107f448 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -41,6 +41,9 @@ struct kfd_event_waiter { bool activated; /* Becomes true when event is signaled */ }; +#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT +#define SLOT_BITMAP_LONGS BITS_TO_LONGS(SLOTS_PER_PAGE) + /* * Over-complicated pooled allocator for event notification slots. * @@ -51,132 +54,98 @@ struct kfd_event_waiter { * Individual signal events are then allocated a slot in a page. */ -struct signal_page { - struct list_head event_pages; /* kfd_process.signal_event_pages */ +struct kfd_signal_page { uint64_t *kernel_address; uint64_t __user *user_address; - uint32_t page_index; /* Index into the mmap aperture. */ unsigned int free_slots; - unsigned long used_slot_bitmap[0]; + unsigned long used_slot_bitmap[SLOT_BITMAP_LONGS]; }; -#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT -#define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) -#define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) -#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ - SLOT_BITMAP_SIZE * sizeof(long)) - /* * For signal events, the event ID is used as the interrupt user data. * For SQ s_sendmsg interrupts, this is limited to 8 bits. */ #define INTERRUPT_DATA_BITS 8 -#define SIGNAL_EVENT_ID_SLOT_SHIFT 0 -static uint64_t *page_slots(struct signal_page *page) +static uint64_t *page_slots(struct kfd_signal_page *page) { return page->kernel_address; } static bool allocate_free_slot(struct kfd_process *process, - struct signal_page **out_page, - unsigned int *out_slot_index) + unsigned int *out_slot_index) { - struct signal_page *page; + struct kfd_signal_page *page = process->signal_page; + unsigned int slot; - list_for_each_entry(page, &process->signal_event_pages, event_pages) { - if (page->free_slots > 0) { - unsigned int slot = - find_first_zero_bit(page->used_slot_bitmap, - SLOTS_PER_PAGE); + if (!page || page->free_slots == 0) { + pr_debug("No free event signal slots were found for process %p\n", + process); - __set_bit(slot, page->used_slot_bitmap); - page->free_slots--; - - page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; - - *out_page = page; - *out_slot_index = slot; - - pr_debug("Allocated event signal slot in page %p, slot %d\n", - page, slot); - - return true; - } + return false; } - pr_debug("No free event signal slots were found for process %p\n", - process); + slot = find_first_zero_bit(page->used_slot_bitmap, SLOTS_PER_PAGE); - return false; + __set_bit(slot, page->used_slot_bitmap); + page->free_slots--; + + page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; + + *out_slot_index = slot; + + pr_debug("Allocated event signal slot in page %p, slot %d\n", + page, slot); + + return true; } -#define list_tail_entry(head, type, member) \ - list_entry((head)->prev, type, member) - -static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) +static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) { void *backing_store; - struct signal_page *page; + struct kfd_signal_page *page; - page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); + page = kzalloc(sizeof(*page), GFP_KERNEL); if (!page) - goto fail_alloc_signal_page; + return NULL; page->free_slots = SLOTS_PER_PAGE; - backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, + backing_store = (void *) __get_free_pages(GFP_KERNEL, get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); if (!backing_store) goto fail_alloc_signal_store; - /* prevent user-mode info leaks */ + /* Initialize all events to unsignaled */ memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, - KFD_SIGNAL_EVENT_LIMIT * 8); + KFD_SIGNAL_EVENT_LIMIT * 8); page->kernel_address = backing_store; - - if (list_empty(&p->signal_event_pages)) - page->page_index = 0; - else - page->page_index = list_tail_entry(&p->signal_event_pages, - struct signal_page, - event_pages)->page_index + 1; - pr_debug("Allocated new event signal page at %p, for process %p\n", page, p); - pr_debug("Page index is %d\n", page->page_index); - list_add(&page->event_pages, &p->signal_event_pages); - - return true; + return page; fail_alloc_signal_store: kfree(page); -fail_alloc_signal_page: - return false; + return NULL; } -static bool allocate_event_notification_slot(struct file *devkfd, - struct kfd_process *p, - struct signal_page **page, - unsigned int *signal_slot_index) +static bool allocate_event_notification_slot(struct kfd_process *p, + unsigned int *signal_slot_index) { - bool ret; - - ret = allocate_free_slot(p, page, signal_slot_index); - if (!ret) { - ret = allocate_signal_page(devkfd, p); - if (ret) - ret = allocate_free_slot(p, page, signal_slot_index); + if (!p->signal_page) { + p->signal_page = allocate_signal_page(p); + if (!p->signal_page) + return false; } - return ret; + return allocate_free_slot(p, signal_slot_index); } /* Assumes that the process's event_mutex is locked. */ -static void release_event_notification_slot(struct signal_page *page, +static void release_event_notification_slot(struct kfd_signal_page *page, size_t slot_index) { __clear_bit(slot_index, page->used_slot_bitmap); @@ -187,22 +156,6 @@ static void release_event_notification_slot(struct signal_page *page, */ } -static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, - unsigned int page_index) -{ - struct signal_page *page; - - /* - * This is safe because we don't delete signal pages until the - * process exits. - */ - list_for_each_entry(page, &p->signal_event_pages, event_pages) - if (page->page_index == page_index) - return page; - - return NULL; -} - /* * Assumes that p->event_mutex is held and of course that p is not going * away (current or locked). @@ -218,13 +171,6 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) return NULL; } -static u32 make_signal_event_id(struct signal_page *page, - unsigned int signal_slot_index) -{ - return page->page_index | - (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); -} - /* * Produce a kfd event id for a nonsignal event. * These are arbitrary numbers, so we do a sequential search through @@ -270,10 +216,9 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) } static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p, - struct signal_page *page, unsigned int signal_slot) { - return lookup_event_by_id(p, make_signal_event_id(page, signal_slot)); + return lookup_event_by_id(p, signal_slot); } static int create_signal_event(struct file *devkfd, @@ -288,8 +233,7 @@ static int create_signal_event(struct file *devkfd, return -ENOMEM; } - if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, - &ev->signal_slot_index)) { + if (!allocate_event_notification_slot(p, &ev->signal_slot_index)) { pr_warn("Signal event wasn't created because out of kernel memory\n"); return -ENOMEM; } @@ -297,10 +241,9 @@ static int create_signal_event(struct file *devkfd, p->signal_event_count++; ev->user_signal_address = - &ev->signal_page->user_address[ev->signal_slot_index]; + &p->signal_page->user_address[ev->signal_slot_index]; - ev->event_id = make_signal_event_id(ev->signal_page, - ev->signal_slot_index); + ev->event_id = ev->signal_slot_index; pr_debug("Signal event number %zu created with id %d, address %p\n", p->signal_event_count, ev->event_id, @@ -327,7 +270,7 @@ void kfd_event_init_process(struct kfd_process *p) { mutex_init(&p->event_mutex); hash_init(p->events); - INIT_LIST_HEAD(&p->signal_event_pages); + p->signal_page = NULL; p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; p->signal_event_count = 0; } @@ -341,8 +284,9 @@ static void destroy_event(struct kfd_process *p, struct kfd_event *ev) waiter->event = NULL; wake_up_all(&ev->wq); - if (ev->signal_page) { - release_event_notification_slot(ev->signal_page, + if ((ev->type == KFD_EVENT_TYPE_SIGNAL || + ev->type == KFD_EVENT_TYPE_DEBUG) && p->signal_page) { + release_event_notification_slot(p->signal_page, ev->signal_slot_index); p->signal_event_count--; } @@ -365,12 +309,11 @@ static void destroy_events(struct kfd_process *p) * We assume that the process is being destroyed and there is no need to * unmap the pages or keep bookkeeping data in order. */ -static void shutdown_signal_pages(struct kfd_process *p) +static void shutdown_signal_page(struct kfd_process *p) { - struct signal_page *page, *tmp; + struct kfd_signal_page *page = p->signal_page; - list_for_each_entry_safe(page, tmp, &p->signal_event_pages, - event_pages) { + if (page) { free_pages((unsigned long)page->kernel_address, get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); kfree(page); @@ -380,7 +323,7 @@ static void shutdown_signal_pages(struct kfd_process *p) void kfd_event_free_process(struct kfd_process *p) { destroy_events(p); - shutdown_signal_pages(p); + shutdown_signal_page(p); } static bool event_can_be_gpu_signaled(const struct kfd_event *ev) @@ -420,8 +363,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, case KFD_EVENT_TYPE_DEBUG: ret = create_signal_event(devkfd, p, ev); if (!ret) { - *event_page_offset = (ev->signal_page->page_index | - KFD_MMAP_EVENTS_MASK); + *event_page_offset = KFD_MMAP_EVENTS_MASK; *event_page_offset <<= PAGE_SHIFT; *event_slot_index = ev->signal_slot_index; } @@ -527,13 +469,17 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id) static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) { - page_slots(ev->signal_page)[ev->signal_slot_index] = + page_slots(p->signal_page)[ev->signal_slot_index] = UNSIGNALED_EVENT_SLOT; } -static bool is_slot_signaled(struct signal_page *page, unsigned int index) +static bool is_slot_signaled(struct kfd_process *p, unsigned int index) { - return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT; + if (!p->signal_page) + return false; + else + return page_slots(p->signal_page)[index] != + UNSIGNALED_EVENT_SLOT; } static void set_event_from_interrupt(struct kfd_process *p, @@ -566,22 +512,19 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, /* Partial ID is a full ID. */ ev = lookup_event_by_id(p, partial_id); set_event_from_interrupt(p, ev); - } else { + } else if (p->signal_page) { /* * Partial ID is in fact partial. For now we completely * ignore it, but we could use any bits we did receive to * search faster. */ - struct signal_page *page; unsigned int i; - list_for_each_entry(page, &p->signal_event_pages, event_pages) - for (i = 0; i < SLOTS_PER_PAGE; i++) - if (is_slot_signaled(page, i)) { - ev = lookup_event_by_page_slot(p, - page, i); - set_event_from_interrupt(p, ev); - } + for (i = 0; i < SLOTS_PER_PAGE; i++) + if (is_slot_signaled(p, i)) { + ev = lookup_event_by_page_slot(p, i); + set_event_from_interrupt(p, ev); + } } mutex_unlock(&p->event_mutex); @@ -846,9 +789,8 @@ out: int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) { - unsigned int page_index; unsigned long pfn; - struct signal_page *page; + struct kfd_signal_page *page; /* check required size is logical */ if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != @@ -857,13 +799,10 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) return -EINVAL; } - page_index = vma->vm_pgoff; - - page = lookup_signal_page_by_index(p, page_index); + page = p->signal_page; if (!page) { /* Probably KFD bug, but mmap is user-accessible. */ - pr_debug("Signal page could not be found for page_index %u\n", - page_index); + pr_debug("Signal page could not be found\n"); return -EINVAL; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index 96f9122805fa..f85fcee4414b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -60,7 +60,6 @@ struct kfd_event { wait_queue_head_t wq; /* List of event waiters. */ /* Only for signal events. */ - struct signal_page *signal_page; unsigned int signal_slot_index; uint64_t __user *user_signal_address; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index d3cf53a50ced..c1b3ee22faed 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -540,8 +540,8 @@ struct kfd_process { struct mutex event_mutex; /* All events in process hashed by ID, linked on kfd_event.events. */ DECLARE_HASHTABLE(events, 4); - /* struct slot_page_header.event_pages */ - struct list_head signal_event_pages; + /* Event page */ + struct kfd_signal_page *signal_page; u32 next_nonsignal_event_id; size_t signal_event_count; bool signal_event_limit_reached; From 482f07775cf559c82cb3d086e3c4fad91582e4cb Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:27 -0400 Subject: [PATCH 10/26] drm/amdkfd: Simplify event ID and signal slot management Signal slots are identical to event IDs. Replace the used_slot_bitmap and events hash table with an IDR to allocate and lookup event IDs and signal slots more efficiently. Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 230 ++++++++---------------- drivers/gpu/drm/amd/amdkfd/kfd_events.h | 14 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +- 3 files changed, 80 insertions(+), 170 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 7cc17107f448..41580e0638f3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -41,24 +41,16 @@ struct kfd_event_waiter { bool activated; /* Becomes true when event is signaled */ }; -#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT -#define SLOT_BITMAP_LONGS BITS_TO_LONGS(SLOTS_PER_PAGE) - /* - * Over-complicated pooled allocator for event notification slots. - * * Each signal event needs a 64-bit signal slot where the signaler will write - * a 1 before sending an interrupt.l (This is needed because some interrupts + * a 1 before sending an interrupt. (This is needed because some interrupts * do not contain enough spare data bits to identify an event.) - * We get whole pages from vmalloc and map them to the process VA. - * Individual signal events are then allocated a slot in a page. + * We get whole pages and map them to the process VA. + * Individual signal events use their event_id as slot index. */ - struct kfd_signal_page { uint64_t *kernel_address; uint64_t __user *user_address; - unsigned int free_slots; - unsigned long used_slot_bitmap[SLOT_BITMAP_LONGS]; }; /* @@ -73,34 +65,6 @@ static uint64_t *page_slots(struct kfd_signal_page *page) return page->kernel_address; } -static bool allocate_free_slot(struct kfd_process *process, - unsigned int *out_slot_index) -{ - struct kfd_signal_page *page = process->signal_page; - unsigned int slot; - - if (!page || page->free_slots == 0) { - pr_debug("No free event signal slots were found for process %p\n", - process); - - return false; - } - - slot = find_first_zero_bit(page->used_slot_bitmap, SLOTS_PER_PAGE); - - __set_bit(slot, page->used_slot_bitmap); - page->free_slots--; - - page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; - - *out_slot_index = slot; - - pr_debug("Allocated event signal slot in page %p, slot %d\n", - page, slot); - - return true; -} - static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) { void *backing_store; @@ -110,8 +74,6 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) if (!page) return NULL; - page->free_slots = SLOTS_PER_PAGE; - backing_store = (void *) __get_free_pages(GFP_KERNEL, get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); if (!backing_store) @@ -132,28 +94,26 @@ fail_alloc_signal_store: return NULL; } -static bool allocate_event_notification_slot(struct kfd_process *p, - unsigned int *signal_slot_index) +static int allocate_event_notification_slot(struct kfd_process *p, + struct kfd_event *ev) { + int id; + if (!p->signal_page) { p->signal_page = allocate_signal_page(p); if (!p->signal_page) - return false; + return -ENOMEM; } - return allocate_free_slot(p, signal_slot_index); -} + id = idr_alloc(&p->event_idr, ev, 0, KFD_SIGNAL_EVENT_LIMIT, + GFP_KERNEL); + if (id < 0) + return id; -/* Assumes that the process's event_mutex is locked. */ -static void release_event_notification_slot(struct kfd_signal_page *page, - size_t slot_index) -{ - __clear_bit(slot_index, page->used_slot_bitmap); - page->free_slots++; + ev->event_id = id; + page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT; - /* We don't free signal pages, they are retained by the process - * and reused until it exits. - */ + return 0; } /* @@ -162,89 +122,32 @@ static void release_event_notification_slot(struct kfd_signal_page *page, */ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) { - struct kfd_event *ev; - - hash_for_each_possible(p->events, ev, events, id) - if (ev->event_id == id) - return ev; - - return NULL; -} - -/* - * Produce a kfd event id for a nonsignal event. - * These are arbitrary numbers, so we do a sequential search through - * the hash table for an unused number. - */ -static u32 make_nonsignal_event_id(struct kfd_process *p) -{ - u32 id; - - for (id = p->next_nonsignal_event_id; - id < KFD_LAST_NONSIGNAL_EVENT_ID && - lookup_event_by_id(p, id); - id++) - ; - - if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { - - /* - * What if id == LAST_NONSIGNAL_EVENT_ID - 1? - * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so - * the first loop fails immediately and we proceed with the - * wraparound loop below. - */ - p->next_nonsignal_event_id = id + 1; - - return id; - } - - for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; - id < KFD_LAST_NONSIGNAL_EVENT_ID && - lookup_event_by_id(p, id); - id++) - ; - - - if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { - p->next_nonsignal_event_id = id + 1; - return id; - } - - p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; - return 0; -} - -static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p, - unsigned int signal_slot) -{ - return lookup_event_by_id(p, signal_slot); + return idr_find(&p->event_idr, id); } static int create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev) { + int ret; + if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { if (!p->signal_event_limit_reached) { pr_warn("Signal event wasn't created because limit was reached\n"); p->signal_event_limit_reached = true; } - return -ENOMEM; + return -ENOSPC; } - if (!allocate_event_notification_slot(p, &ev->signal_slot_index)) { + ret = allocate_event_notification_slot(p, ev); + if (ret) { pr_warn("Signal event wasn't created because out of kernel memory\n"); - return -ENOMEM; + return ret; } p->signal_event_count++; - ev->user_signal_address = - &p->signal_page->user_address[ev->signal_slot_index]; - - ev->event_id = ev->signal_slot_index; - + ev->user_signal_address = &p->signal_page->user_address[ev->event_id]; pr_debug("Signal event number %zu created with id %d, address %p\n", p->signal_event_count, ev->event_id, ev->user_signal_address); @@ -252,16 +155,20 @@ static int create_signal_event(struct file *devkfd, return 0; } -/* - * No non-signal events are supported yet. - * We create them as events that never signal. - * Set event calls from user-mode are failed. - */ static int create_other_event(struct kfd_process *p, struct kfd_event *ev) { - ev->event_id = make_nonsignal_event_id(p); - if (ev->event_id == 0) - return -ENOMEM; + /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an + * intentional integer overflow to -1 without a compiler + * warning. idr_alloc treats a negative value as "maximum + * signed integer". + */ + int id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID, + (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1, + GFP_KERNEL); + + if (id < 0) + return id; + ev->event_id = id; return 0; } @@ -269,9 +176,8 @@ static int create_other_event(struct kfd_process *p, struct kfd_event *ev) void kfd_event_init_process(struct kfd_process *p) { mutex_init(&p->event_mutex); - hash_init(p->events); + idr_init(&p->event_idr); p->signal_page = NULL; - p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; p->signal_event_count = 0; } @@ -284,25 +190,22 @@ static void destroy_event(struct kfd_process *p, struct kfd_event *ev) waiter->event = NULL; wake_up_all(&ev->wq); - if ((ev->type == KFD_EVENT_TYPE_SIGNAL || - ev->type == KFD_EVENT_TYPE_DEBUG) && p->signal_page) { - release_event_notification_slot(p->signal_page, - ev->signal_slot_index); + if (ev->type == KFD_EVENT_TYPE_SIGNAL || + ev->type == KFD_EVENT_TYPE_DEBUG) p->signal_event_count--; - } - hash_del(&ev->events); + idr_remove(&p->event_idr, ev->event_id); kfree(ev); } static void destroy_events(struct kfd_process *p) { struct kfd_event *ev; - struct hlist_node *tmp; - unsigned int hash_bkt; + uint32_t id; - hash_for_each_safe(p->events, hash_bkt, tmp, ev, events) + idr_for_each_entry(&p->event_idr, ev, id) destroy_event(p, ev); + idr_destroy(&p->event_idr); } /* @@ -365,7 +268,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, if (!ret) { *event_page_offset = KFD_MMAP_EVENTS_MASK; *event_page_offset <<= PAGE_SHIFT; - *event_slot_index = ev->signal_slot_index; + *event_slot_index = ev->event_id; } break; default: @@ -374,8 +277,6 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, } if (!ret) { - hash_add(p->events, &ev->events, ev->event_id); - *event_id = ev->event_id; *event_trigger_data = ev->event_id; } else { @@ -469,17 +370,7 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id) static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) { - page_slots(p->signal_page)[ev->signal_slot_index] = - UNSIGNALED_EVENT_SLOT; -} - -static bool is_slot_signaled(struct kfd_process *p, unsigned int index) -{ - if (!p->signal_page) - return false; - else - return page_slots(p->signal_page)[index] != - UNSIGNALED_EVENT_SLOT; + page_slots(p->signal_page)[ev->event_id] = UNSIGNALED_EVENT_SLOT; } static void set_event_from_interrupt(struct kfd_process *p, @@ -518,13 +409,31 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, * ignore it, but we could use any bits we did receive to * search faster. */ - unsigned int i; + uint64_t *slots = page_slots(p->signal_page); + uint32_t id; - for (i = 0; i < SLOTS_PER_PAGE; i++) - if (is_slot_signaled(p, i)) { - ev = lookup_event_by_page_slot(p, i); - set_event_from_interrupt(p, ev); + if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/2) { + /* With relatively few events, it's faster to + * iterate over the event IDR + */ + idr_for_each_entry(&p->event_idr, ev, id) { + if (id >= KFD_SIGNAL_EVENT_LIMIT) + break; + + if (slots[id] != UNSIGNALED_EVENT_SLOT) + set_event_from_interrupt(p, ev); } + } else { + /* With relatively many events, it's faster to + * iterate over the signal slots and lookup + * only signaled events from the IDR. + */ + for (id = 0; id < KFD_SIGNAL_EVENT_LIMIT; id++) + if (slots[id] != UNSIGNALED_EVENT_SLOT) { + ev = lookup_event_by_id(p, id); + set_event_from_interrupt(p, ev); + } + } } mutex_unlock(&p->event_mutex); @@ -836,12 +745,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, { struct kfd_hsa_memory_exception_data *ev_data; struct kfd_event *ev; - int bkt; + uint32_t id; bool send_signal = true; ev_data = (struct kfd_hsa_memory_exception_data *) event_data; - hash_for_each(p->events, bkt, ev, events) + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) if (ev->type == type) { send_signal = false; dev_dbg(kfd_device, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index f85fcee4414b..abca5bfebbff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -31,9 +31,13 @@ #include "kfd_priv.h" #include -#define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U -#define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK -#define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX +/* + * IDR supports non-negative integer IDs. Small IDs are used for + * signal events to match their signal slot. Use the upper half of the + * ID space for non-signal events. + */ +#define KFD_FIRST_NONSIGNAL_EVENT_ID ((INT_MAX >> 1) + 1) +#define KFD_LAST_NONSIGNAL_EVENT_ID INT_MAX /* * Written into kfd_signal_slot_t to indicate that the event is not signaled. @@ -47,9 +51,6 @@ struct kfd_event_waiter; struct signal_page; struct kfd_event { - /* All events in process, rooted at kfd_process.events. */ - struct hlist_node events; - u32 event_id; bool signaled; @@ -60,7 +61,6 @@ struct kfd_event { wait_queue_head_t wq; /* List of event waiters. */ /* Only for signal events. */ - unsigned int signal_slot_index; uint64_t __user *user_signal_address; /* type specific data */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c1b3ee22faed..ebae8e1891d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "amd_shared.h" @@ -538,11 +539,10 @@ struct kfd_process { /* Event-related data */ struct mutex event_mutex; - /* All events in process hashed by ID, linked on kfd_event.events. */ - DECLARE_HASHTABLE(events, 4); + /* Event ID allocator and lookup */ + struct idr event_idr; /* Event page */ struct kfd_signal_page *signal_page; - u32 next_nonsignal_event_id; size_t signal_event_count; bool signal_event_limit_reached; }; From 3f04f9614831b4d18dcaf228cff0617a20073f9d Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:28 -0400 Subject: [PATCH 11/26] drm/amdkfd: Use IH context ID for signal lookup This speeds up signal lookup when the IH ring entry includes a valid context ID or partial context ID. Only if the context ID is found to be invalid, fall back to an exhaustive search of all signaled events. Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../gpu/drm/amd/amdkfd/cik_event_interrupt.c | 7 +- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 73 +++++++++++++++---- 2 files changed, 64 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 66164aa67c66..3d5ccb3755d4 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -47,6 +47,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, unsigned int pasid; const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; + uint32_t context_id = ihre->data & 0xfffffff; pasid = (ihre->ring_id & 0xffff0000) >> 16; @@ -54,11 +55,11 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, return; if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, 0, 0); + kfd_signal_event_interrupt(pasid, context_id, 28); else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) - kfd_signal_event_interrupt(pasid, 0, 0); + kfd_signal_event_interrupt(pasid, context_id, 28); else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) - kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); + kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) kfd_signal_hw_exception_event(pasid); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 41580e0638f3..26e8045aa760 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -53,12 +53,6 @@ struct kfd_signal_page { uint64_t __user *user_address; }; -/* - * For signal events, the event ID is used as the interrupt user data. - * For SQ s_sendmsg interrupts, this is limited to 8 bits. - */ - -#define INTERRUPT_DATA_BITS 8 static uint64_t *page_slots(struct kfd_signal_page *page) { @@ -125,6 +119,54 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) return idr_find(&p->event_idr, id); } +/** + * lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID + * @p: Pointer to struct kfd_process + * @id: ID to look up + * @bits: Number of valid bits in @id + * + * Finds the first signaled event with a matching partial ID. If no + * matching signaled event is found, returns NULL. In that case the + * caller should assume that the partial ID is invalid and do an + * exhaustive search of all siglaned events. + * + * If multiple events with the same partial ID signal at the same + * time, they will be found one interrupt at a time, not necessarily + * in the same order the interrupts occurred. As long as the number of + * interrupts is correct, all signaled events will be seen by the + * driver. + */ +static struct kfd_event *lookup_signaled_event_by_partial_id( + struct kfd_process *p, uint32_t id, uint32_t bits) +{ + struct kfd_event *ev; + + if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT) + return NULL; + + /* Fast path for the common case that @id is not a partial ID + * and we only need a single lookup. + */ + if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) { + if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + return NULL; + + return idr_find(&p->event_idr, id); + } + + /* General case for partial IDs: Iterate over all matching IDs + * and find the first one that has signaled. + */ + for (ev = NULL; id < KFD_SIGNAL_EVENT_LIMIT && !ev; id += 1U << bits) { + if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + continue; + + ev = idr_find(&p->event_idr, id); + } + + return ev; +} + static int create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev) @@ -385,7 +427,7 @@ static void set_event_from_interrupt(struct kfd_process *p, void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits) { - struct kfd_event *ev; + struct kfd_event *ev = NULL; /* * Because we are called from arbitrary context (workqueue) as opposed @@ -399,19 +441,24 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, mutex_lock(&p->event_mutex); - if (valid_id_bits >= INTERRUPT_DATA_BITS) { - /* Partial ID is a full ID. */ - ev = lookup_event_by_id(p, partial_id); + if (valid_id_bits) + ev = lookup_signaled_event_by_partial_id(p, partial_id, + valid_id_bits); + if (ev) { set_event_from_interrupt(p, ev); } else if (p->signal_page) { /* - * Partial ID is in fact partial. For now we completely - * ignore it, but we could use any bits we did receive to - * search faster. + * Partial ID lookup failed. Assume that the event ID + * in the interrupt payload was invalid and do an + * exhaustive search of signaled events. */ uint64_t *slots = page_slots(p->signal_page); uint32_t id; + if (valid_id_bits) + pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", + partial_id, valid_id_bits); + if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/2) { /* With relatively few events, it's faster to * iterate over the event IDR From b9a5d0a5db802277c93667c6af93e699bb8c773e Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 27 Oct 2017 19:35:29 -0400 Subject: [PATCH 12/26] drm/amdkfd: Make event limit dependent on user mode mapping size This allows increasing the KFD_SIGNAL_EVENT_LIMIT in kfd_ioctl.h without breaking processes built with older kfd_ioctl.h versions. Signed-off-by: Felix Kuehling Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 25 +++++++++++++++++++------ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 26e8045aa760..cb92d4b72400 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -97,9 +97,17 @@ static int allocate_event_notification_slot(struct kfd_process *p, p->signal_page = allocate_signal_page(p); if (!p->signal_page) return -ENOMEM; + /* Oldest user mode expects 256 event slots */ + p->signal_mapped_size = 256*8; } - id = idr_alloc(&p->event_idr, ev, 0, KFD_SIGNAL_EVENT_LIMIT, + /* + * Compatibility with old user mode: Only use signal slots + * user mode has mapped, may be less than + * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase + * of the event limit without breaking user mode. + */ + id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8, GFP_KERNEL); if (id < 0) return id; @@ -173,7 +181,8 @@ static int create_signal_event(struct file *devkfd, { int ret; - if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { + if (p->signal_mapped_size && + p->signal_event_count == p->signal_mapped_size / 8) { if (!p->signal_event_limit_reached) { pr_warn("Signal event wasn't created because limit was reached\n"); p->signal_event_limit_reached = true; @@ -744,12 +753,12 @@ out: int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) { - unsigned long pfn; struct kfd_signal_page *page; + int ret; - /* check required size is logical */ - if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != + /* check required size doesn't exceed the allocated size */ + if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) < get_order(vma->vm_end - vma->vm_start)) { pr_err("Event page mmap requested illegal size\n"); return -EINVAL; @@ -779,8 +788,12 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) page->user_address = (uint64_t __user *)vma->vm_start; /* mapping the page to user process */ - return remap_pfn_range(vma, vma->vm_start, pfn, + ret = remap_pfn_range(vma, vma->vm_start, pfn, vma->vm_end - vma->vm_start, vma->vm_page_prot); + if (!ret) + p->signal_mapped_size = vma->vm_end - vma->vm_start; + + return ret; } /* diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index ebae8e1891d2..ba26da81d8ec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -543,6 +543,7 @@ struct kfd_process { struct idr event_idr; /* Event page */ struct kfd_signal_page *signal_page; + size_t signal_mapped_size; size_t signal_event_count; bool signal_event_limit_reached; }; From 7e86a365a8319970e002f83c73701a86d95a69e6 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Fri, 27 Oct 2017 19:35:30 -0400 Subject: [PATCH 13/26] drm/amdkfd: increase limit of signal events to 4096 per process Signed-off-by: Oded Gabbay Reviewed-by: Ben Goz Signed-off-by: Felix Kuehling --- include/uapi/linux/kfd_ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 26283fefdf5f..731d0df722e3 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -169,7 +169,7 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_IOC_WAIT_RESULT_TIMEOUT 1 #define KFD_IOC_WAIT_RESULT_FAIL 2 -#define KFD_SIGNAL_EVENT_LIMIT 256 +#define KFD_SIGNAL_EVENT_LIMIT 4096 struct kfd_ioctl_create_event_args { __u64 event_page_offset; /* from KFD */ From 04ad47bd14a0f7a268124c4fc468e964e457a702 Mon Sep 17 00:00:00 2001 From: Andres Rodriguez Date: Fri, 27 Oct 2017 19:35:31 -0400 Subject: [PATCH 14/26] drm/amdkfd: use standard kernel kfifo for IH Replace our implementation of a lockless ring buffer with the standard linux kernel kfifo. We shouldn't maintain our own version of a standard data structure. Signed-off-by: Andres Rodriguez Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 76 +++++++--------------- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +- 2 files changed, 26 insertions(+), 56 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 70b3a99cffc2..ffbb91aa9bbf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -42,25 +42,24 @@ #include #include +#include #include "kfd_priv.h" -#define KFD_INTERRUPT_RING_SIZE 1024 +#define KFD_IH_NUM_ENTRIES 1024 static void interrupt_wq(struct work_struct *); int kfd_interrupt_init(struct kfd_dev *kfd) { - void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE, - kfd->device_info->ih_ring_entry_size, - GFP_KERNEL); - if (!interrupt_ring) - return -ENOMEM; + int r; - kfd->interrupt_ring = interrupt_ring; - kfd->interrupt_ring_size = - KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size; - atomic_set(&kfd->interrupt_ring_wptr, 0); - atomic_set(&kfd->interrupt_ring_rptr, 0); + r = kfifo_alloc(&kfd->ih_fifo, + KFD_IH_NUM_ENTRIES * kfd->device_info->ih_ring_entry_size, + GFP_KERNEL); + if (r) { + dev_err(kfd_chardev(), "Failed to allocate IH fifo\n"); + return r; + } spin_lock_init(&kfd->interrupt_lock); @@ -98,68 +97,41 @@ void kfd_interrupt_exit(struct kfd_dev *kfd) */ flush_scheduled_work(); - kfree(kfd->interrupt_ring); + kfifo_free(&kfd->ih_fifo); } /* - * This assumes that it can't be called concurrently with itself - * but only with dequeue_ih_ring_entry. + * Assumption: single reader/writer. This function is not re-entrant */ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) { - unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); - unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); + int count; - if ((rptr - wptr) % kfd->interrupt_ring_size == - kfd->device_info->ih_ring_entry_size) { - /* This is very bad, the system is likely to hang. */ + count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + if (count != kfd->device_info->ih_ring_entry_size) { dev_err_ratelimited(kfd_chardev(), - "Interrupt ring overflow, dropping interrupt.\n"); + "Interrupt ring overflow, dropping interrupt %d\n", + count); return false; } - memcpy(kfd->interrupt_ring + wptr, ih_ring_entry, - kfd->device_info->ih_ring_entry_size); - - wptr = (wptr + kfd->device_info->ih_ring_entry_size) % - kfd->interrupt_ring_size; - smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */ - atomic_set(&kfd->interrupt_ring_wptr, wptr); - return true; } /* - * This assumes that it can't be called concurrently with itself - * but only with enqueue_ih_ring_entry. + * Assumption: single reader/writer. This function is not re-entrant */ static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) { - /* - * Assume that wait queues have an implicit barrier, i.e. anything that - * happened in the ISR before it queued work is visible. - */ + int count; - unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); - unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); + count = kfifo_out(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); - if (rptr == wptr) - return false; + WARN_ON(count && count != kfd->device_info->ih_ring_entry_size); - memcpy(ih_ring_entry, kfd->interrupt_ring + rptr, - kfd->device_info->ih_ring_entry_size); - - rptr = (rptr + kfd->device_info->ih_ring_entry_size) % - kfd->interrupt_ring_size; - - /* - * Ensure the rptr write update is not visible until - * memcpy has finished reading. - */ - smp_mb(); - atomic_set(&kfd->interrupt_ring_rptr, rptr); - - return true; + return count == kfd->device_info->ih_ring_entry_size; } static void interrupt_wq(struct work_struct *work) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index ba26da81d8ec..0aec5ca8a964 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "amd_shared.h" @@ -182,10 +183,7 @@ struct kfd_dev { unsigned int gtt_sa_num_of_chunks; /* Interrupts */ - void *interrupt_ring; - size_t interrupt_ring_size; - atomic_t interrupt_ring_rptr; - atomic_t interrupt_ring_wptr; + struct kfifo ih_fifo; struct work_struct interrupt_work; spinlock_t interrupt_lock; From 27232055b12902073f3dbc17cdfa2def27f70d85 Mon Sep 17 00:00:00 2001 From: Andres Rodriguez Date: Fri, 27 Oct 2017 19:35:32 -0400 Subject: [PATCH 15/26] drm/amdkfd: increase IH num entries to 8192 A larger buffer will let us accommodate applications with a large amount of semi-simultaneous event signals. Signed-off-by: Andres Rodriguez Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index ffbb91aa9bbf..a1472691e028 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -45,7 +45,7 @@ #include #include "kfd_priv.h" -#define KFD_IH_NUM_ENTRIES 1024 +#define KFD_IH_NUM_ENTRIES 8192 static void interrupt_wq(struct work_struct *); From 0f875e3f3e422d28bb80757269837def75009778 Mon Sep 17 00:00:00 2001 From: Andres Rodriguez Date: Fri, 27 Oct 2017 19:35:33 -0400 Subject: [PATCH 16/26] drm/amdkfd: wait only for IH work on IH exit We don't need to wait for all work to complete in the IH exit function. We only need to make sure the interrupt_work has finished executing to guarantee that ih_kfifo is no longer in use. Signed-off-by: Andres Rodriguez Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index a1472691e028..9c08d4670b7f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -91,11 +91,11 @@ void kfd_interrupt_exit(struct kfd_dev *kfd) spin_unlock_irqrestore(&kfd->interrupt_lock, flags); /* - * Flush_scheduled_work ensures that there are no outstanding + * flush_work ensures that there are no outstanding * work-queue items that will access interrupt_ring. New work items * can't be created because we stopped interrupt handling above. */ - flush_scheduled_work(); + flush_work(&kfd->interrupt_work); kfifo_free(&kfd->ih_fifo); } From 48e876a20e79566f1736413d4f42dc66f3ab2f16 Mon Sep 17 00:00:00 2001 From: Andres Rodriguez Date: Fri, 27 Oct 2017 19:35:34 -0400 Subject: [PATCH 17/26] drm/amdkfd: use a high priority workqueue for IH work In systems under heavy load the IH work may experience significant scheduling delays. Under load + system workqueue: Max Latency: 7.023695 ms Avg Latency: 0.263994 ms Under load + high priority workqueue: Max Latency: 1.162568 ms Avg Latency: 0.163213 ms Further work is required to measure the impact of per-cpu settings on IH performance. Signed-off-by: Andres Rodriguez Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 46049f005b02..621a3b53a038 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -403,7 +403,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry) && enqueue_ih_ring_entry(kfd, ih_ring_entry)) - schedule_work(&kfd->interrupt_work); + queue_work(kfd->ih_wq, &kfd->interrupt_work); spin_unlock(&kfd->interrupt_lock); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 9c08d4670b7f..035c351f47c5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -61,6 +61,7 @@ int kfd_interrupt_init(struct kfd_dev *kfd) return r; } + kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); spin_lock_init(&kfd->interrupt_lock); INIT_WORK(&kfd->interrupt_work, interrupt_wq); @@ -95,7 +96,7 @@ void kfd_interrupt_exit(struct kfd_dev *kfd) * work-queue items that will access interrupt_ring. New work items * can't be created because we stopped interrupt handling above. */ - flush_work(&kfd->interrupt_work); + flush_workqueue(kfd->ih_wq); kfifo_free(&kfd->ih_fifo); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 0aec5ca8a964..6a91a60c64cf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -184,6 +184,7 @@ struct kfd_dev { /* Interrupts */ struct kfifo ih_fifo; + struct workqueue_struct *ih_wq; struct work_struct interrupt_work; spinlock_t interrupt_lock; From f4fa88ab28ab61941a22f938eda3d93d1fe371af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 30 Oct 2017 14:16:21 +0100 Subject: [PATCH 18/26] drm/radeon: deprecate and remove KFD interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To quote Felix: "For testing KV with current user mode stack, please use amdgpu. I don't expect this to work with radeon and I'm not planning to spend any effort on making radeon work with a current user mode stack." Only compile tested, but should be straight forward. Signed-off-by: Christian König Signed-off-by: Oded Gabbay --- MAINTAINERS | 2 - drivers/gpu/drm/amd/amdkfd/Kconfig | 2 +- drivers/gpu/drm/radeon/Makefile | 3 +- drivers/gpu/drm/radeon/cik.c | 14 +- drivers/gpu/drm/radeon/cikd.h | 2 - drivers/gpu/drm/radeon/radeon.h | 3 - drivers/gpu/drm/radeon/radeon_drv.c | 10 - drivers/gpu/drm/radeon/radeon_kfd.c | 901 ---------------------------- drivers/gpu/drm/radeon/radeon_kfd.h | 47 -- drivers/gpu/drm/radeon/radeon_kms.c | 7 - 10 files changed, 4 insertions(+), 987 deletions(-) delete mode 100644 drivers/gpu/drm/radeon/radeon_kfd.c delete mode 100644 drivers/gpu/drm/radeon/radeon_kfd.h diff --git a/MAINTAINERS b/MAINTAINERS index 0525701befd0..6e1f94b4ed26 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -759,8 +759,6 @@ F: drivers/gpu/drm/amd/amdkfd/ F: drivers/gpu/drm/amd/include/cik_structs.h F: drivers/gpu/drm/amd/include/kgd_kfd_interface.h F: drivers/gpu/drm/amd/include/vi_structs.h -F: drivers/gpu/drm/radeon/radeon_kfd.c -F: drivers/gpu/drm/radeon/radeon_kfd.h F: include/uapi/linux/kfd_ioctl.h AMD SEATTLE DEVICE TREE SUPPORT diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index e13c67c8d2c0..bc5a2945bd2b 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -4,6 +4,6 @@ config HSA_AMD tristate "HSA kernel driver for AMD GPU devices" - depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 + depends on DRM_AMDGPU && AMD_IOMMU_V2 && X86_64 help Enable this if you want to use HSA features on AMD GPU devices. diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile index be16c6390216..cf3e5985e3e7 100644 --- a/drivers/gpu/drm/radeon/Makefile +++ b/drivers/gpu/drm/radeon/Makefile @@ -102,8 +102,7 @@ radeon-y += \ radeon-y += \ radeon_vce.o \ vce_v1_0.o \ - vce_v2_0.o \ - radeon_kfd.o + vce_v2_0.o radeon-$(CONFIG_VGA_SWITCHEROO) += radeon_atpx_handler.o radeon-$(CONFIG_ACPI) += radeon_acpi.o diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 3cb6c55b268d..898f9a078830 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -33,7 +33,6 @@ #include "cik_blit_shaders.h" #include "radeon_ucode.h" #include "clearstate_ci.h" -#include "radeon_kfd.h" #define SH_MEM_CONFIG_GFX_DEFAULT \ ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) @@ -5684,10 +5683,9 @@ int cik_vm_init(struct radeon_device *rdev) /* * number of VMs * VMID 0 is reserved for System - * radeon graphics/compute will use VMIDs 1-7 - * amdkfd will use VMIDs 8-15 + * radeon graphics/compute will use VMIDs 1-15 */ - rdev->vm_manager.nvm = RADEON_NUM_OF_VMIDS; + rdev->vm_manager.nvm = 16; /* base offset of vram pages */ if (rdev->flags & RADEON_IS_IGP) { u64 tmp = RREG32(MC_VM_FB_OFFSET); @@ -7589,9 +7587,6 @@ restart_ih: /* wptr/rptr are in bytes! */ ring_index = rptr / 4; - radeon_kfd_interrupt(rdev, - (const void *) &rdev->ih.ring[ring_index]); - src_id = le32_to_cpu(rdev->ih.ring[ring_index]) & 0xff; src_data = le32_to_cpu(rdev->ih.ring[ring_index + 1]) & 0xfffffff; ring_id = le32_to_cpu(rdev->ih.ring[ring_index + 2]) & 0xff; @@ -8486,10 +8481,6 @@ static int cik_startup(struct radeon_device *rdev) if (r) return r; - r = radeon_kfd_resume(rdev); - if (r) - return r; - return 0; } @@ -8538,7 +8529,6 @@ int cik_resume(struct radeon_device *rdev) */ int cik_suspend(struct radeon_device *rdev) { - radeon_kfd_suspend(rdev); radeon_pm_suspend(rdev); radeon_audio_fini(rdev); radeon_vm_manager_fini(rdev); diff --git a/drivers/gpu/drm/radeon/cikd.h b/drivers/gpu/drm/radeon/cikd.h index e21015475ed5..cda16fcd43bb 100644 --- a/drivers/gpu/drm/radeon/cikd.h +++ b/drivers/gpu/drm/radeon/cikd.h @@ -30,8 +30,6 @@ #define CIK_RB_BITMAP_WIDTH_PER_SH 2 #define HAWAII_RB_BITMAP_WIDTH_PER_SH 4 -#define RADEON_NUM_OF_VMIDS 8 - /* DIDT IND registers */ #define DIDT_SQ_CTRL0 0x0 # define DIDT_CTRL_EN (1 << 0) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 8cbaeec090c9..a8e546569858 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -2456,9 +2456,6 @@ struct radeon_device { u64 vram_pin_size; u64 gart_pin_size; - /* amdkfd interface */ - struct kfd_dev *kfd; - struct mutex mn_lock; DECLARE_HASHTABLE(mn_hash, 7); }; diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index f4becad0a78c..31dd04f6baa1 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -43,7 +43,6 @@ #include #include -#include "radeon_kfd.h" /* * KMS wrapper. @@ -338,14 +337,6 @@ static int radeon_pci_probe(struct pci_dev *pdev, { int ret; - /* - * Initialize amdkfd before starting radeon. If it was not loaded yet, - * defer radeon probing - */ - ret = radeon_kfd_init(); - if (ret == -EPROBE_DEFER) - return ret; - if (vga_switcheroo_client_probe_defer(pdev)) return -EPROBE_DEFER; @@ -645,7 +636,6 @@ static int __init radeon_init(void) static void __exit radeon_exit(void) { - radeon_kfd_fini(); pci_unregister_driver(pdriver); radeon_unregister_atpx_handler(); } diff --git a/drivers/gpu/drm/radeon/radeon_kfd.c b/drivers/gpu/drm/radeon/radeon_kfd.c deleted file mode 100644 index 385b4d76956d..000000000000 --- a/drivers/gpu/drm/radeon/radeon_kfd.c +++ /dev/null @@ -1,901 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include -#include -#include "radeon.h" -#include "cikd.h" -#include "cik_reg.h" -#include "radeon_kfd.h" -#include "radeon_ucode.h" -#include -#include "cik_structs.h" - -#define CIK_PIPE_PER_MEC (4) - -static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { - TCP_WATCH0_ADDR_H, TCP_WATCH0_ADDR_L, TCP_WATCH0_CNTL, - TCP_WATCH1_ADDR_H, TCP_WATCH1_ADDR_L, TCP_WATCH1_CNTL, - TCP_WATCH2_ADDR_H, TCP_WATCH2_ADDR_L, TCP_WATCH2_CNTL, - TCP_WATCH3_ADDR_H, TCP_WATCH3_ADDR_L, TCP_WATCH3_CNTL -}; - -struct kgd_mem { - struct radeon_bo *bo; - uint64_t gpu_addr; - void *cpu_ptr; -}; - - -static int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr); - -static void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); - -static uint64_t get_vmem_size(struct kgd_dev *kgd); -static uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); - -static uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); - -static int alloc_pasid(unsigned int bits); -static void free_pasid(unsigned int pasid); - -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - -/* - * Register access functions - */ - -static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, - uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); - -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - unsigned int vmid); - -static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); -static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); -static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); -static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); - -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id); -static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); -static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int timeout); -static int kgd_address_watch_disable(struct kgd_dev *kgd); -static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo); -static int kgd_wave_control_execute(struct kgd_dev *kgd, - uint32_t gfx_index_val, - uint32_t sq_cmd); -static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, - unsigned int watch_point_id, - unsigned int reg_offset); - -static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid); -static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); -static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); - -static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, - .get_vmem_size = get_vmem_size, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, - .alloc_pasid = alloc_pasid, - .free_pasid = free_pasid, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, - .init_interrupts = kgd_init_interrupts, - .hqd_load = kgd_hqd_load, - .hqd_sdma_load = kgd_hqd_sdma_load, - .hqd_is_occupied = kgd_hqd_is_occupied, - .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, - .hqd_destroy = kgd_hqd_destroy, - .hqd_sdma_destroy = kgd_hqd_sdma_destroy, - .address_watch_disable = kgd_address_watch_disable, - .address_watch_execute = kgd_address_watch_execute, - .wave_control_execute = kgd_wave_control_execute, - .address_watch_get_offset = kgd_address_watch_get_offset, - .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid, - .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, - .get_fw_version = get_fw_version -}; - -static const struct kgd2kfd_calls *kgd2kfd; - -int radeon_kfd_init(void) -{ - int ret; - -#if defined(CONFIG_HSA_AMD_MODULE) - int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); - - kgd2kfd_init_p = symbol_request(kgd2kfd_init); - - if (kgd2kfd_init_p == NULL) - return -ENOENT; - - ret = kgd2kfd_init_p(KFD_INTERFACE_VERSION, &kgd2kfd); - if (ret) { - symbol_put(kgd2kfd_init); - kgd2kfd = NULL; - } - -#elif defined(CONFIG_HSA_AMD) - ret = kgd2kfd_init(KFD_INTERFACE_VERSION, &kgd2kfd); - if (ret) - kgd2kfd = NULL; - -#else - ret = -ENOENT; -#endif - - return ret; -} - -void radeon_kfd_fini(void) -{ - if (kgd2kfd) { - kgd2kfd->exit(); - symbol_put(kgd2kfd_init); - } -} - -void radeon_kfd_device_probe(struct radeon_device *rdev) -{ - if (kgd2kfd) - rdev->kfd = kgd2kfd->probe((struct kgd_dev *)rdev, - rdev->pdev, &kfd2kgd); -} - -void radeon_kfd_device_init(struct radeon_device *rdev) -{ - int i, queue, pipe, mec; - - if (rdev->kfd) { - struct kgd2kfd_shared_resources gpu_resources = { - .compute_vmid_bitmap = 0xFF00, - .num_pipe_per_mec = 4, - .num_queue_per_pipe = 8 - }; - - bitmap_zero(gpu_resources.queue_bitmap, KGD_MAX_QUEUES); - - for (i = 0; i < KGD_MAX_QUEUES; ++i) { - queue = i % gpu_resources.num_queue_per_pipe; - pipe = (i / gpu_resources.num_queue_per_pipe) - % gpu_resources.num_pipe_per_mec; - mec = (i / gpu_resources.num_queue_per_pipe) - / gpu_resources.num_pipe_per_mec; - - if (mec == 0 && pipe > 0) - set_bit(i, gpu_resources.queue_bitmap); - } - - radeon_doorbell_get_kfd_info(rdev, - &gpu_resources.doorbell_physical_address, - &gpu_resources.doorbell_aperture_size, - &gpu_resources.doorbell_start_offset); - - kgd2kfd->device_init(rdev->kfd, &gpu_resources); - } -} - -void radeon_kfd_device_fini(struct radeon_device *rdev) -{ - if (rdev->kfd) { - kgd2kfd->device_exit(rdev->kfd); - rdev->kfd = NULL; - } -} - -void radeon_kfd_interrupt(struct radeon_device *rdev, const void *ih_ring_entry) -{ - if (rdev->kfd) - kgd2kfd->interrupt(rdev->kfd, ih_ring_entry); -} - -void radeon_kfd_suspend(struct radeon_device *rdev) -{ - if (rdev->kfd) - kgd2kfd->suspend(rdev->kfd); -} - -int radeon_kfd_resume(struct radeon_device *rdev) -{ - int r = 0; - - if (rdev->kfd) - r = kgd2kfd->resume(rdev->kfd); - - return r; -} - -static int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - struct kgd_mem **mem = (struct kgd_mem **) mem_obj; - int r; - - BUG_ON(kgd == NULL); - BUG_ON(gpu_addr == NULL); - BUG_ON(cpu_ptr == NULL); - - *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL); - if ((*mem) == NULL) - return -ENOMEM; - - r = radeon_bo_create(rdev, size, PAGE_SIZE, true, RADEON_GEM_DOMAIN_GTT, - RADEON_GEM_GTT_WC, NULL, NULL, &(*mem)->bo); - if (r) { - dev_err(rdev->dev, - "failed to allocate BO for amdkfd (%d)\n", r); - return r; - } - - /* map the buffer */ - r = radeon_bo_reserve((*mem)->bo, true); - if (r) { - dev_err(rdev->dev, "(%d) failed to reserve bo for amdkfd\n", r); - goto allocate_mem_reserve_bo_failed; - } - - r = radeon_bo_pin((*mem)->bo, RADEON_GEM_DOMAIN_GTT, - &(*mem)->gpu_addr); - if (r) { - dev_err(rdev->dev, "(%d) failed to pin bo for amdkfd\n", r); - goto allocate_mem_pin_bo_failed; - } - *gpu_addr = (*mem)->gpu_addr; - - r = radeon_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr); - if (r) { - dev_err(rdev->dev, - "(%d) failed to map bo to kernel for amdkfd\n", r); - goto allocate_mem_kmap_bo_failed; - } - *cpu_ptr = (*mem)->cpu_ptr; - - radeon_bo_unreserve((*mem)->bo); - - return 0; - -allocate_mem_kmap_bo_failed: - radeon_bo_unpin((*mem)->bo); -allocate_mem_pin_bo_failed: - radeon_bo_unreserve((*mem)->bo); -allocate_mem_reserve_bo_failed: - radeon_bo_unref(&(*mem)->bo); - - return r; -} - -static void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) -{ - struct kgd_mem *mem = (struct kgd_mem *) mem_obj; - - BUG_ON(mem == NULL); - - radeon_bo_reserve(mem->bo, true); - radeon_bo_kunmap(mem->bo); - radeon_bo_unpin(mem->bo); - radeon_bo_unreserve(mem->bo); - radeon_bo_unref(&(mem->bo)); - kfree(mem); -} - -static uint64_t get_vmem_size(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - - BUG_ON(kgd == NULL); - - return rdev->mc.real_vram_size; -} - -static uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - - return rdev->asic->get_gpu_clock_counter(rdev); -} - -static uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - - /* The sclk is in quantas of 10kHz */ - return rdev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; -} - -/* - * PASID manager - */ -static DEFINE_IDA(pasid_ida); - -static int alloc_pasid(unsigned int bits) -{ - int pasid = -EINVAL; - - for (bits = min(bits, 31U); bits > 0; bits--) { - pasid = ida_simple_get(&pasid_ida, - 1U << (bits - 1), 1U << bits, - GFP_KERNEL); - if (pasid != -ENOSPC) - break; - } - - return pasid; -} - -static void free_pasid(unsigned int pasid) -{ - ida_simple_remove(&pasid_ida, pasid); -} - -static inline struct radeon_device *get_radeon_device(struct kgd_dev *kgd) -{ - return (struct radeon_device *)kgd; -} - -static void write_register(struct kgd_dev *kgd, uint32_t offset, uint32_t value) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - - writel(value, (void __iomem *)(rdev->rmmio + offset)); -} - -static uint32_t read_register(struct kgd_dev *kgd, uint32_t offset) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - - return readl((void __iomem *)(rdev->rmmio + offset)); -} - -static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, - uint32_t queue, uint32_t vmid) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - uint32_t value = PIPEID(pipe) | MEID(mec) | VMID(vmid) | QUEUEID(queue); - - mutex_lock(&rdev->srbm_mutex); - write_register(kgd, SRBM_GFX_CNTL, value); -} - -static void unlock_srbm(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - - write_register(kgd, SRBM_GFX_CNTL, 0); - mutex_unlock(&rdev->srbm_mutex); -} - -static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t queue_id) -{ - uint32_t mec = (++pipe_id / CIK_PIPE_PER_MEC) + 1; - uint32_t pipe = (pipe_id % CIK_PIPE_PER_MEC); - - lock_srbm(kgd, mec, pipe, queue_id, 0); -} - -static void release_queue(struct kgd_dev *kgd) -{ - unlock_srbm(kgd); -} - -static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, - uint32_t sh_mem_ape1_base, - uint32_t sh_mem_ape1_limit, - uint32_t sh_mem_bases) -{ - lock_srbm(kgd, 0, 0, 0, vmid); - - write_register(kgd, SH_MEM_CONFIG, sh_mem_config); - write_register(kgd, SH_MEM_APE1_BASE, sh_mem_ape1_base); - write_register(kgd, SH_MEM_APE1_LIMIT, sh_mem_ape1_limit); - write_register(kgd, SH_MEM_BASES, sh_mem_bases); - - unlock_srbm(kgd); -} - -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - unsigned int vmid) -{ - /* - * We have to assume that there is no outstanding mapping. - * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 - * because a mapping is in progress or because a mapping finished and - * the SW cleared it. - * So the protocol is to always wait & clear. - */ - uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | - ATC_VMID_PASID_MAPPING_VALID_MASK; - - write_register(kgd, ATC_VMID0_PASID_MAPPING + vmid*sizeof(uint32_t), - pasid_mapping); - - while (!(read_register(kgd, ATC_VMID_PASID_MAPPING_UPDATE_STATUS) & - (1U << vmid))) - cpu_relax(); - write_register(kgd, ATC_VMID_PASID_MAPPING_UPDATE_STATUS, 1U << vmid); - - /* Mapping vmid to pasid also for IH block */ - write_register(kgd, IH_VMID_0_LUT + vmid * sizeof(uint32_t), - pasid_mapping); - - return 0; -} - -static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr) -{ - /* nothing to do here */ - return 0; -} - -static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) -{ - uint32_t mec; - uint32_t pipe; - - mec = (pipe_id / CIK_PIPE_PER_MEC) + 1; - pipe = (pipe_id % CIK_PIPE_PER_MEC); - - lock_srbm(kgd, mec, pipe, 0, 0); - - write_register(kgd, CPC_INT_CNTL, - TIME_STAMP_INT_ENABLE | OPCODE_ERROR_INT_ENABLE); - - unlock_srbm(kgd); - - return 0; -} - -static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) -{ - uint32_t retval; - - retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + - m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; - - pr_debug("kfd: sdma base address: 0x%x\n", retval); - - return retval; -} - -static inline struct cik_mqd *get_mqd(void *mqd) -{ - return (struct cik_mqd *)mqd; -} - -static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -{ - return (struct cik_sdma_rlc_registers *)mqd; -} - -static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm) -{ - uint32_t wptr_shadow, is_wptr_shadow_valid; - struct cik_mqd *m; - - m = get_mqd(mqd); - - is_wptr_shadow_valid = !get_user(wptr_shadow, wptr); - - acquire_queue(kgd, pipe_id, queue_id); - write_register(kgd, CP_MQD_BASE_ADDR, m->cp_mqd_base_addr_lo); - write_register(kgd, CP_MQD_BASE_ADDR_HI, m->cp_mqd_base_addr_hi); - write_register(kgd, CP_MQD_CONTROL, m->cp_mqd_control); - - write_register(kgd, CP_HQD_PQ_BASE, m->cp_hqd_pq_base_lo); - write_register(kgd, CP_HQD_PQ_BASE_HI, m->cp_hqd_pq_base_hi); - write_register(kgd, CP_HQD_PQ_CONTROL, m->cp_hqd_pq_control); - - write_register(kgd, CP_HQD_IB_CONTROL, m->cp_hqd_ib_control); - write_register(kgd, CP_HQD_IB_BASE_ADDR, m->cp_hqd_ib_base_addr_lo); - write_register(kgd, CP_HQD_IB_BASE_ADDR_HI, m->cp_hqd_ib_base_addr_hi); - - write_register(kgd, CP_HQD_IB_RPTR, m->cp_hqd_ib_rptr); - - write_register(kgd, CP_HQD_PERSISTENT_STATE, - m->cp_hqd_persistent_state); - write_register(kgd, CP_HQD_SEMA_CMD, m->cp_hqd_sema_cmd); - write_register(kgd, CP_HQD_MSG_TYPE, m->cp_hqd_msg_type); - - write_register(kgd, CP_HQD_ATOMIC0_PREOP_LO, - m->cp_hqd_atomic0_preop_lo); - - write_register(kgd, CP_HQD_ATOMIC0_PREOP_HI, - m->cp_hqd_atomic0_preop_hi); - - write_register(kgd, CP_HQD_ATOMIC1_PREOP_LO, - m->cp_hqd_atomic1_preop_lo); - - write_register(kgd, CP_HQD_ATOMIC1_PREOP_HI, - m->cp_hqd_atomic1_preop_hi); - - write_register(kgd, CP_HQD_PQ_RPTR_REPORT_ADDR, - m->cp_hqd_pq_rptr_report_addr_lo); - - write_register(kgd, CP_HQD_PQ_RPTR_REPORT_ADDR_HI, - m->cp_hqd_pq_rptr_report_addr_hi); - - write_register(kgd, CP_HQD_PQ_RPTR, m->cp_hqd_pq_rptr); - - write_register(kgd, CP_HQD_PQ_WPTR_POLL_ADDR, - m->cp_hqd_pq_wptr_poll_addr_lo); - - write_register(kgd, CP_HQD_PQ_WPTR_POLL_ADDR_HI, - m->cp_hqd_pq_wptr_poll_addr_hi); - - write_register(kgd, CP_HQD_PQ_DOORBELL_CONTROL, - m->cp_hqd_pq_doorbell_control); - - write_register(kgd, CP_HQD_VMID, m->cp_hqd_vmid); - - write_register(kgd, CP_HQD_QUANTUM, m->cp_hqd_quantum); - - write_register(kgd, CP_HQD_PIPE_PRIORITY, m->cp_hqd_pipe_priority); - write_register(kgd, CP_HQD_QUEUE_PRIORITY, m->cp_hqd_queue_priority); - - write_register(kgd, CP_HQD_IQ_RPTR, m->cp_hqd_iq_rptr); - - if (is_wptr_shadow_valid) - write_register(kgd, CP_HQD_PQ_WPTR, wptr_shadow); - - write_register(kgd, CP_HQD_ACTIVE, m->cp_hqd_active); - release_queue(kgd); - - return 0; -} - -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) -{ - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_VIRTUAL_ADDR, - m->sdma_rlc_virtual_addr); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_BASE, - m->sdma_rlc_rb_base); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_BASE_HI, - m->sdma_rlc_rb_base_hi); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_RPTR_ADDR_LO, - m->sdma_rlc_rb_rptr_addr_lo); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_RPTR_ADDR_HI, - m->sdma_rlc_rb_rptr_addr_hi); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_DOORBELL, - m->sdma_rlc_doorbell); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_CNTL, - m->sdma_rlc_rb_cntl); - - return 0; -} - -static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id) -{ - uint32_t act; - bool retval = false; - uint32_t low, high; - - acquire_queue(kgd, pipe_id, queue_id); - act = read_register(kgd, CP_HQD_ACTIVE); - if (act) { - low = lower_32_bits(queue_address >> 8); - high = upper_32_bits(queue_address >> 8); - - if (low == read_register(kgd, CP_HQD_PQ_BASE) && - high == read_register(kgd, CP_HQD_PQ_BASE_HI)) - retval = true; - } - release_queue(kgd); - return retval; -} - -static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) -{ - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t sdma_rlc_rb_cntl; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - - sdma_rlc_rb_cntl = read_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_CNTL); - - if (sdma_rlc_rb_cntl & SDMA_RB_ENABLE) - return true; - - return false; -} - -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - uint32_t temp; - - acquire_queue(kgd, pipe_id, queue_id); - write_register(kgd, CP_HQD_PQ_DOORBELL_CONTROL, 0); - - write_register(kgd, CP_HQD_DEQUEUE_REQUEST, reset_type); - - while (true) { - temp = read_register(kgd, CP_HQD_ACTIVE); - if (temp & 0x1) - break; - if (timeout == 0) { - pr_err("kfd: cp queue preemption time out (%dms)\n", - temp); - release_queue(kgd); - return -ETIME; - } - msleep(20); - timeout -= 20; - } - - release_queue(kgd); - return 0; -} - -static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int timeout) -{ - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t temp; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - - temp = read_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_CNTL); - temp = temp & ~SDMA_RB_ENABLE; - write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_CNTL, temp); - - while (true) { - temp = read_register(kgd, sdma_base_addr + - SDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA_RLC_IDLE) - break; - if (timeout == 0) - return -ETIME; - msleep(20); - timeout -= 20; - } - - write_register(kgd, sdma_base_addr + SDMA0_RLC0_DOORBELL, 0); - write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_RPTR, 0); - write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_WPTR, 0); - write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_BASE, 0); - - return 0; -} - -static int kgd_address_watch_disable(struct kgd_dev *kgd) -{ - union TCP_WATCH_CNTL_BITS cntl; - unsigned int i; - - cntl.u32All = 0; - - cntl.bitfields.valid = 0; - cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; - cntl.bitfields.atc = 1; - - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) - write_register(kgd, - watchRegs[i * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - return 0; -} - -static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo) -{ - union TCP_WATCH_CNTL_BITS cntl; - - cntl.u32All = cntl_val; - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; - write_register(kgd, - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - write_register(kgd, - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_ADDR_HI], - addr_hi); - - write_register(kgd, - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_ADDR_LO], - addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - - write_register(kgd, - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - return 0; -} - -static int kgd_wave_control_execute(struct kgd_dev *kgd, - uint32_t gfx_index_val, - uint32_t sq_cmd) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - uint32_t data; - - mutex_lock(&rdev->grbm_idx_mutex); - - write_register(kgd, GRBM_GFX_INDEX, gfx_index_val); - write_register(kgd, SQ_CMD, sq_cmd); - - /* Restore the GRBM_GFX_INDEX register */ - - data = INSTANCE_BROADCAST_WRITES | SH_BROADCAST_WRITES | - SE_BROADCAST_WRITES; - - write_register(kgd, GRBM_GFX_INDEX, data); - - mutex_unlock(&rdev->grbm_idx_mutex); - - return 0; -} - -static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, - unsigned int watch_point_id, - unsigned int reg_offset) -{ - return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset] - / 4; -} - -static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid) -{ - uint32_t reg; - struct radeon_device *rdev = (struct radeon_device *) kgd; - - reg = RREG32(ATC_VMID0_PASID_MAPPING + vmid*4); - return reg & ATC_VMID_PASID_MAPPING_VALID_MASK; -} - -static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid) -{ - uint32_t reg; - struct radeon_device *rdev = (struct radeon_device *) kgd; - - reg = RREG32(ATC_VMID0_PASID_MAPPING + vmid*4); - return reg & ATC_VMID_PASID_MAPPING_PASID_MASK; -} - -static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -{ - struct radeon_device *rdev = (struct radeon_device *) kgd; - - return WREG32(VM_INVALIDATE_REQUEST, 1 << vmid); -} - -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) -{ - struct radeon_device *rdev = (struct radeon_device *) kgd; - const union radeon_firmware_header *hdr; - - BUG_ON(kgd == NULL || rdev->mec_fw == NULL); - - switch (type) { - case KGD_ENGINE_PFP: - hdr = (const union radeon_firmware_header *) rdev->pfp_fw->data; - break; - - case KGD_ENGINE_ME: - hdr = (const union radeon_firmware_header *) rdev->me_fw->data; - break; - - case KGD_ENGINE_CE: - hdr = (const union radeon_firmware_header *) rdev->ce_fw->data; - break; - - case KGD_ENGINE_MEC1: - hdr = (const union radeon_firmware_header *) rdev->mec_fw->data; - break; - - case KGD_ENGINE_MEC2: - hdr = (const union radeon_firmware_header *) - rdev->mec2_fw->data; - break; - - case KGD_ENGINE_RLC: - hdr = (const union radeon_firmware_header *) rdev->rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: - case KGD_ENGINE_SDMA2: - hdr = (const union radeon_firmware_header *) - rdev->sdma_fw->data; - break; - - default: - return 0; - } - - if (hdr == NULL) - return 0; - - /* Only 12 bit in use*/ - return hdr->common.ucode_version; -} diff --git a/drivers/gpu/drm/radeon/radeon_kfd.h b/drivers/gpu/drm/radeon/radeon_kfd.h deleted file mode 100644 index 9df1fea8e971..000000000000 --- a/drivers/gpu/drm/radeon/radeon_kfd.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * radeon_kfd.h defines the private interface between the - * AMD kernel graphics drivers and the AMD KFD. - */ - -#ifndef RADEON_KFD_H_INCLUDED -#define RADEON_KFD_H_INCLUDED - -#include -#include "kgd_kfd_interface.h" - -struct radeon_device; - -int radeon_kfd_init(void); -void radeon_kfd_fini(void); - -void radeon_kfd_suspend(struct radeon_device *rdev); -int radeon_kfd_resume(struct radeon_device *rdev); -void radeon_kfd_interrupt(struct radeon_device *rdev, - const void *ih_ring_entry); -void radeon_kfd_device_probe(struct radeon_device *rdev); -void radeon_kfd_device_init(struct radeon_device *rdev); -void radeon_kfd_device_fini(struct radeon_device *rdev); - -#endif /* RADEON_KFD_H_INCLUDED */ diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index dfee8f7d94ae..cde037f213d7 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -34,8 +34,6 @@ #include #include -#include "radeon_kfd.h" - #if defined(CONFIG_VGA_SWITCHEROO) bool radeon_has_atpx(void); #else @@ -68,8 +66,6 @@ void radeon_driver_unload_kms(struct drm_device *dev) pm_runtime_forbid(dev->dev); } - radeon_kfd_device_fini(rdev); - radeon_acpi_fini(rdev); radeon_modeset_fini(rdev); @@ -174,9 +170,6 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) "Error during ACPI methods call\n"); } - radeon_kfd_device_probe(rdev); - radeon_kfd_device_init(rdev); - if (radeon_is_px(dev)) { pm_runtime_use_autosuspend(dev->dev); pm_runtime_set_autosuspend_delay(dev->dev, 5000); From ab40cba30333cc264cf2731626565c3e1f29e4d1 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 1 Nov 2017 19:21:26 -0400 Subject: [PATCH 19/26] drm/amdkfd: Clean up the data structure in kfd_process A list of per-process queues is maintained in the kfd_process_queue_manager, so the queues array in kfd_process is redundant and in fact unused. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 ------ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 18 ------------------ 2 files changed, 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 6a91a60c64cf..78b5d61780cb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -527,12 +527,6 @@ struct kfd_process { struct process_queue_manager pqm; - /* The process's queues. */ - size_t queue_array_size; - - /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ - struct kfd_queue **queues; - /*Is the user space process 32 bit?*/ bool is_32bit_user_mode; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 695fa2ae8e5b..946f4d68947f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -34,13 +34,6 @@ struct mm_struct; #include "kfd_priv.h" #include "kfd_dbgmgr.h" -/* - * Initial size for the array of queues. - * The allocated size is doubled each time - * it is exceeded up to MAX_PROCESS_QUEUES. - */ -#define INITIAL_QUEUE_ARRAY_SIZE 16 - /* * List of struct kfd_process (field kfd_process). * Unique/indexed by mm_struct* @@ -187,8 +180,6 @@ static void kfd_process_wq_release(struct work_struct *work) mutex_destroy(&p->mutex); - kfree(p->queues); - kfree(p); kfree(work); @@ -270,11 +261,6 @@ static struct kfd_process *create_process(const struct task_struct *thread) if (!process) goto err_alloc_process; - process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, - sizeof(process->queues[0]), GFP_KERNEL); - if (!process->queues) - goto err_alloc_queues; - process->pasid = kfd_pasid_alloc(); if (process->pasid == 0) goto err_alloc_pasid; @@ -297,8 +283,6 @@ static struct kfd_process *create_process(const struct task_struct *thread) process->lead_thread = thread->group_leader; - process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; - INIT_LIST_HEAD(&process->per_device_data); kfd_event_init_process(process); @@ -327,8 +311,6 @@ err_mmu_notifier: err_alloc_doorbells: kfd_pasid_free(process->pasid); err_alloc_pasid: - kfree(process->queues); -err_alloc_queues: kfree(process); err_alloc_process: return ERR_PTR(err); From bba9662db79cb21c532bff2e83843037a616ed86 Mon Sep 17 00:00:00 2001 From: Jay Cornwall Date: Wed, 1 Nov 2017 19:21:27 -0400 Subject: [PATCH 20/26] drm/amdkfd: Disable CP/SDMA ring/doorbell in MQD The MQD represents an inactive context and should not have ring or doorbell enable bits set. Doing so interferes with HWS which streams the MQD onto the HQD. If enable bits are set this activates the ring or doorbell before the HQD is fully configured. Signed-off-by: Jay Cornwall Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 34 +++++-------------- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 7 ++-- 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 44ffd23348fc..4859d263fa2a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -189,12 +189,9 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_pq_control |= NO_UPDATE_RPTR; - q->is_active = false; - if (q->queue_size > 0 && + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { - q->is_active = true; - } + q->queue_percent > 0); return 0; } @@ -215,24 +212,17 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->sdma_rlc_doorbell = q->doorbell_off << - SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | - 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; + m->sdma_rlc_doorbell = + q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; m->sdma_rlc_virtual_addr = q->sdma_vm_addr; m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; - q->is_active = false; - if (q->queue_size > 0 && + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { - m->sdma_rlc_rb_cntl |= - 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; - - q->is_active = true; - } + q->queue_percent > 0); return 0; } @@ -359,19 +349,13 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_doorbell_control = DOORBELL_EN | - DOORBELL_OFFSET(q->doorbell_off); + m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); m->cp_hqd_vmid = q->vmid; - m->cp_hqd_active = 0; - q->is_active = false; - if (q->queue_size > 0 && + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { - m->cp_hqd_active = 1; - q->is_active = true; - } + q->queue_percent > 0); return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 73cbfe186dd2..4ea854f9007b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -163,12 +163,9 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; } - q->is_active = false; - if (q->queue_size > 0 && + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { - q->is_active = true; - } + q->queue_percent > 0); return 0; } From e2a8e99964ded3a156137c1a02832493ee727721 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 1 Nov 2017 19:21:28 -0400 Subject: [PATCH 21/26] drm/amdkfd: Avoid calling amd_iommu_unbind_pasid() when suspending When kfd suspending on APU, we do not need to call amd_iommu_unbind_pasid(), because pasid will be unbound automatically when power goes off. On the other hand, calling amd_iommu_unbind_pasid() will trigger kfd_process_iommu_unbind_callback() if the process is not terminating. By design, kfd_process_iommu_unbind_callback() should only be called for process terminating. So we would rather not to call amd_iommu_unbind_pasid() when suspending. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 946f4d68947f..b81ad8164da6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -423,29 +423,25 @@ int kfd_bind_processes_to_device(struct kfd_dev *dev) } /* - * Temporarily unbind currently bound processes from the device and - * mark them as PDD_BOUND_SUSPENDED. These processes will be restored - * to PDD_BOUND state in kfd_bind_processes_to_device. + * Mark currently bound processes as PDD_BOUND_SUSPENDED. These + * processes will be restored to PDD_BOUND state in + * kfd_bind_processes_to_device. */ void kfd_unbind_processes_from_device(struct kfd_dev *dev) { struct kfd_process_device *pdd; struct kfd_process *p; - unsigned int temp, temp_bound, temp_pasid; + unsigned int temp; int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { mutex_lock(&p->mutex); pdd = kfd_get_process_device_data(dev, p); - temp_bound = pdd->bound; - temp_pasid = p->pasid; + if (pdd->bound == PDD_BOUND) pdd->bound = PDD_BOUND_SUSPENDED; mutex_unlock(&p->mutex); - - if (temp_bound == PDD_BOUND) - amd_iommu_unbind_pasid(dev->pdev, temp_pasid); } srcu_read_unlock(&kfd_processes_srcu, idx); From 062c5672d5f66963f6c87249a38226d0e70b1c4b Mon Sep 17 00:00:00 2001 From: Yair Shachar Date: Wed, 1 Nov 2017 19:21:29 -0400 Subject: [PATCH 22/26] drm/amdkfd: Fix debug unregister procedure on process termination Take the dbgmgr lock and unregister before destroying the debug manager. Do this before destroying the queues. v2: Correct locking order in kfd_ioctl_dbg_register to ake sure the process mutex and dbgmgr mutex are always taken in the same order. Signed-off-by: Yair Shachar Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 +-- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 37 +++++++++++++++++------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index a25321ff448f..505d39156acd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -450,8 +450,8 @@ static int kfd_ioctl_dbg_register(struct file *filep, return -EINVAL; } - mutex_lock(kfd_get_dbgmgr_mutex()); mutex_lock(&p->mutex); + mutex_lock(kfd_get_dbgmgr_mutex()); /* * make sure that we have pdd, if this the first queue created for @@ -479,8 +479,8 @@ static int kfd_ioctl_dbg_register(struct file *filep, } out: - mutex_unlock(&p->mutex); mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_unlock(&p->mutex); return status; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index b81ad8164da6..db08f8f53d4b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -224,17 +224,26 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, mutex_lock(&p->mutex); + /* Iterate over all process device data structures and if the + * pdd is in debug mode, we should first force unregistration, + * then we will be able to destroy the queues + */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + struct kfd_dev *dev = pdd->dev; + + mutex_lock(kfd_get_dbgmgr_mutex()); + if (dev && dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { + if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + mutex_unlock(kfd_get_dbgmgr_mutex()); + } + kfd_process_dequeue_from_all_devices(p); pqm_uninit(&p->pqm); - /* Iterate over all process device data structure and check - * if we should delete debug managers - */ - list_for_each_entry(pdd, &p->per_device_data, per_device_list) - if ((pdd->dev->dbgmgr) && - (pdd->dev->dbgmgr->pasid == p->pasid)) - kfd_dbgmgr_destroy(pdd->dev->dbgmgr); - mutex_unlock(&p->mutex); /* @@ -463,8 +472,16 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) pr_debug("Unbinding process %d from IOMMU\n", pasid); - if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) - kfd_dbgmgr_destroy(dev->dbgmgr); + mutex_lock(kfd_get_dbgmgr_mutex()); + + if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { + if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + + mutex_unlock(kfd_get_dbgmgr_mutex()); pdd = kfd_get_process_device_data(dev, p); if (pdd) From 5a29ad6b9e0899cd17988895fa64fd3e9408ab8e Mon Sep 17 00:00:00 2001 From: Ben Goz Date: Wed, 1 Nov 2017 19:21:30 -0400 Subject: [PATCH 23/26] drm/amdkfd: Register/Deregister process on qpd resolution Process registration needs to happen on each device. So use per-device queue lists to determine when to register/deregister the process. Signed-off-by: Ben Goz Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 5129dc139219..2bec902fc939 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -177,7 +177,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, if (retval != 0) return retval; - if (list_empty(&pqm->queues)) { + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) { pdd->qpd.pqm = pqm; dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); } @@ -248,7 +249,8 @@ err_create_queue: err_allocate_pqn: /* check if queues list is empty unregister process from device */ clear_bit(*qid, pqm->queue_slot_bitmap); - if (list_empty(&pqm->queues)) + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); return retval; } @@ -302,7 +304,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) kfree(pqn); clear_bit(qid, pqm->queue_slot_bitmap); - if (list_empty(&pqm->queues)) + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) dqm->ops.unregister_process(dqm, &pdd->qpd); return retval; From bfd5e378a98d0387b45a4864528a11b65d038f0c Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 1 Nov 2017 19:21:31 -0400 Subject: [PATCH 24/26] drm/amdkfd: Cleanup DQM ASIC-specific ops Remove empty initialize function. Rename register_process to update_qpd to avoid confusion with the non-ASIC-specific register_process. Shorten ops_asic_specific to asic_ops. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 19 +++++++----------- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 11 +++++----- .../amd/amdkfd/kfd_device_queue_manager_cik.c | 20 +++++++------------ .../amd/amdkfd/kfd_device_queue_manager_vi.c | 20 +++++++------------ 4 files changed, 27 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index da3b74315acf..45b98dd5b785 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -467,7 +467,7 @@ static int register_process(struct device_queue_manager *dqm, mutex_lock(&dqm->lock); list_add(&n->list, &dqm->queues); - retval = dqm->ops_asic_specific.register_process(dqm, qpd); + retval = dqm->asic_ops.update_qpd(dqm, qpd); dqm->processes_count++; @@ -629,7 +629,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); - dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) @@ -696,8 +696,6 @@ static int set_sched_resources(struct device_queue_manager *dqm) static int initialize_cpsch(struct device_queue_manager *dqm) { - int retval; - pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); mutex_init(&dqm->lock); @@ -706,11 +704,8 @@ static int initialize_cpsch(struct device_queue_manager *dqm) dqm->sdma_queue_count = 0; dqm->active_runlist = false; dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; - retval = dqm->ops_asic_specific.initialize(dqm); - if (retval) - mutex_destroy(&dqm->lock); - return retval; + return 0; } static int start_cpsch(struct device_queue_manager *dqm) @@ -850,7 +845,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, goto out; } - dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) @@ -1095,7 +1090,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, qpd->sh_mem_ape1_limit = limit >> 16; } - retval = dqm->ops_asic_specific.set_cache_memory_policy( + retval = dqm->asic_ops.set_cache_memory_policy( dqm, qpd, default_policy, @@ -1270,11 +1265,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) switch (dev->device_info->asic_family) { case CHIP_CARRIZO: - device_queue_manager_init_vi(&dqm->ops_asic_specific); + device_queue_manager_init_vi(&dqm->asic_ops); break; case CHIP_KAVERI: - device_queue_manager_init_cik(&dqm->ops_asic_specific); + device_queue_manager_init_cik(&dqm->asic_ops); break; default: WARN(1, "Unexpected ASIC family %u", diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 31c2b1f9d320..5b77cb69f732 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -128,9 +128,8 @@ struct device_queue_manager_ops { }; struct device_queue_manager_asic_ops { - int (*register_process)(struct device_queue_manager *dqm, + int (*update_qpd)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); - int (*initialize)(struct device_queue_manager *dqm); bool (*set_cache_memory_policy)(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, @@ -156,7 +155,7 @@ struct device_queue_manager_asic_ops { struct device_queue_manager { struct device_queue_manager_ops ops; - struct device_queue_manager_asic_ops ops_asic_specific; + struct device_queue_manager_asic_ops asic_ops; struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; struct packet_manager packets; @@ -179,8 +178,10 @@ struct device_queue_manager { bool active_runlist; }; -void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); -void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); +void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_queues_num(struct device_queue_manager *dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index 72c3cbabc0a7..28e48c90c596 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -32,18 +32,17 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); -static int register_process_cik(struct device_queue_manager *dqm, +static int update_qpd_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd); -static int initialize_cpsch_cik(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops) +void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops) { - ops->set_cache_memory_policy = set_cache_memory_policy_cik; - ops->register_process = register_process_cik; - ops->initialize = initialize_cpsch_cik; - ops->init_sdma_vm = init_sdma_vm; + asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; + asic_ops->update_qpd = update_qpd_cik; + asic_ops->init_sdma_vm = init_sdma_vm; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) @@ -99,7 +98,7 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, return true; } -static int register_process_cik(struct device_queue_manager *dqm, +static int update_qpd_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct kfd_process_device *pdd; @@ -148,8 +147,3 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } - -static int initialize_cpsch_cik(struct device_queue_manager *dqm) -{ - return 0; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index 40e9ddd096cd..2fbce57a2f21 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -33,18 +33,17 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); -static int register_process_vi(struct device_queue_manager *dqm, +static int update_qpd_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd); -static int initialize_cpsch_vi(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops) { - ops->set_cache_memory_policy = set_cache_memory_policy_vi; - ops->register_process = register_process_vi; - ops->initialize = initialize_cpsch_vi; - ops->init_sdma_vm = init_sdma_vm; + asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; + asic_ops->update_qpd = update_qpd_vi; + asic_ops->init_sdma_vm = init_sdma_vm; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) @@ -104,7 +103,7 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, return true; } -static int register_process_vi(struct device_queue_manager *dqm, +static int update_qpd_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct kfd_process_device *pdd; @@ -160,8 +159,3 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } - -static int initialize_cpsch_vi(struct device_queue_manager *dqm) -{ - return 0; -} From 096d1a3efc8b0914a4cfb1203c147ed597907191 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 1 Nov 2017 19:21:32 -0400 Subject: [PATCH 25/26] drm/amdkfd: Update queue_count before mapping queues map_queues_cpsch uses the queue_count to decide whether to upload a new runlist. So update the counter before calling it. Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 45b98dd5b785..e2fc4c5d42cd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -408,6 +408,17 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = mqd->update_mqd(mqd, q->mqd, &q->properties); + /* + * check active state vs. the previous state and modify + * counter accordingly. map_queues_cpsch uses the + * dqm->queue_count to determine whether a new runlist must be + * uploaded. + */ + if (q->properties.is_active && !prev_active) + dqm->queue_count++; + else if (!q->properties.is_active && prev_active) + dqm->queue_count--; + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) retval = map_queues_cpsch(dqm); else if (sched_policy == KFD_SCHED_POLICY_NO_HWS && @@ -417,15 +428,6 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, q->process->mm); - /* - * check active state vs. the previous state - * and modify counter accordingly - */ - if (q->properties.is_active && !prev_active) - dqm->queue_count++; - else if (!q->properties.is_active && prev_active) - dqm->queue_count--; - out_unlock: mutex_unlock(&dqm->lock); return retval; From 894a8293aaa702a5aef758bc069162a671ca7a07 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 1 Nov 2017 19:21:33 -0400 Subject: [PATCH 26/26] drm/amdkfd: Minor cleanups These were missed previously when rebasing changes for upstreaming. v2: Remove redundant sched_policy conditions Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 ++++------ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index e2fc4c5d42cd..e202921c150e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -389,12 +389,11 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) if (sched_policy != KFD_SCHED_POLICY_NO_HWS) { retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); - if (retval != 0) { + if (retval) { pr_err("unmap queue failed\n"); goto out_unlock; } - } else if (sched_policy == KFD_SCHED_POLICY_NO_HWS && - prev_active && + } else if (prev_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || q->properties.type == KFD_QUEUE_TYPE_SDMA)) { retval = mqd->destroy_mqd(mqd, q->mqd, @@ -421,8 +420,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) if (sched_policy != KFD_SCHED_POLICY_NO_HWS) retval = map_queues_cpsch(dqm); - else if (sched_policy == KFD_SCHED_POLICY_NO_HWS && - q->properties.is_active && + else if (q->properties.is_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || q->properties.type == KFD_QUEUE_TYPE_SDMA)) retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, @@ -832,7 +830,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { retval = allocate_sdma_queue(dqm, &q->sdma_id); - if (retval != 0) + if (retval) goto out; q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 78b5d61780cb..9e4134c5b481 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -507,6 +507,8 @@ struct kfd_process { * In any process, the thread that started main() is the lead * thread and outlives the rest. * It is here because amd_iommu_bind_pasid wants a task_struct. + * It can also be used for safely getting a reference to the + * mm_struct of the process. */ struct task_struct *lead_thread; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index db08f8f53d4b..1f5ccd28bd41 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -416,7 +416,7 @@ int kfd_bind_processes_to_device(struct kfd_dev *dev) err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); if (err < 0) { - pr_err("unexpected pasid %d binding failure\n", + pr_err("Unexpected pasid %d binding failure\n", p->pasid); mutex_unlock(&p->mutex); break;