habanalabs: create internal CB pool

Create a device MMU-mapped internal command buffer pool, to allow the
driver to allocate CBs for the signal/wait operations that are fetched
by the queues when those queues are configured with the user's address
space ID.

We must pre-map this internal pool once at initialization, because
mapping each CB at allocation time would be too costly.

This pool is needed for future ASIC support and it is currently unused in
GOYA and GAUDI.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Ofir Bitton 2020-07-13 13:36:55 +03:00, committed by Oded Gabbay
parent eb8b293e79
commit a04b7cd97e
6 changed files with 104 additions and 51 deletions
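Note: this patch adds only the pool fields and the allocation/free paths; the ASIC-specific code that actually creates the pool and pre-maps it into the device MMU is left to the future ASIC support mentioned above. As a rough sketch of what that initialization could look like with the genalloc API the patch pulls in (the example_* names and the pool size are assumptions for illustration, not code from this commit):

/* Illustrative sketch only; example_* names and sizes are hypothetical. */
#include <linux/genalloc.h>
#include <linux/log2.h>
#include <linux/sizes.h>

#define EXAMPLE_INTERNAL_CB_POOL_SIZE	SZ_2M	/* assumed pool size */

static int example_internal_cb_pool_init(struct hl_device *hdev)
{
	int rc;

	/* One coherent DMA allocation backs the whole pool. */
	hdev->internal_cb_pool_virt_addr =
		hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
				EXAMPLE_INTERNAL_CB_POOL_SIZE,
				&hdev->internal_cb_pool_dma_addr,
				GFP_KERNEL | __GFP_ZERO);
	if (!hdev->internal_cb_pool_virt_addr)
		return -ENOMEM;

	/* Let genalloc carve it into CB-sized chunks (128B granularity). */
	hdev->internal_cb_pool = gen_pool_create(ilog2(128), -1);
	if (!hdev->internal_cb_pool) {
		rc = -ENOMEM;
		goto free_mem;
	}

	rc = gen_pool_add(hdev->internal_cb_pool,
			(uintptr_t) hdev->internal_cb_pool_virt_addr,
			EXAMPLE_INTERNAL_CB_POOL_SIZE, -1);
	if (rc)
		goto destroy_pool;

	/*
	 * The one-time device MMU mapping would go here, setting
	 * hdev->internal_cb_va_base to the pool's device VA, so that
	 * per-CB allocations need no further mapping. That is the
	 * performance point made in the commit message.
	 */
	return 0;

destroy_pool:
	gen_pool_destroy(hdev->internal_cb_pool);
free_mem:
	hdev->asic_funcs->asic_dma_free_coherent(hdev,
			EXAMPLE_INTERNAL_CB_POOL_SIZE,
			hdev->internal_cb_pool_virt_addr,
			hdev->internal_cb_pool_dma_addr);
	return rc;
}

gen_pool_alloc()/gen_pool_free() then hand out chunks of that single pre-mapped region, which is exactly what the new paths in cb_fini() and hl_cb_alloc() below rely on.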

drivers/misc/habanalabs/command_buffer.c

@@ -10,12 +10,18 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/genalloc.h>

 static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
 {
-	hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size,
-			(void *) (uintptr_t) cb->kernel_address,
-			cb->bus_address);
+	if (cb->is_internal)
+		gen_pool_free(hdev->internal_cb_pool,
+				cb->kernel_address, cb->size);
+	else
+		hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size,
+				(void *) (uintptr_t) cb->kernel_address,
+				cb->bus_address);
+
 	kfree(cb);
 }
@@ -44,9 +50,10 @@ static void cb_release(struct kref *ref)
 }

 static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
-					int ctx_id)
+					int ctx_id, bool internal_cb)
 {
 	struct hl_cb *cb;
+	u32 cb_offset;
 	void *p;

 	/*
@@ -65,13 +72,25 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
 	if (!cb)
 		return NULL;

-	if (ctx_id == HL_KERNEL_ASID_ID)
+	if (internal_cb) {
+		p = (void *) gen_pool_alloc(hdev->internal_cb_pool, cb_size);
+		if (!p) {
+			kfree(cb);
+			return NULL;
+		}
+
+		cb_offset = p - hdev->internal_cb_pool_virt_addr;
+		cb->is_internal = true;
+		cb->bus_address = hdev->internal_cb_va_base + cb_offset;
+	} else if (ctx_id == HL_KERNEL_ASID_ID) {
 		p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size,
 						&cb->bus_address, GFP_ATOMIC);
-	else
+	} else {
 		p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size,
 						&cb->bus_address,
 						GFP_USER | __GFP_ZERO);
+	}
+
 	if (!p) {
 		dev_err(hdev->dev,
 			"failed to allocate %d of dma memory for CB\n",
@@ -87,7 +106,7 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
 }

 int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
-			u32 cb_size, u64 *handle, int ctx_id)
+			u32 cb_size, u64 *handle, int ctx_id, bool internal_cb)
 {
 	struct hl_cb *cb;
 	bool alloc_new_cb = true;
@@ -112,6 +131,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 		goto out_err;
 	}

+	if (!internal_cb) {
 	/* Minimum allocation must be PAGE SIZE */
 	if (cb_size < PAGE_SIZE)
 		cb_size = PAGE_SIZE;
@@ -121,8 +141,8 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 		spin_lock(&hdev->cb_pool_lock);
 		if (!list_empty(&hdev->cb_pool)) {
-			cb = list_first_entry(&hdev->cb_pool, typeof(*cb),
-					pool_list);
+			cb = list_first_entry(&hdev->cb_pool,
+					typeof(*cb), pool_list);
 			list_del(&cb->pool_list);
 			spin_unlock(&hdev->cb_pool_lock);
 			alloc_new_cb = false;
@@ -131,9 +151,10 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 			dev_dbg(hdev->dev, "CB pool is empty\n");
 		}
 	}
+	}

 	if (alloc_new_cb) {
-		cb = hl_cb_alloc(hdev, cb_size, ctx_id);
+		cb = hl_cb_alloc(hdev, cb_size, ctx_id, internal_cb);
 		if (!cb) {
 			rc = -ENOMEM;
 			goto out_err;
@@ -230,7 +251,7 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 	} else {
 		rc = hl_cb_create(hdev, &hpriv->cb_mgr,
 					args->in.cb_size, &handle,
-					hpriv->ctx->asid);
+					hpriv->ctx->asid, false);
 	}

 	memset(args, 0, sizeof(*args));
@@ -398,14 +419,15 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
 	idr_destroy(&mgr->cb_handles);
 }

-struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size)
+struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
+					bool internal_cb)
 {
 	u64 cb_handle;
 	struct hl_cb *cb;
 	int rc;

 	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
-			HL_KERNEL_ASID_ID);
+			HL_KERNEL_ASID_ID, internal_cb);
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to allocate CB for the kernel driver %d\n", rc);
@@ -437,7 +459,7 @@ int hl_cb_pool_init(struct hl_device *hdev)
 	for (i = 0 ; i < hdev->asic_prop.cb_pool_cb_cnt ; i++) {
 		cb = hl_cb_alloc(hdev, hdev->asic_prop.cb_pool_cb_size,
-				HL_KERNEL_ASID_ID);
+				HL_KERNEL_ASID_ID, false);
 		if (cb) {
 			cb->is_pool = true;
 			list_add(&cb->pool_list, &hdev->cb_pool);

drivers/misc/habanalabs/command_submission.c

@@ -919,7 +919,13 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 		goto put_cs;
 	}

-	cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+	if (cs->type == CS_TYPE_WAIT)
+		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
+	else
+		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
+
+	cb = hl_cb_kernel_create(hdev, cb_size,
+			q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
 	if (!cb) {
 		ctx->cs_counters.out_of_mem_drop_cnt++;
 		kfree(job);
@@ -927,11 +933,6 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 		goto put_cs;
 	}

-	if (cs->type == CS_TYPE_WAIT)
-		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
-	else
-		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
-
 	job->id = 0;
 	job->cs = cs;
 	job->user_cb = cb;

drivers/misc/habanalabs/context.c

@@ -153,10 +153,18 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 			rc = -ENOMEM;
 			goto mem_ctx_err;
 		}
+
+		rc = hdev->asic_funcs->ctx_init(ctx);
+		if (rc) {
+			dev_err(hdev->dev, "ctx_init failed\n");
+			goto ctx_init_err;
+		}
 	}

 	return 0;

+ctx_init_err:
+	hl_vm_ctx_fini(ctx);
 mem_ctx_err:
 	if (ctx->asid != HL_KERNEL_ASID_ID)
 		hl_asid_free(hdev, ctx->asid);
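The new ctx_init hook gives a future ASIC a natural place to expose the pre-mapped pool to each user context. A hypothetical sketch: the future_asic_* names and the pool-size constant are invented, and it assumes the driver's four-argument hl_mmu_map() of this era:

/* Hypothetical: map the internal CB pool into a new context's ASID. */
static int future_asic_ctx_init(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;
	u64 off;
	int rc = 0;

	/* The kernel context does not fetch CBs through a user ASID. */
	if (ctx->asid == HL_KERNEL_ASID_ID)
		return 0;

	mutex_lock(&ctx->mmu_lock);
	for (off = 0 ; off < FUTURE_ASIC_INTERNAL_CB_POOL_SIZE ;
							off += PAGE_SIZE) {
		rc = hl_mmu_map(ctx, hdev->internal_cb_va_base + off,
				hdev->internal_cb_pool_dma_addr + off,
				PAGE_SIZE);
		if (rc)
			break;
	}
	mutex_unlock(&ctx->mmu_lock);

	return rc;
}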

drivers/misc/habanalabs/habanalabs.h

@@ -392,6 +392,7 @@ struct hl_cb_mgr {
  * @ctx_id: holds the ID of the owner's context.
  * @mmap: true if the CB is currently mmaped to user.
  * @is_pool: true if CB was acquired from the pool, false otherwise.
+ * @is_internal: internally allocated.
  */
 struct hl_cb {
 	struct kref refcount;

@@ -408,6 +409,7 @@ struct hl_cb {
 	u32 ctx_id;
 	u8 mmap;
 	u8 is_pool;
+	u8 is_internal;
 };
@@ -643,6 +645,7 @@ enum div_select_defs {
 * @rreg: Read a register. Needed for simulator support.
 * @wreg: Write a register. Needed for simulator support.
 * @halt_coresight: stop the ETF and ETR traces.
+ * @ctx_init: context dependent initialization.
 * @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz
 * @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index.
 * @read_device_fw_version: read the device's firmware versions that are

@@ -745,6 +748,7 @@ struct hl_asic_funcs {
 	u32 (*rreg)(struct hl_device *hdev, u32 reg);
 	void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
 	void (*halt_coresight)(struct hl_device *hdev);
+	int (*ctx_init)(struct hl_ctx *ctx);
 	int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
 	u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx);
 	void (*read_device_fw_version)(struct hl_device *hdev,
@@ -1432,6 +1436,10 @@ struct hl_device_idle_busy_ts {
 * @hl_debugfs: device's debugfs manager.
 * @cb_pool: list of preallocated CBs.
 * @cb_pool_lock: protects the CB pool.
+ * @internal_cb_pool_virt_addr: internal command buffer pool virtual address.
+ * @internal_cb_pool_dma_addr: internal command buffer pool DMA address.
+ * @internal_cb_pool: internal command buffer memory pool.
+ * @internal_cb_va_base: internal CB pool MMU virtual address base.
 * @fpriv_list: list of file private data structures. Each structure is created
 *              when a user opens the device
 * @fpriv_list_lock: protects the fpriv_list

@@ -1531,6 +1539,11 @@ struct hl_device {
 	struct list_head cb_pool;
 	spinlock_t cb_pool_lock;

+	void *internal_cb_pool_virt_addr;
+	dma_addr_t internal_cb_pool_dma_addr;
+	struct gen_pool *internal_cb_pool;
+	u64 internal_cb_va_base;
+
 	struct list_head fpriv_list;
 	struct mutex fpriv_list_lock;
@@ -1741,7 +1754,7 @@ int hl_hwmon_init(struct hl_device *hdev);
 void hl_hwmon_fini(struct hl_device *hdev);
 int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size,
-		u64 *handle, int ctx_id);
+		u64 *handle, int ctx_id, bool internal_cb);
 int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
 int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
 struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,

@@ -1749,7 +1762,8 @@ struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 void hl_cb_put(struct hl_cb *cb);
 void hl_cb_mgr_init(struct hl_cb_mgr *mgr);
 void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr);
-struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size);
+struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
+					bool internal_cb);
 int hl_cb_pool_init(struct hl_device *hdev);
 int hl_cb_pool_fini(struct hl_device *hdev);

drivers/misc/habanalabs/gaudi/gaudi.c

@@ -635,7 +635,7 @@ static int _gaudi_init_tpc_mem(struct hl_device *hdev,
 	u8 tpc_id;
 	int rc;

-	cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+	cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false);
 	if (!cb)
 		return -EFAULT;

@@ -4048,9 +4048,8 @@ static int gaudi_parse_cb_mmu(struct hl_device *hdev,
 	parser->patched_cb_size = parser->user_cb_size +
 			sizeof(struct packet_msg_prot) * 2;

-	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
-			parser->patched_cb_size,
-			&patched_cb_handle, HL_KERNEL_ASID_ID);
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
+			&patched_cb_handle, HL_KERNEL_ASID_ID, false);

 	if (rc) {
 		dev_err(hdev->dev,

@@ -4122,9 +4121,8 @@ static int gaudi_parse_cb_no_mmu(struct hl_device *hdev,
 	if (rc)
 		goto free_userptr;

-	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
-			parser->patched_cb_size,
-			&patched_cb_handle, HL_KERNEL_ASID_ID);
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
+			&patched_cb_handle, HL_KERNEL_ASID_ID, false);
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to allocate patched CB for DMA CS %d\n", rc);

@@ -4257,7 +4255,7 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
 	struct hl_cb *cb;
 	int rc;

-	cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+	cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false);
 	if (!cb)
 		return -EFAULT;

@@ -6229,6 +6227,11 @@ static enum hl_device_hw_state gaudi_get_hw_state(struct hl_device *hdev)
 	return RREG32(mmHW_STATE);
 }

+int gaudi_ctx_init(struct hl_ctx *ctx)
+{
+	return 0;
+}
+
 static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
 {
 	return gaudi_cq_assignment[cq_idx];

@@ -6532,6 +6535,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.rreg = hl_rreg,
 	.wreg = hl_wreg,
 	.halt_coresight = gaudi_halt_coresight,
+	.ctx_init = gaudi_ctx_init,
 	.get_clk_rate = gaudi_get_clk_rate,
 	.get_queue_id_for_cq = gaudi_get_queue_id_for_cq,
 	.read_device_fw_version = gaudi_read_device_fw_version,

drivers/misc/habanalabs/goya/goya.c

@@ -3771,9 +3771,8 @@ static int goya_parse_cb_mmu(struct hl_device *hdev,
 	parser->patched_cb_size = parser->user_cb_size +
 			sizeof(struct packet_msg_prot) * 2;

-	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
-			parser->patched_cb_size,
-			&patched_cb_handle, HL_KERNEL_ASID_ID);
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
+			&patched_cb_handle, HL_KERNEL_ASID_ID, false);

 	if (rc) {
 		dev_err(hdev->dev,

@@ -3845,9 +3844,8 @@ static int goya_parse_cb_no_mmu(struct hl_device *hdev,
 	if (rc)
 		goto free_userptr;

-	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
-			parser->patched_cb_size,
-			&patched_cb_handle, HL_KERNEL_ASID_ID);
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
+			&patched_cb_handle, HL_KERNEL_ASID_ID, false);
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to allocate patched CB for DMA CS %d\n", rc);

@@ -4693,7 +4691,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
 	lin_dma_pkts_cnt = DIV_ROUND_UP_ULL(size, SZ_2G);
 	cb_size = lin_dma_pkts_cnt * sizeof(struct packet_lin_dma) +
 					sizeof(struct packet_msg_prot);
-	cb = hl_cb_kernel_create(hdev, cb_size);
+	cb = hl_cb_kernel_create(hdev, cb_size, false);
 	if (!cb)
 		return -ENOMEM;

@@ -5223,6 +5221,11 @@ static enum hl_device_hw_state goya_get_hw_state(struct hl_device *hdev)
 	return RREG32(mmHW_STATE);
 }

+int goya_ctx_init(struct hl_ctx *ctx)
+{
+	return 0;
+}
+
 u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
 {
 	return cq_idx;

@@ -5336,6 +5339,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.rreg = hl_rreg,
 	.wreg = hl_wreg,
 	.halt_coresight = goya_halt_coresight,
+	.ctx_init = goya_ctx_init,
 	.get_clk_rate = goya_get_clk_rate,
 	.get_queue_id_for_cq = goya_get_queue_id_for_cq,
 	.read_device_fw_version = goya_read_device_fw_version,
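Both ASICs stub the hook out for now. An ASIC that does populate the pool would also need the symmetric teardown at device fini; roughly, reusing the hypothetical example_* helpers sketched earlier:

/* Sketch: teardown mirroring example_internal_cb_pool_init() above. */
static void example_internal_cb_pool_fini(struct hl_device *hdev)
{
	/* ... unmap internal_cb_va_base from the device MMU first ... */

	gen_pool_destroy(hdev->internal_cb_pool);

	hdev->asic_funcs->asic_dma_free_coherent(hdev,
			EXAMPLE_INTERNAL_CB_POOL_SIZE,
			hdev->internal_cb_pool_virt_addr,
			hdev->internal_cb_pool_dma_addr);
}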