drm/amdgpu: add instance mask for RAS inject

User can specify injected instances by the mask. For backward
compatibility, the mask value is incorporated into sub block index
without interface change of RAS TA.
User uses logical mask and driver should convert it to physical value
before sending it to RAS TA.

v2: update parameter name.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tao Zhou 2023-02-27 18:25:23 +08:00 committed by Alex Deucher
parent af2ba36883
commit 2c22ed0bdb
8 changed files with 56 additions and 19 deletions

View File

@ -1672,14 +1672,33 @@ int psp_ras_initialize(struct psp_context *psp)
}
int psp_ras_trigger_error(struct psp_context *psp,
struct ta_ras_trigger_error_input *info)
struct ta_ras_trigger_error_input *info, uint32_t instance_mask)
{
struct ta_ras_shared_memory *ras_cmd;
struct amdgpu_device *adev = psp->adev;
int ret;
uint32_t dev_mask;
if (!psp->ras_context.context.initialized)
return -EINVAL;
switch (info->block_id) {
case TA_RAS_BLOCK__GFX:
dev_mask = GET_MASK(GC, instance_mask);
break;
case TA_RAS_BLOCK__SDMA:
dev_mask = GET_MASK(SDMA0, instance_mask);
break;
default:
dev_mask = instance_mask;
break;
}
/* reuse sub_block_index for backward compatibility */
dev_mask <<= AMDGPU_RAS_INST_SHIFT;
dev_mask &= AMDGPU_RAS_INST_MASK;
info->sub_block_index |= dev_mask;
ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));

View File

@ -486,7 +486,7 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
int psp_ras_enable_features(struct psp_context *psp,
union ta_ras_cmd_input *info, bool enable);
int psp_ras_trigger_error(struct psp_context *psp,
struct ta_ras_trigger_error_input *info);
struct ta_ras_trigger_error_input *info, uint32_t instance_mask);
int psp_ras_terminate(struct psp_context *psp);
int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id);

View File

@ -256,6 +256,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
int block_id;
uint32_t sub_block;
u64 address, value;
/* default value is 0 if the mask is not set by user */
u32 instance_mask = 0;
if (*pos)
return -EINVAL;
@ -306,7 +308,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
data->op = op;
if (op == 2) {
if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
&sub_block, &address, &value, &instance_mask) != 4 &&
sscanf(str, "%*s %*s %*s %u %llu %llu %u",
&sub_block, &address, &value, &instance_mask) != 4 &&
sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
&sub_block, &address, &value) != 3 &&
sscanf(str, "%*s %*s %*s %u %llu %llu",
&sub_block, &address, &value) != 3)
@ -314,6 +320,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
data->head.sub_block_index = sub_block;
data->inject.address = address;
data->inject.value = value;
data->inject.instance_mask = instance_mask;
}
} else {
if (size < sizeof(*data))
@ -341,7 +348,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* sub_block_index: some IPs have subcomponets. say, GFX, sDMA.
* name: the name of IP.
*
* inject has two more members than head, they are address, value.
* inject has three more members than head, they are address, value and mask.
* As their names indicate, inject operation will write the
* value to the address.
*
@ -365,7 +372,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
*
* echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
* echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
* echo "inject <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
* echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
*
* Where N, is the card which you want to affect.
*
@ -382,13 +389,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
*
* The sub-block is a the sub-block index, pass 0 if there is no sub-block.
* The address and value are hexadecimal numbers, leading 0x is optional.
* The mask means instance mask, is optional, default value is 0x1.
*
* For instance,
*
* .. code-block:: bash
*
* echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
* echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
* echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
* echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
*
* How to check the result of the operation?
@ -1117,13 +1125,14 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
if (block_obj->hw_ops->ras_error_inject)
ret = block_obj->hw_ops->ras_error_inject(adev, info);
ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
} else {
/* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
if (block_obj->hw_ops->ras_error_inject)
ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
info->instance_mask);
else /*If not defined .ras_error_inject, use default ras_error_inject*/
ret = psp_ras_trigger_error(&adev->psp, &block_info);
ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
}
if (ret)

View File

@ -32,6 +32,11 @@
struct amdgpu_iv_entry;
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0)
/* position of instance value in sub_block_index of
* ta_ras_trigger_error_input, the sub block uses lower 12 bits
*/
#define AMDGPU_RAS_INST_MASK 0xfffff000
#define AMDGPU_RAS_INST_SHIFT 0xc
enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
@ -508,6 +513,7 @@ struct ras_inject_if {
struct ras_common_if head;
uint64_t address;
uint64_t value;
uint32_t instance_mask;
};
struct ras_cure_if {
@ -545,7 +551,8 @@ struct amdgpu_ras_block_object {
};
struct amdgpu_ras_block_hw_ops {
int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
int (*ras_error_inject)(struct amdgpu_device *adev,
void *inject_if, uint32_t instance_mask);
void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status);
void (*query_ras_error_status)(struct amdgpu_device *adev);
void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);

View File

@ -1014,7 +1014,8 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
}
/* Trigger XGMI/WAFL error */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if)
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
void *inject_if, uint32_t instance_mask)
{
int ret = 0;
struct ta_ras_trigger_error_input *block_info =
@ -1026,7 +1027,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *injec
if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
dev_warn(adev->dev, "Failed to disallow XGMI power down");
ret = psp_ras_trigger_error(&adev->psp, block_info);
ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);
if (amdgpu_ras_intr_triggered())
return ret;

View File

@ -770,7 +770,7 @@ static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status);
static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
void *inject_if);
void *inject_if, uint32_t instance_mask);
static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev);
static void gfx_v9_0_kiq_set_resources(struct amdgpu_ring *kiq_ring,
@ -6335,7 +6335,7 @@ static const struct soc15_ras_field_entry gfx_v9_0_ras_fields[] = {
};
static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
void *inject_if)
void *inject_if, uint32_t instance_mask)
{
struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
int ret;
@ -6374,7 +6374,7 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
block_info.value = info->value;
mutex_lock(&adev->grbm_idx_mutex);
ret = psp_ras_trigger_error(&adev->psp, &block_info);
ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
mutex_unlock(&adev->grbm_idx_mutex);
return ret;

View File

@ -971,7 +971,7 @@ static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
}
static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
void *inject_if)
void *inject_if, uint32_t instance_mask)
{
struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
int ret;
@ -987,7 +987,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
block_info.value = info->value;
mutex_lock(&adev->grbm_idx_mutex);
ret = psp_ras_trigger_error(&adev->psp, &block_info);
ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
mutex_unlock(&adev->grbm_idx_mutex);
return ret;

View File

@ -1699,7 +1699,8 @@ static void gfx_v9_4_2_reset_ras_error_count(struct amdgpu_device *adev)
gfx_v9_4_2_query_utc_edc_count(adev, NULL, NULL);
}
static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev,
void *inject_if, uint32_t instance_mask)
{
struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
int ret;
@ -1715,7 +1716,7 @@ static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_
block_info.value = info->value;
mutex_lock(&adev->grbm_idx_mutex);
ret = psp_ras_trigger_error(&adev->psp, &block_info);
ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
mutex_unlock(&adev->grbm_idx_mutex);
return ret;