drm/amdgpu: support check xgmi/walf error mask bit for aldebaran
The pcs error count should be determined by both the PCS ERROR STATUS and
the PCS ERROR MASK registers; the PCS ERROR STATUS register alone cannot
reflect the error counts accurately.

Changes since v1:
- drop clearing of the noncorrectable mask registers
- optimize the pcs error status query

Changes since v2:
- drop the mask_value bit check
- correctly clear the corresponding bit in value once it has been checked

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 1427a72027
commit 828fc79dcf
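The core of the change is the order of operations on the status word: bits flagged in the noncorrectable mask register are dropped first, each remaining field is counted, and the field's bit is cleared once it has been checked. The following is a minimal, standalone user-space sketch of that flow, not the driver code itself; the field table, error names, and bit positions are hypothetical, and only the status/mask handling mirrors the reworked amdgpu_xgmi_query_pcs_error_status().

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for the driver's amdgpu_pcs_ras_field table. */
struct pcs_ras_field {
        const char *err_name;
        uint32_t pcs_err_mask;   /* bits of this field in the status register */
        uint32_t pcs_err_shift;  /* position of the field */
};

static const struct pcs_ras_field example_fields[] = {
        { "DataLossErr", 0x00000001, 0 },  /* hypothetical bit 0 */
        { "TrainingErr", 0x00000002, 1 },  /* hypothetical bit 1 */
};

/* Count uncorrectable errors reported in 'value', honouring 'mask_value'
 * only when 'check_mask' is set, the way the reworked query helper does. */
static uint32_t count_ue(uint32_t value, uint32_t mask_value, int check_mask)
{
        uint32_t ue_count = 0;
        size_t i;

        if (check_mask)
                value &= ~mask_value;   /* masked bits do not contribute */

        for (i = 0; value && i < sizeof(example_fields) / sizeof(example_fields[0]); i++) {
                uint32_t ue_cnt = (value & example_fields[i].pcs_err_mask) >>
                                  example_fields[i].pcs_err_shift;

                if (ue_cnt) {
                        printf("%s detected\n", example_fields[i].err_name);
                        ue_count += ue_cnt;
                }
                /* reset the bit once it has been checked */
                value &= ~example_fields[i].pcs_err_mask;
        }

        return ue_count;
}

int main(void)
{
        /* status reports bits 0 and 1; bit 1 is masked off by the noncorrectable mask */
        uint32_t ue = count_ue(0x3, 0x2, 1);

        printf("ue_count = %u\n", (unsigned int)ue);  /* only DataLossErr is counted */
        return 0;
}

Consolidating the XGMI and WAFL branches into one table-driven loop is what makes the extra mask_value/check_mask arguments cheap to add: callers without a mask register simply pass check_mask = false, as the diff below shows.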
@@ -35,7 +35,9 @@
 #include "amdgpu_reset.h"
 
 #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
+#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK 0x11a00218
 #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
+#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218
 
 static DEFINE_MUTEX(xgmi_mutex);
 
@@ -79,11 +81,27 @@ static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
         smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
 };
 
+static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
+        smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
+        smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000,
+        smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000,
+        smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000,
+        smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000,
+        smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000,
+        smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000,
+        smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000
+};
+
 static const int walf_pcs_err_status_reg_aldebaran[] = {
         smnPCS_GOPX1_PCS_ERROR_STATUS,
         smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
 };
 
+static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
+        smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK,
+        smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
+};
+
 static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
         {"XGMI PCS DataLossErr",
          SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -809,39 +827,43 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
 
 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
                                               uint32_t value,
+                                              uint32_t mask_value,
                                               uint32_t *ue_count,
                                               uint32_t *ce_count,
-                                              bool is_xgmi_pcs)
+                                              bool is_xgmi_pcs,
+                                              bool check_mask)
 {
         int i;
-        int ue_cnt;
+        int ue_cnt = 0;
+        int mask_bit_value = 0;
+        const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL;
+        uint32_t field_array_size = 0;
 
         if (is_xgmi_pcs) {
-                /* query xgmi pcs error status,
-                 * only ue is supported */
-                for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) {
-                        ue_cnt = (value &
-                                  xgmi_pcs_ras_fields[i].pcs_err_mask) >>
-                                  xgmi_pcs_ras_fields[i].pcs_err_shift;
-                        if (ue_cnt) {
-                                dev_info(adev->dev, "%s detected\n",
-                                         xgmi_pcs_ras_fields[i].err_name);
-                                *ue_count += ue_cnt;
-                        }
-                }
+                pcs_ras_fields = &xgmi_pcs_ras_fields[0];
+                field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields);
         } else {
-                /* query wafl pcs error status,
-                 * only ue is supported */
-                for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
-                        ue_cnt = (value &
-                                  wafl_pcs_ras_fields[i].pcs_err_mask) >>
-                                  wafl_pcs_ras_fields[i].pcs_err_shift;
-                        if (ue_cnt) {
-                                dev_info(adev->dev, "%s detected\n",
-                                         wafl_pcs_ras_fields[i].err_name);
-                                *ue_count += ue_cnt;
-                        }
-                }
+                pcs_ras_fields = &wafl_pcs_ras_fields[0];
+                field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields);
         }
 
+        if (check_mask)
+                value = value & ~mask_value;
+
+        /* query xgmi/walf pcs error status,
+         * only ue is supported */
+        for (i = 0; value && i < field_array_size; i++) {
+                ue_cnt = (value &
+                                pcs_ras_fields[i].pcs_err_mask) >>
+                                pcs_ras_fields[i].pcs_err_shift;
+                if (ue_cnt) {
+                        dev_info(adev->dev, "%s detected\n",
+                                 pcs_ras_fields[i].err_name);
+                        *ue_count += ue_cnt;
+                }
+
+                /* reset bit value if the bit is checked */
+                value &= ~(pcs_ras_fields[i].pcs_err_mask);
+        }
+
         return 0;
@@ -852,7 +874,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 {
         struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
         int i;
-        uint32_t data;
+        uint32_t data, mask_data = 0;
         uint32_t ue_cnt = 0, ce_cnt = 0;
 
         if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
@@ -867,15 +889,15 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
                 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
                         data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
                         if (data)
-                                amdgpu_xgmi_query_pcs_error_status(adev,
-                                                data, &ue_cnt, &ce_cnt, true);
+                                amdgpu_xgmi_query_pcs_error_status(adev, data,
+                                                mask_data, &ue_cnt, &ce_cnt, true, false);
                 }
                 /* check wafl pcs error */
                 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
                         data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
                         if (data)
-                                amdgpu_xgmi_query_pcs_error_status(adev,
-                                                data, &ue_cnt, &ce_cnt, false);
+                                amdgpu_xgmi_query_pcs_error_status(adev, data,
+                                                mask_data, &ue_cnt, &ce_cnt, false, false);
                 }
                 break;
         case CHIP_VEGA20:
@@ -883,31 +905,35 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
                 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
                         data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
                         if (data)
-                                amdgpu_xgmi_query_pcs_error_status(adev,
-                                                data, &ue_cnt, &ce_cnt, true);
+                                amdgpu_xgmi_query_pcs_error_status(adev, data,
+                                                mask_data, &ue_cnt, &ce_cnt, true, false);
                 }
                 /* check wafl pcs error */
                 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
                         data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
                         if (data)
-                                amdgpu_xgmi_query_pcs_error_status(adev,
-                                                data, &ue_cnt, &ce_cnt, false);
+                                amdgpu_xgmi_query_pcs_error_status(adev, data,
+                                                mask_data, &ue_cnt, &ce_cnt, false, false);
                 }
                 break;
         case CHIP_ALDEBARAN:
                 /* check xgmi3x16 pcs error */
                 for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
                         data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
+                        mask_data =
+                                RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
                         if (data)
-                                amdgpu_xgmi_query_pcs_error_status(adev,
-                                                data, &ue_cnt, &ce_cnt, true);
+                                amdgpu_xgmi_query_pcs_error_status(adev, data,
+                                                mask_data, &ue_cnt, &ce_cnt, true, true);
                 }
                 /* check wafl pcs error */
                 for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
                         data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
+                        mask_data =
+                                RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
                         if (data)
-                                amdgpu_xgmi_query_pcs_error_status(adev,
-                                                data, &ue_cnt, &ce_cnt, false);
+                                amdgpu_xgmi_query_pcs_error_status(adev, data,
+                                                mask_data, &ue_cnt, &ce_cnt, false, true);
                 }
                 break;
         default: