diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9eceb3bc1058..c2e8f6491ac6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -406,6 +406,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); u8 csum; int res; @@ -423,6 +424,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) hdr->first_rec_offset = RAS_RECORD_START_V2_1; hdr->tbl_size = RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE; + rai->rma_status = GPU_HEALTH_USABLE; + /** + * GPU health represented as a percentage. + * 0 means worst health, 100 means fully health. + */ + rai->health_percent = 100; + /* ecc_page_threshold = 0 means disable bad page retirement */ + rai->ecc_page_threshold = con->bad_page_cnt_threshold; } else { hdr->first_rec_offset = RAS_RECORD_START; hdr->tbl_size = RAS_TABLE_HEADER_SIZE; @@ -712,6 +721,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) "Saved bad pages %d reaches threshold value %d\n", control->ras_num_recs, ras->bad_page_cnt_threshold); control->tbl_hdr.header = RAS_TABLE_HDR_BAD; + if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) { + control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD; + control->tbl_rai.health_percent = 0; + } } if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) @@ -749,6 +762,17 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) goto Out; } + /** + * bad page records have been stored in eeprom, + * now calculate gpu health percent + */ + if (amdgpu_bad_page_threshold != 0 && + control->tbl_hdr.version == RAS_TABLE_VER_V2_1 && + control->ras_num_recs < ras->bad_page_cnt_threshold) + control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold - + control->ras_num_recs) * 100) / + ras->bad_page_cnt_threshold; + /* Recalc the checksum. */ csum = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 3c5575c19bf8..6dfd667f3013 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -31,6 +31,11 @@ struct amdgpu_device; +enum amdgpu_ras_gpu_health_status { + GPU_HEALTH_USABLE = 0, + GPU_RETIRED__ECC_REACH_THRESHOLD = 2, +}; + enum amdgpu_ras_eeprom_err_type { AMDGPU_RAS_EEPROM_ERR_NA, AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,