From 98f876674a6fba3591c342dfbcfdbaa7ecf0a84e Mon Sep 17 00:00:00 2001 From: Kevin Barnett Date: Wed, 3 May 2017 18:53:11 -0500 Subject: [PATCH] scsi: smartpqi: add heartbeat check check for controller lockups Reviewed-by: Scott Benesh Signed-off-by: Kevin Barnett Signed-off-by: Don Brace Signed-off-by: Martin K. Petersen --- drivers/scsi/smartpqi/smartpqi.h | 60 ++++++++++-- drivers/scsi/smartpqi/smartpqi_init.c | 126 +++++++++++++++++--------- drivers/scsi/smartpqi/smartpqi_sis.c | 4 + 3 files changed, 143 insertions(+), 47 deletions(-) diff --git a/drivers/scsi/smartpqi/smartpqi.h b/drivers/scsi/smartpqi/smartpqi.h index 06e2b7152d52..1ac09e74d8c2 100644 --- a/drivers/scsi/smartpqi/smartpqi.h +++ b/drivers/scsi/smartpqi/smartpqi.h @@ -490,7 +490,6 @@ struct pqi_raid_error_info { #define PQI_EVENT_TYPE_LOGICAL_DEVICE 0x5 #define PQI_EVENT_TYPE_AIO_STATE_CHANGE 0xfd #define PQI_EVENT_TYPE_AIO_CONFIG_CHANGE 0xfe -#define PQI_EVENT_TYPE_HEARTBEAT 0xff #pragma pack() @@ -635,6 +634,58 @@ struct pqi_encryption_info { u32 encrypt_tweak_upper; }; +#pragma pack(1) + +#define PQI_CONFIG_TABLE_SIGNATURE "CFGTABLE" +#define PQI_CONFIG_TABLE_MAX_LENGTH ((u16)~0) + +/* configuration table section IDs */ +#define PQI_CONFIG_TABLE_SECTION_GENERAL_INFO 0 +#define PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES 1 +#define PQI_CONFIG_TABLE_SECTION_FIRMWARE_ERRATA 2 +#define PQI_CONFIG_TABLE_SECTION_DEBUG 3 +#define PQI_CONFIG_TABLE_SECTION_HEARTBEAT 4 + +struct pqi_config_table { + u8 signature[8]; /* "CFGTABLE" */ + __le32 first_section_offset; /* offset in bytes from the base */ + /* address of this table to the */ + /* first section */ +}; + +struct pqi_config_table_section_header { + __le16 section_id; /* as defined by the */ + /* PQI_CONFIG_TABLE_SECTION_* */ + /* manifest constants above */ + __le16 next_section_offset; /* offset in bytes from base */ + /* address of the table of the */ + /* next section or 0 if last entry */ +}; + +struct pqi_config_table_general_info { + struct pqi_config_table_section_header header; + __le32 section_length; /* size of this section in bytes */ + /* including the section header */ + __le32 max_outstanding_requests; /* max. outstanding */ + /* commands supported by */ + /* the controller */ + __le32 max_sg_size; /* max. transfer size of a single */ + /* command */ + __le32 max_sg_per_request; /* max. number of scatter-gather */ + /* entries supported in a single */ + /* command */ +}; + +struct pqi_config_table_debug { + struct pqi_config_table_section_header header; + __le32 scratchpad; +}; + +struct pqi_config_table_heartbeat { + struct pqi_config_table_section_header header; + __le32 heartbeat_counter; +}; + #define PQI_MAX_OUTSTANDING_REQUESTS ((u32)~0) #define PQI_MAX_TRANSFER_SIZE (4 * 1024U * 1024U) @@ -645,8 +696,6 @@ struct pqi_encryption_info { #define PQI_HBA_BUS 2 #define PQI_MAX_BUS PQI_HBA_BUS -#pragma pack(1) - struct report_lun_header { __be32 list_length; u8 extended_response; @@ -870,7 +919,6 @@ struct pqi_io_request { struct list_head request_list_entry; }; -#define PQI_EVENT_HEARTBEAT 0 #define PQI_NUM_SUPPORTED_EVENTS 6 struct pqi_event { @@ -943,7 +991,6 @@ struct pqi_ctrl_info { u8 inbound_spanning_supported : 1; u8 outbound_spanning_supported : 1; u8 pqi_mode_enabled : 1; - u8 heartbeat_timer_started : 1; u8 update_time_worker_scheduled : 1; struct list_head scsi_device_list; @@ -963,7 +1010,8 @@ struct pqi_ctrl_info { atomic_t num_interrupts; int previous_num_interrupts; - unsigned int num_heartbeats_requested; + u32 previous_heartbeat_count; + __le32 __iomem *heartbeat_counter; struct timer_list heartbeat_timer; struct semaphore sync_request_sem; diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 7af4add16276..de33942860b3 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -267,6 +267,14 @@ static inline void pqi_cancel_rescan_worker(struct pqi_ctrl_info *ctrl_info) cancel_delayed_work_sync(&ctrl_info->rescan_work); } +static inline u32 pqi_read_heartbeat_counter(struct pqi_ctrl_info *ctrl_info) +{ + if (!ctrl_info->heartbeat_counter) + return 0; + + return readl(ctrl_info->heartbeat_counter); +} + static int pqi_map_single(struct pci_dev *pci_dev, struct pqi_sg_descriptor *sg_descriptor, void *buffer, size_t buffer_length, int data_direction) @@ -2708,23 +2716,18 @@ static inline unsigned int pqi_num_elements_free(unsigned int pi, return elements_in_queue - num_elements_used - 1; } -#define PQI_EVENT_ACK_TIMEOUT 30 - -static void pqi_start_event_ack(struct pqi_ctrl_info *ctrl_info, +static void pqi_send_event_ack(struct pqi_ctrl_info *ctrl_info, struct pqi_event_acknowledge_request *iu, size_t iu_length) { pqi_index_t iq_pi; pqi_index_t iq_ci; unsigned long flags; void *next_element; - unsigned long timeout; struct pqi_queue_group *queue_group; queue_group = &ctrl_info->queue_groups[PQI_DEFAULT_QUEUE_GROUP]; put_unaligned_le16(queue_group->oq_id, &iu->header.response_queue_id); - timeout = (PQI_EVENT_ACK_TIMEOUT * HZ) + jiffies; - while (1) { spin_lock_irqsave(&queue_group->submit_lock[RAID_PATH], flags); @@ -2738,11 +2741,8 @@ static void pqi_start_event_ack(struct pqi_ctrl_info *ctrl_info, spin_unlock_irqrestore( &queue_group->submit_lock[RAID_PATH], flags); - if (time_after(jiffies, timeout)) { - dev_err(&ctrl_info->pci_dev->dev, - "sending event acknowledge timed out\n"); + if (pqi_ctrl_offline(ctrl_info)) return; - } } next_element = queue_group->iq_element_array[RAID_PATH] + @@ -2751,7 +2751,6 @@ static void pqi_start_event_ack(struct pqi_ctrl_info *ctrl_info, memcpy(next_element, iu, iu_length); iq_pi = (iq_pi + 1) % ctrl_info->num_elements_per_iq; - queue_group->iq_pi_copy[RAID_PATH] = iq_pi; /* @@ -2777,7 +2776,7 @@ static void pqi_acknowledge_event(struct pqi_ctrl_info *ctrl_info, request.event_id = event->event_id; request.additional_event_id = event->additional_event_id; - pqi_start_event_ack(ctrl_info, &request, sizeof(request)); + pqi_send_event_ack(ctrl_info, &request, sizeof(request)); } static void pqi_event_worker(struct work_struct *work) @@ -2785,7 +2784,6 @@ static void pqi_event_worker(struct work_struct *work) unsigned int i; struct pqi_ctrl_info *ctrl_info; struct pqi_event *event; - bool got_non_heartbeat_event = false; ctrl_info = container_of(work, struct pqi_ctrl_info, event_work); @@ -2797,8 +2795,6 @@ static void pqi_event_worker(struct work_struct *work) if (event->pending) { event->pending = false; pqi_acknowledge_event(ctrl_info, event); - if (i != PQI_EVENT_TYPE_HEARTBEAT) - got_non_heartbeat_event = true; } event++; } @@ -2848,57 +2844,58 @@ static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info) } } -#define PQI_HEARTBEAT_TIMER_INTERVAL (5 * HZ) -#define PQI_MAX_HEARTBEAT_REQUESTS 5 +#define PQI_HEARTBEAT_TIMER_INTERVAL (10 * HZ) static void pqi_heartbeat_timer_handler(unsigned long data) { int num_interrupts; + u32 heartbeat_count; struct pqi_ctrl_info *ctrl_info = (struct pqi_ctrl_info *)data; - if (!ctrl_info->heartbeat_timer_started) + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) return; num_interrupts = atomic_read(&ctrl_info->num_interrupts); + heartbeat_count = pqi_read_heartbeat_counter(ctrl_info); if (num_interrupts == ctrl_info->previous_num_interrupts) { - ctrl_info->num_heartbeats_requested++; - if (ctrl_info->num_heartbeats_requested > - PQI_MAX_HEARTBEAT_REQUESTS) { + if (heartbeat_count == ctrl_info->previous_heartbeat_count) { + dev_err(&ctrl_info->pci_dev->dev, + "no heartbeat detected - last heartbeat count: %u\n", + heartbeat_count); pqi_take_ctrl_offline(ctrl_info); return; } - ctrl_info->events[PQI_EVENT_HEARTBEAT].pending = true; - schedule_work(&ctrl_info->event_work); } else { - ctrl_info->num_heartbeats_requested = 0; + ctrl_info->previous_num_interrupts = num_interrupts; } - ctrl_info->previous_num_interrupts = num_interrupts; + ctrl_info->previous_heartbeat_count = heartbeat_count; mod_timer(&ctrl_info->heartbeat_timer, jiffies + PQI_HEARTBEAT_TIMER_INTERVAL); } static void pqi_start_heartbeat_timer(struct pqi_ctrl_info *ctrl_info) { + if (!ctrl_info->heartbeat_counter) + return; + ctrl_info->previous_num_interrupts = atomic_read(&ctrl_info->num_interrupts); + ctrl_info->previous_heartbeat_count = + pqi_read_heartbeat_counter(ctrl_info); - init_timer(&ctrl_info->heartbeat_timer); ctrl_info->heartbeat_timer.expires = jiffies + PQI_HEARTBEAT_TIMER_INTERVAL; ctrl_info->heartbeat_timer.data = (unsigned long)ctrl_info; ctrl_info->heartbeat_timer.function = pqi_heartbeat_timer_handler; - ctrl_info->heartbeat_timer_started = true; add_timer(&ctrl_info->heartbeat_timer); } static inline void pqi_stop_heartbeat_timer(struct pqi_ctrl_info *ctrl_info) { - if (ctrl_info->heartbeat_timer_started) { - ctrl_info->heartbeat_timer_started = false; - del_timer_sync(&ctrl_info->heartbeat_timer); - } + del_timer_sync(&ctrl_info->heartbeat_timer); } static inline int pqi_event_type_to_event_index(unsigned int event_type) @@ -2925,12 +2922,10 @@ static unsigned int pqi_process_event_intr(struct pqi_ctrl_info *ctrl_info) struct pqi_event_queue *event_queue; struct pqi_event_response *response; struct pqi_event *event; - bool need_delayed_work; int event_index; event_queue = &ctrl_info->event_queue; num_events = 0; - need_delayed_work = false; oq_ci = event_queue->oq_ci_copy; while (1) { @@ -2953,10 +2948,6 @@ static unsigned int pqi_process_event_intr(struct pqi_ctrl_info *ctrl_info) event->event_id = response->event_id; event->additional_event_id = response->additional_event_id; - if (event_index != PQI_EVENT_TYPE_HEARTBEAT) { - event->pending = true; - need_delayed_work = true; - } } } @@ -2966,9 +2957,7 @@ static unsigned int pqi_process_event_intr(struct pqi_ctrl_info *ctrl_info) if (num_events) { event_queue->oq_ci_copy = oq_ci; writel(oq_ci, event_queue->oq_ci); - - if (need_delayed_work) - schedule_work(&ctrl_info->event_work); + schedule_work(&ctrl_info->event_work); } return num_events; @@ -3220,7 +3209,7 @@ static int pqi_alloc_operational_queues(struct pqi_ctrl_info *ctrl_info) if (!ctrl_info->queue_memory_base) { dev_err(&ctrl_info->pci_dev->dev, - "failed to allocate memory for PQI admin queues\n"); + "unable to allocate memory for PQI admin queues\n"); return -ENOMEM; } @@ -5672,6 +5661,55 @@ out: return rc; } +static int pqi_process_config_table(struct pqi_ctrl_info *ctrl_info) +{ + u32 table_length; + u32 section_offset; + void __iomem *table_iomem_addr; + struct pqi_config_table *config_table; + struct pqi_config_table_section_header *section; + + table_length = ctrl_info->config_table_length; + + config_table = kmalloc(table_length, GFP_KERNEL); + if (!config_table) { + dev_err(&ctrl_info->pci_dev->dev, + "unable to allocate memory for PQI configuration table\n"); + return -ENOMEM; + } + + /* + * Copy the config table contents from I/O memory space into the + * temporary buffer. + */ + table_iomem_addr = ctrl_info->iomem_base + + ctrl_info->config_table_offset; + memcpy_fromio(config_table, table_iomem_addr, table_length); + + section_offset = + get_unaligned_le32(&config_table->first_section_offset); + + while (section_offset) { + section = (void *)config_table + section_offset; + + switch (get_unaligned_le16(§ion->section_id)) { + case PQI_CONFIG_TABLE_SECTION_HEARTBEAT: + ctrl_info->heartbeat_counter = table_iomem_addr + + section_offset + + offsetof(struct pqi_config_table_heartbeat, + heartbeat_counter); + break; + } + + section_offset = + get_unaligned_le16(§ion->next_section_offset); + } + + kfree(config_table); + + return 0; +} + /* Switches the controller from PQI mode back into SIS mode. */ static int pqi_revert_to_sis_mode(struct pqi_ctrl_info *ctrl_info) @@ -5783,6 +5821,10 @@ static int pqi_ctrl_init(struct pqi_ctrl_info *ctrl_info) ctrl_info->pqi_mode_enabled = true; pqi_save_ctrl_mode(ctrl_info, PQI_MODE); + rc = pqi_process_config_table(ctrl_info); + if (rc) + return rc; + rc = pqi_alloc_admin_queues(ctrl_info); if (rc) { dev_err(&ctrl_info->pci_dev->dev, @@ -6091,6 +6133,8 @@ static struct pqi_ctrl_info *pqi_alloc_ctrl_info(int numa_node) INIT_DELAYED_WORK(&ctrl_info->rescan_work, pqi_rescan_worker); INIT_DELAYED_WORK(&ctrl_info->update_time_work, pqi_update_time_worker); + init_timer(&ctrl_info->heartbeat_timer); + sema_init(&ctrl_info->sync_request_sem, PQI_RESERVED_IO_SLOTS_SYNCHRONOUS_REQUESTS); init_waitqueue_head(&ctrl_info->block_requests_wait); diff --git a/drivers/scsi/smartpqi/smartpqi_sis.c b/drivers/scsi/smartpqi/smartpqi_sis.c index 12853fd75550..3155bda88550 100644 --- a/drivers/scsi/smartpqi/smartpqi_sis.c +++ b/drivers/scsi/smartpqi/smartpqi_sis.c @@ -422,6 +422,10 @@ void sis_soft_reset(struct pqi_ctrl_info *ctrl_info) void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info) { + if (readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_PANIC) + return; + writel(SIS_TRIGGER_SHUTDOWN, &ctrl_info->registers->sis_host_to_ctrl_doorbell); }