// SPDX-License-Identifier: GPL-2.0
/*
 * PCI Express Downstream Port Containment services driver
 * Author: Keith Busch <keith.busch@intel.com>
 *
 * Copyright (C) 2016 Intel Corp.
 */

#define dev_fmt(fmt) "DPC: " fmt

#include <linux/aer.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pci.h>

#include "portdrv.h"
#include "../pci.h"
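
/* Strings indexed by RP PIO status/mask bit position; gaps are reserved. */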
static const char * const rp_pio_error_string[] = {
	"Configuration Request received UR Completion",	 /* Bit Position 0  */
	"Configuration Request received CA Completion",	 /* Bit Position 1  */
	"Configuration Request Completion Timeout",	 /* Bit Position 2  */
	NULL,
	NULL,
	NULL,
	NULL,
	NULL,
	"I/O Request received UR Completion",		 /* Bit Position 8  */
	"I/O Request received CA Completion",		 /* Bit Position 9  */
	"I/O Request Completion Timeout",		 /* Bit Position 10 */
	NULL,
	NULL,
	NULL,
	NULL,
	NULL,
	"Memory Request received UR Completion",	 /* Bit Position 16 */
	"Memory Request received CA Completion",	 /* Bit Position 17 */
	"Memory Request Completion Timeout",		 /* Bit Position 18 */
};
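
/*
 * pci_save_dpc_state() and pci_restore_dpc_state() preserve the 16-bit DPC
 * Control register across device resets; the buffer they use is allocated
 * in dpc_probe() via pci_add_ext_cap_save_buffer().
 */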
void pci_save_dpc_state(struct pci_dev *dev)
{
	struct pci_cap_saved_state *save_state;
	u16 *cap;

	if (!pci_is_pcie(dev))
		return;

	save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC);
	if (!save_state)
		return;

	cap = (u16 *)&save_state->cap.data[0];
	pci_read_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, cap);
}

void pci_restore_dpc_state(struct pci_dev *dev)
{
	struct pci_cap_saved_state *save_state;
	u16 *cap;

	if (!pci_is_pcie(dev))
		return;

	save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC);
	if (!save_state)
		return;

	cap = (u16 *)&save_state->cap.data[0];
	pci_write_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, *cap);
}
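
/*
 * Poll the RP Busy flag, which the Root Port keeps set while it is still
 * draining the containment event, for up to one second in 10 ms steps.
 */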
static int dpc_wait_rp_inactive(struct pci_dev *pdev)
{
	unsigned long timeout = jiffies + HZ;
	u16 cap = pdev->dpc_cap, status;

	pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
	while (status & PCI_EXP_DPC_RP_BUSY &&
	       !time_after(jiffies, timeout)) {
		msleep(10);
		pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
	}
	if (status & PCI_EXP_DPC_RP_BUSY) {
		pci_warn(pdev, "root port still busy\n");
		return -EBUSY;
	}
	return 0;
}
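
/*
 * dpc_reset_link() and dpc_process_error() are deliberately non-static:
 * when firmware controls DPC but both the OS and the platform support
 * Error Disconnect Recover (EDR) notifications, the EDR notify handler
 * drives recovery and may touch the DPC capability until it clears the
 * DPC Trigger Status bit (DPC ECN to PCI Firmware Spec r3.2, sec 4.5.1,
 * table 4-6).
 */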
pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
{
	u16 cap;

	/*
	 * DPC disables the Link automatically in hardware, so it has
	 * already been reset by the time we get here.
	 */
	cap = pdev->dpc_cap;

	/*
	 * Wait until the Link is inactive, then clear DPC Trigger Status
	 * to allow the Port to leave DPC.
	 */
	if (!pcie_wait_for_link(pdev, false))
		pci_info(pdev, "Data Link Layer Link Active not cleared in 1000 msec\n");

	if (pdev->dpc_rp_extensions && dpc_wait_rp_inactive(pdev))
		return PCI_ERS_RESULT_DISCONNECT;

	pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS,
			      PCI_EXP_DPC_STATUS_TRIGGER);

	if (!pcie_wait_for_link(pdev, true)) {
		pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_RECOVERED;
}
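
/*
 * Dump and clear the RP PIO error log: the status, mask, severity,
 * syserror and exception registers, then the 4-DWORD TLP Header Log, the
 * ImpSpec Log, and however many TLP Prefix Log DWORDs dpc_rp_log_size
 * implies.
 */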
static void dpc_process_rp_pio_error(struct pci_dev *pdev)
{
	u16 cap = pdev->dpc_cap, dpc_status, first_error;
	u32 status, mask, sev, syserr, exc, dw0, dw1, dw2, dw3, log, prefix;
	int i;

	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS, &status);
	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_MASK, &mask);
	pci_err(pdev, "rp_pio_status: %#010x, rp_pio_mask: %#010x\n",
		status, mask);

	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_SEVERITY, &sev);
	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_SYSERROR, &syserr);
	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_EXCEPTION, &exc);
	pci_err(pdev, "RP PIO severity=%#010x, syserror=%#010x, exception=%#010x\n",
		sev, syserr, exc);

	/* Get First Error Pointer */
	pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &dpc_status);
	first_error = (dpc_status & 0x1f00) >> 8;

	for (i = 0; i < ARRAY_SIZE(rp_pio_error_string); i++) {
		if ((status & ~mask) & (1 << i))
			pci_err(pdev, "[%2d] %s%s\n", i, rp_pio_error_string[i],
				first_error == i ? " (First)" : "");
	}

	if (pdev->dpc_rp_log_size < 4)
		goto clear_status;
	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG,
			      &dw0);
	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 4,
			      &dw1);
	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 8,
			      &dw2);
	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 12,
			      &dw3);
	pci_err(pdev, "TLP Header: %#010x %#010x %#010x %#010x\n",
		dw0, dw1, dw2, dw3);

	if (pdev->dpc_rp_log_size < 5)
		goto clear_status;
	pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG, &log);
	pci_err(pdev, "RP PIO ImpSpec Log %#010x\n", log);

	/* The TLP Prefix Log is up to four consecutive DWORDs */
	for (i = 0; i < pdev->dpc_rp_log_size - 5; i++) {
		pci_read_config_dword(pdev,
			cap + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG + i * 4,
			&prefix);
		pci_err(pdev, "TLP Prefix Header: dw%d, %#010x\n", i, prefix);
	}
 clear_status:
	pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS, status);
}
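
/*
 * Classify a pending uncorrectable AER error as fatal or nonfatal by
 * checking the unmasked status bits against the Uncorrectable Error
 * Severity register.  Returns 1 if an unmasked error is pending, else 0.
 */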
static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev,
					  struct aer_err_info *info)
{
	int pos = dev->aer_cap;
	u32 status, mask, sev;

	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &mask);
	status &= ~mask;
	if (!status)
		return 0;

	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev);
	status &= sev;
	if (status)
		info->severity = AER_FATAL;
	else
		info->severity = AER_NONFATAL;

	return 1;
}
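
/*
 * Decode the Trigger Reason (and Extended Reason) fields of the DPC Status
 * register and log what caused containment; for unmasked uncorrectable
 * errors the matching AER log is printed and cleared as well.
 */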
void dpc_process_error(struct pci_dev *pdev)
{
	u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
	struct aer_err_info info;

	pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
	pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source);

	pci_info(pdev, "containment event, status:%#06x source:%#06x\n",
		 status, source);

	reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1;
	ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5;
	pci_warn(pdev, "%s detected\n",
		 (reason == 0) ? "unmasked uncorrectable error" :
		 (reason == 1) ? "ERR_NONFATAL" :
		 (reason == 2) ? "ERR_FATAL" :
		 (ext_reason == 0) ? "RP PIO error" :
		 (ext_reason == 1) ? "software trigger" :
				     "reserved error");

	/* show RP PIO error detail information */
	if (pdev->dpc_rp_extensions && reason == 3 && ext_reason == 0)
		dpc_process_rp_pio_error(pdev);
	else if (reason == 0 &&
		 dpc_get_aer_uncorrect_severity(pdev, &info) &&
		 aer_get_device_error_info(pdev, &info)) {
		aer_print_error(pdev, &info);
		pci_aer_clear_nonfatal_status(pdev);
		pci_aer_clear_fatal_status(pdev);
	}
}
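
/*
 * Interrupt handling is split: dpc_irq() runs in hard IRQ context, checks
 * that the interrupt is ours (a status of all 1s means the device is gone)
 * and acks it, then wakes dpc_handler(), the IRQ thread that does the
 * sleeping recovery work.
 */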
static irqreturn_t dpc_handler(int irq, void *context)
{
	struct pci_dev *pdev = context;

	dpc_process_error(pdev);

	/* We configure DPC so it only triggers on ERR_FATAL */
	pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link);

	return IRQ_HANDLED;
}

static irqreturn_t dpc_irq(int irq, void *context)
{
	struct pci_dev *pdev = context;
	u16 cap = pdev->dpc_cap, status;

	pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);

	if (!(status & PCI_EXP_DPC_STATUS_INTERRUPT) || status == (u16)(~0))
		return IRQ_NONE;

	pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS,
			      PCI_EXP_DPC_STATUS_INTERRUPT);
	if (status & PCI_EXP_DPC_STATUS_TRIGGER)
		return IRQ_WAKE_THREAD;
	return IRQ_HANDLED;
}
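
/*
 * Cache the DPC capability offset and the RP Extensions parameters at
 * enumeration time.  RP PIO Log Size values outside 4..9 DWORDs are
 * treated as invalid and disable RP PIO log dumping.
 */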
void pci_dpc_init(struct pci_dev *pdev)
{
	u16 cap;

	pdev->dpc_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC);
	if (!pdev->dpc_cap)
		return;

	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap);
	if (!(cap & PCI_EXP_DPC_CAP_RP_EXT))
		return;

	pdev->dpc_rp_extensions = true;
	pdev->dpc_rp_log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8;
	if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) {
		pci_err(pdev, "RP PIO log size %u is invalid\n",
			pdev->dpc_rp_log_size);
		pdev->dpc_rp_log_size = 0;
	}
}
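
/*
 * In dpc_probe() below, the (ctl & 0xfff4) mask clears the Trigger Enable
 * field (bits 1:0) and Interrupt Enable (bit 3) of the DPC Control
 * register before enabling containment on ERR_FATAL only, with interrupt
 * reporting.
 */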
#define FLAG(x, y) (((x) & (y)) ? '+' : '-')
static int dpc_probe(struct pcie_device *dev)
{
	struct pci_dev *pdev = dev->port;
	struct device *device = &dev->device;
	int status;
	u16 ctl, cap;

	if (!pcie_aer_is_native(pdev) && !pcie_ports_dpc_native)
		return -ENOTSUPP;

	status = devm_request_threaded_irq(device, dev->irq, dpc_irq,
					   dpc_handler, IRQF_SHARED,
					   "pcie-dpc", pdev);
	if (status) {
		pci_warn(pdev, "request IRQ%d failed: %d\n", dev->irq,
			 status);
		return status;
	}

	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap);
	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);

	ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN;
	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
	pci_info(pdev, "enabled with IRQ %d\n", dev->irq);

	pci_info(pdev, "error containment capabilities: Int Msg #%d, RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
		 cap & PCI_EXP_DPC_IRQ, FLAG(cap, PCI_EXP_DPC_CAP_RP_EXT),
		 FLAG(cap, PCI_EXP_DPC_CAP_POISONED_TLP),
		 FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), pdev->dpc_rp_log_size,
		 FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE));

	pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_DPC, sizeof(u16));
	return status;
}

static void dpc_remove(struct pcie_device *dev)
{
	struct pci_dev *pdev = dev->port;
	u16 ctl;

	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
}

static struct pcie_port_service_driver dpcdriver = {
	.name		= "dpc",
	.port_type	= PCIE_ANY_PORT,
	.service	= PCIE_PORT_SERVICE_DPC,
	.probe		= dpc_probe,
	.remove		= dpc_remove,
};

int __init pcie_dpc_init(void)
{
	return pcie_port_service_register(&dpcdriver);
}