linux-sg2042/drivers/acpi/ec.c

2028 lines
54 KiB
C
Raw Normal View History

/*
* ec.c - ACPI Embedded Controller Driver (v3)
*
* Copyright (C) 2001-2015 Intel Corporation
* Author: 2014, 2015 Lv Zheng <lv.zheng@intel.com>
* 2006, 2007 Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>
* 2006 Denis Sadykov <denis.m.sadykov@intel.com>
* 2004 Luming Yu <luming.yu@intel.com>
* 2001, 2002 Andy Grover <andrew.grover@intel.com>
* 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
* Copyright (C) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
/* Uncomment next line to get verbose printout */
/* #define DEBUG */
#define pr_fmt(fmt) "ACPI: EC: " fmt
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/spinlock.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
ACPI: Clean up inclusions of ACPI header files Replace direct inclusions of <acpi/acpi.h>, <acpi/acpi_bus.h> and <acpi/acpi_drivers.h>, which are incorrect, with <linux/acpi.h> inclusions and remove some inclusions of those files that aren't necessary. First of all, <acpi/acpi.h>, <acpi/acpi_bus.h> and <acpi/acpi_drivers.h> should not be included directly from any files that are built for CONFIG_ACPI unset, because that generally leads to build warnings about undefined symbols in !CONFIG_ACPI builds. For CONFIG_ACPI set, <linux/acpi.h> includes those files and for CONFIG_ACPI unset it provides stub ACPI symbols to be used in that case. Second, there are ordering dependencies between those files that always have to be met. Namely, it is required that <acpi/acpi_bus.h> be included prior to <acpi/acpi_drivers.h> so that the acpi_pci_root declarations the latter depends on are always there. And <acpi/acpi.h> which provides basic ACPICA type declarations should always be included prior to any other ACPI headers in CONFIG_ACPI builds. That also is taken care of including <linux/acpi.h> as appropriate. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Cc: Tony Luck <tony.luck@intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Acked-by: Bjorn Helgaas <bhelgaas@google.com> (drivers/pci stuff) Acked-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> (Xen stuff) Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-12-03 08:49:16 +08:00
#include <linux/acpi.h>
#include <linux/dmi.h>
ACPI: Clean up inclusions of ACPI header files Replace direct inclusions of <acpi/acpi.h>, <acpi/acpi_bus.h> and <acpi/acpi_drivers.h>, which are incorrect, with <linux/acpi.h> inclusions and remove some inclusions of those files that aren't necessary. First of all, <acpi/acpi.h>, <acpi/acpi_bus.h> and <acpi/acpi_drivers.h> should not be included directly from any files that are built for CONFIG_ACPI unset, because that generally leads to build warnings about undefined symbols in !CONFIG_ACPI builds. For CONFIG_ACPI set, <linux/acpi.h> includes those files and for CONFIG_ACPI unset it provides stub ACPI symbols to be used in that case. Second, there are ordering dependencies between those files that always have to be met. Namely, it is required that <acpi/acpi_bus.h> be included prior to <acpi/acpi_drivers.h> so that the acpi_pci_root declarations the latter depends on are always there. And <acpi/acpi.h> which provides basic ACPICA type declarations should always be included prior to any other ACPI headers in CONFIG_ACPI builds. That also is taken care of including <linux/acpi.h> as appropriate. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Cc: Tony Luck <tony.luck@intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Acked-by: Bjorn Helgaas <bhelgaas@google.com> (drivers/pci stuff) Acked-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> (Xen stuff) Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-12-03 08:49:16 +08:00
#include <asm/io.h>
#include "internal.h"
#define ACPI_EC_CLASS "embedded_controller"
#define ACPI_EC_DEVICE_NAME "Embedded Controller"
#define ACPI_EC_FILE_INFO "info"
/* EC status register */
#define ACPI_EC_FLAG_OBF 0x01 /* Output buffer full */
#define ACPI_EC_FLAG_IBF 0x02 /* Input buffer full */
#define ACPI_EC_FLAG_CMD 0x08 /* Input buffer contains a command */
#define ACPI_EC_FLAG_BURST 0x10 /* burst mode */
#define ACPI_EC_FLAG_SCI 0x20 /* EC-SCI occurred */
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
/*
* The SCI_EVT clearing timing is not defined by the ACPI specification.
* This leads to lots of practical timing issues for the host EC driver.
* The following variations are defined (from the target EC firmware's
* perspective):
* STATUS: After indicating SCI_EVT edge triggered IRQ to the host, the
* target can clear SCI_EVT at any time so long as the host can see
* the indication by reading the status register (EC_SC). So the
* host should re-check SCI_EVT after the first time the SCI_EVT
* indication is seen, which is the same time the query request
* (QR_EC) is written to the command register (EC_CMD). SCI_EVT set
* at any later time could indicate another event. Normally such
* kind of EC firmware has implemented an event queue and will
* return 0x00 to indicate "no outstanding event".
* QUERY: After seeing the query request (QR_EC) written to the command
* register (EC_CMD) by the host and having prepared the responding
* event value in the data register (EC_DATA), the target can safely
* clear SCI_EVT because the target can confirm that the current
* event is being handled by the host. The host then should check
* SCI_EVT right after reading the event response from the data
* register (EC_DATA).
* EVENT: After seeing the event response read from the data register
* (EC_DATA) by the host, the target can clear SCI_EVT. As the
* target requires time to notice the change in the data register
* (EC_DATA), the host may be required to wait additional guarding
* time before checking the SCI_EVT again. Such guarding may not be
* necessary if the host is notified via another IRQ.
*/
#define ACPI_EC_EVT_TIMING_STATUS 0x00
#define ACPI_EC_EVT_TIMING_QUERY 0x01
#define ACPI_EC_EVT_TIMING_EVENT 0x02
/* EC commands */
enum ec_command {
ACPI_EC_COMMAND_READ = 0x80,
ACPI_EC_COMMAND_WRITE = 0x81,
ACPI_EC_BURST_ENABLE = 0x82,
ACPI_EC_BURST_DISABLE = 0x83,
ACPI_EC_COMMAND_QUERY = 0x84,
};
#define ACPI_EC_DELAY 500 /* Wait 500ms max. during EC ops */
#define ACPI_EC_UDELAY_GLK 1000 /* Wait 1ms max. to get global lock */
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
#define ACPI_EC_UDELAY_POLL 550 /* Wait 1ms for EC transaction polling */
ACPI / EC: Clear stale EC events on Samsung systems A number of Samsung notebooks (530Uxx/535Uxx/540Uxx/550Pxx/900Xxx/etc) continue to log events during sleep (lid open/close, AC plug/unplug, battery level change), which accumulate in the EC until a buffer fills. After the buffer is full (tests suggest it holds 8 events), GPEs stop being triggered for new events. This state persists on wake or even on power cycle, and prevents new events from being registered until the EC is manually polled. This is the root cause of a number of bugs, including AC not being detected properly, lid close not triggering suspend, and low ambient light not triggering the keyboard backlight. The bug also seemed to be responsible for performance issues on at least one user's machine. Juan Manuel Cabo found the cause of bug and the workaround of polling the EC manually on wake. The loop which clears the stale events is based on an earlier patch by Lan Tianyu (see referenced attachment). This patch: - Adds a function acpi_ec_clear() which polls the EC for stale _Q events at most ACPI_EC_CLEAR_MAX (currently 100) times. A warning is logged if this limit is reached. - Adds a flag EC_FLAGS_CLEAR_ON_RESUME which is set to 1 if the DMI system vendor is Samsung. This check could be replaced by several more specific DMI vendor/product pairs, but it's likely that the bug affects more Samsung products than just the five series mentioned above. Further, it should not be harmful to run acpi_ec_clear() on systems without the bug; it will return immediately after finding no data waiting. - Runs acpi_ec_clear() on initialisation (boot), from acpi_ec_add() - Runs acpi_ec_clear() on wake, from acpi_ec_unblock_transactions() References: https://bugzilla.kernel.org/show_bug.cgi?id=44161 References: https://bugzilla.kernel.org/show_bug.cgi?id=45461 References: https://bugzilla.kernel.org/show_bug.cgi?id=57271 References: https://bugzilla.kernel.org/attachment.cgi?id=126801 Suggested-by: Juan Manuel Cabo <juanmanuel.cabo@gmail.com> Signed-off-by: Kieran Clancy <clancy.kieran@gmail.com> Reviewed-by: Lan Tianyu <tianyu.lan@intel.com> Reviewed-by: Dennis Jansen <dennis.jansen@web.de> Tested-by: Kieran Clancy <clancy.kieran@gmail.com> Tested-by: Juan Manuel Cabo <juanmanuel.cabo@gmail.com> Tested-by: Dennis Jansen <dennis.jansen@web.de> Tested-by: Maurizio D'Addona <mauritiusdadd@gmail.com> Tested-by: San Zamoyski <san@plusnet.pl> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-02-28 22:12:28 +08:00
#define ACPI_EC_CLEAR_MAX 100 /* Maximum number of events to query
* when trying to clear the EC */
#define ACPI_EC_MAX_QUERIES 16 /* Maximum number of parallel queries */
enum {
EC_FLAGS_QUERY_ENABLED, /* Query is enabled */
EC_FLAGS_QUERY_PENDING, /* Query is pending */
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
EC_FLAGS_QUERY_GUARDING, /* Guard for SCI_EVT check */
EC_FLAGS_GPE_HANDLER_INSTALLED, /* GPE handler installed */
EC_FLAGS_EC_HANDLER_INSTALLED, /* OpReg handler installed */
EC_FLAGS_EVT_HANDLER_INSTALLED, /* _Qxx handlers installed */
EC_FLAGS_STARTED, /* Driver is started */
EC_FLAGS_STOPPED, /* Driver is stopped */
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
EC_FLAGS_COMMAND_STORM, /* GPE storms occurred to the
* current command processing */
};
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
#define ACPI_EC_COMMAND_POLL 0x01 /* Available for command byte */
#define ACPI_EC_COMMAND_COMPLETE 0x02 /* Completed last byte */
/* ec.c is compiled in acpi namespace so this shows up as acpi.ec_delay param */
static unsigned int ec_delay __read_mostly = ACPI_EC_DELAY;
module_param(ec_delay, uint, 0644);
MODULE_PARM_DESC(ec_delay, "Timeout(ms) waited until an EC command completes");
static unsigned int ec_max_queries __read_mostly = ACPI_EC_MAX_QUERIES;
module_param(ec_max_queries, uint, 0644);
MODULE_PARM_DESC(ec_max_queries, "Maximum parallel _Qxx evaluations");
ACPI / EC: Add module params for polling modes. We have 2 polling modes in the EC driver: 1. busy polling: originally used for the MSI quirks. udelay() is used to perform register access guarding. 2. wait polling: normal code path uses wait_event_timeout() and it can be woken up as soon as the transaction is completed in the interrupt mode. It also contains the register acces guarding logic in case the interrupt doesn't arrive and the EC driver is about to advance the transaction in task context (the polling mode). The wait polling is useful for interrupt mode to allow other tasks to use the CPU during the wait. But for the polling mode, the busy polling takes less time than the wait polling, because if no interrupt arrives, the wait polling has to wait the minimal HZ interval. We have a new use case for using the busy polling mode. Some GPIO drivers initialize PIN configuration which cause a GPIO multiplexed EC GPE to be disabled out of the GPE register's control. Busy polling mode is useful here as it takes less time than the wait polling. But the guarding logic prevents it from responding even faster. We should spinning around the EC status rather than spinning around the nop execution lasted a determined period. This patch introduces 2 module params for the polling mode switch and the guard time, so that users can use the busy polling mode without the guarding in case the guarding is not necessary. This is an example to use the 2 module params for this purpose: acpi.ec_busy_polling acpi.ec_polling_guard=0 We've tested the patch on a test platform. The platform suffers from such kind of the GPIO PIN issue. The GPIO driver resets all PIN configuration and after that, EC interrupt cannot arrive because of the multiplexing. Then the platform suffers from a long delay carried out by the wait_event_timeout() as all further EC transactions will run in the polling mode. We switched the EC driver to use the busy polling mechanism instead of the wait timeout polling mechanism and the delay is still high: [ 44.283005] calling PNP0C0B:00+ @ 1305, parent: platform [ 44.417548] call PNP0C0B:00+ returned 0 after 131323 usecs And this patch can significantly reduce the delay: [ 44.502625] calling PNP0C0B:00+ @ 1308, parent: platform [ 44.503760] call PNP0C0B:00+ returned 0 after 1103 usecs Tested-by: Chen Yu <yu.c.chen@intel.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:48 +08:00
static bool ec_busy_polling __read_mostly;
module_param(ec_busy_polling, bool, 0644);
MODULE_PARM_DESC(ec_busy_polling, "Use busy polling to advance EC transaction");
static unsigned int ec_polling_guard __read_mostly = ACPI_EC_UDELAY_POLL;
module_param(ec_polling_guard, uint, 0644);
MODULE_PARM_DESC(ec_polling_guard, "Guard time(us) between EC accesses in polling modes");
ACPI / EC: Fix EC_FLAGS_QUERY_HANDSHAKE platforms using new event clearing timing. It is reported that on several platforms, EC firmware will not respond non-expected QR_EC (see EC_FLAGS_QUERY_HANDSHAKE, only write QR_EC when SCI_EVT is set). Unfortunately, ACPI specification doesn't define when the SCI_EVT should be cleared by the firmware, thus the original implementation queued up second QR_EC right after writing QR_EC command and before reading the returned event value as at that time the SCI_EVT is ensured not cleared. This behavior is also based on the assumption that the firmware should be able to return 0x00 to indicate "no outstanding event". This behavior did fix issues on Samsung platforms where the spurious query value of 0x00 is supported and didn't break platforms in my test queue. But recently, specific Acer, Asus, Lenovo platforms keep on blaming this change. This patch changes the behavior to re-check the SCI_EVT a bit later and removes EC_FLAGS_QUERY_HANDSHAKE quirks, hoping this is the Windows compliant EC driver behavior. In order to be robust to the possible regressions, instead of removing the quirk directly, this patch keeps the quirk code, removes the quirk users and keeps old behavior for Samsung platforms. Cc: 3.16+ <stable@vger.kernel.org> # 3.16+ Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:45 +08:00
static unsigned int ec_event_clearing __read_mostly = ACPI_EC_EVT_TIMING_QUERY;
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
/*
* If the number of false interrupts per one transaction exceeds
* this threshold, will think there is a GPE storm happened and
* will disable the GPE for normal transaction.
*/
static unsigned int ec_storm_threshold __read_mostly = 8;
module_param(ec_storm_threshold, uint, 0644);
MODULE_PARM_DESC(ec_storm_threshold, "Maxim false GPE numbers not considered as GPE storm");
static bool ec_freeze_events __read_mostly = false;
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
module_param(ec_freeze_events, bool, 0644);
MODULE_PARM_DESC(ec_freeze_events, "Disabling event handling during suspend/resume");
static bool ec_no_wakeup __read_mostly;
module_param(ec_no_wakeup, bool, 0644);
MODULE_PARM_DESC(ec_no_wakeup, "Do not wake up from suspend-to-idle");
struct acpi_ec_query_handler {
struct list_head node;
acpi_ec_query_func func;
acpi_handle handle;
void *data;
u8 query_bit;
struct kref kref;
};
struct transaction {
const u8 *wdata;
u8 *rdata;
unsigned short irq_count;
u8 command;
u8 wi;
u8 ri;
u8 wlen;
u8 rlen;
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
u8 flags;
};
struct acpi_ec_query {
struct transaction transaction;
struct work_struct work;
struct acpi_ec_query_handler *handler;
};
static int acpi_ec_query(struct acpi_ec *ec, u8 *data);
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
static void advance_transaction(struct acpi_ec *ec);
static void acpi_ec_event_handler(struct work_struct *work);
static void acpi_ec_event_processor(struct work_struct *work);
ACPI / EC: Fix issues related to the SCI_EVT handling This patch fixes 2 issues related to the draining behavior. But it doesn't implement the draining support, it only cleans up code so that further draining support is possible. The draining behavior is expected by some platforms (for example, Samsung) where SCI_EVT is set only once for a set of events and might be cleared for the very first QR_EC command issued after SCI_EVT is set. EC firmware on such platforms will return 0x00 to indicate "no outstanding event". Thus after seeing an SCI_EVT indication, EC driver need to fetch events until 0x00 returned (see acpi_ec_clear()). Issue 1 - acpi_ec_submit_query(): It's reported on Samsung laptops that SCI_EVT isn't checked when the transactions are advanced in ec_poll(), which leads to SCI_EVT triggering source lost: If no EC GPE IRQs are arrived after that, EC driver cannot detect this event and handle it. See comment 244/247 for kernel bugzilla 44161. This patch fixes this issue by moving SCI_EVT checks into advance_transaction(). So that SCI_EVT is checked each time we are going to handle the EC firmware indications. And this check will happen for both IRQ context and task context. Since after doing that, SCI_EVT is also checked after completing a transaction, ec_check_sci() and ec_check_sci_sync() can be removed. Issue 2 - acpi_ec_complete_query(): We expect to clear EC_FLAGS_QUERY_PENDING to allow queuing another draining QR_EC after writing a QR_EC command and before reading the event. After reading the event, SCI_EVT might be cleared by the firmware, thus it may not be possible to queue such a draining QR_EC at that time. But putting the EC_FLAGS_QUERY_PENDING clearing code after start_transaction() is wrong as there are chances that after start_transaction(), QR_EC can fail to be sent. If this happens, EC_FLAG_QUERY_PENDING will be cleared earlier. As a consequence, the draining QR_EC will also be queued earlier than expected. This patch also moves this code into advance_transaction() where QR_EC is just sent (ACPI_EC_COMMAND_POLL flagged) to fix this issue. Notes: 1. After introducing the 2 SCI_EVT related handlings into advance_transaction(), a next QR_EC can be queued right after writing the current QR_EC command and before reading the event. But this still hasn't implemented the draining behavior as the draining support requires: If a previous returned event value isn't 0x00, a draining QR_EC need to be issued even when SCI_EVT isn't set. 2. In this patch, acpi_os_execute() is also converted into a seperate work item to avoid invoking kmalloc() in the atomic context. We can do this because of the previous global lock fix. 3. Originally, EC_FLAGS_EVENT_PENDING is also used to avoid queuing up multiple work items (created by acpi_os_execute()), this can be covered by only using a single work item. But this patch still keeps this flag as there are different usages in the driver initialization steps relying on this flag. Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-by: Kieran Clancy <clancy.kieran@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-01-14 19:28:47 +08:00
struct acpi_ec *boot_ec, *first_ec;
EXPORT_SYMBOL(first_ec);
static bool boot_ec_is_ecdt = false;
static struct workqueue_struct *ec_query_wq;
static int EC_FLAGS_QUERY_HANDSHAKE; /* Needs QR_EC issued when SCI_EVT set */
ACPI 2.0 / ECDT: Remove early namespace reference from EC All operation region accesses are allowed by AML interpreter when AML is executed, so actually BIOSen are responsible to avoid the operation region accesses in AML before OSPM has prepared an operation region driver. This is done via _REG control method. So AML code normally sets a global named object REGC to 1 when _REG(3, 1) is evaluated. Then what is ECDT? Quoting from ACPI spec 6.0, 5.2.15 Embedded Controller Boot Resources Table (ECDT): "The presence of this table allows OSPM to provide Embedded Controller operation region space access before the namespace has been evaluated." Spec also suggests a compatible mean to indicate the early EC access availability: Device (EC) { Name (REGC, Ones) Method (_REG, 2) { If (LEqual (Arg0, 3)) { Store (Arg1, REGC) } } Method (ECAV) { If (LEqual (REGC, Ones)) { If (LGreaterEqual (_REV, 2)) { Return (One) } Else { Return (Zero) } } Else { Return (REGC) } } } In this way, it allows EC accesses to happen before EC._REG(3, 1) is invoked. But ECAV is not the only way practical BIOSen using to indicate the early EC access availibility, the known variations include: 1. Setting REGC to One in \_SB._INI when _REV >= 2. Since \_SB._INI is the first control method evaluated by OSPM during the enumeration, this allows EC accesses to happen for the entire enumeration process before the namespace EC is enumerated. 2. Initialize REGC to One by default, this even allows EC accesses to happen during the table loading. Linux is now broken around ECDT support during the long term bug fixing work because it has merged many wrong ECDT bug fixes (see details below). Linux currently uses namespace EC's settings instead of ECDT settings when ECDT is detected. This apparently will result in namespace walk and _CRS/_GPE/_REG evaluations. Such stuffs could only happen after namespace is ready, while ECDT is purposely to be used before namespace is ready. The wrong bug fixing story is: 1. Link 1: At Linux ACPI early stages, "no _Lxx/_Exx/_Qxx evaluation can happen before the namespace is ready" are not ensured by ACPICA core and Linux. This is currently ensured by deferred enabling of GPE and defered registering of EC query methods (acpi_ec_register_query_methods). 2. Link 2: Reporters reported buggy ECDTs, expecting quirks for the platform. Originally, the quirk is simple, only doing things with ECDT. Bug 9399 and 12461 are platforms (Asus L4R, Asus M6R, MSI MS-171F) reported to have wrong ECDT IO port addresses, the port addresses are reversed. Bug 11880 is a platform (Asus X50GL) reported to have 0 valued port addresses, we can see that all EC accesses are protected by ECAV on this platform, so actually no early EC accesses is required by this platform. 3. Link 3: But when the bug fixing developer was requested to provide a handy and non-quirk bug fix, he tried to use correct EC settings from namespace and broke the spec purpose. We can even see that the developer was suffered from many regrssions. One interesting one is 14086, where the actual root cause obviously should be: _REG is evaluated too early. But unfortunately, the bug is fixed in a totally wrong way. So everything goes wrong from these commits: Commit: c6cb0e878446c79f42e7833d7bb69ed6bfbb381f Subject: ACPI: EC: Don't trust ECDT tables from ASUS Commit: a5032bfdd9c80e0231a6324661e123818eb46ecd Subject: ACPI: EC: Always parse EC device This patch reverts Linux behavior to simple ECDT quirk support in order to stop early _CRS/_GPE/_REG evaluations. For Bug 9399, 12461, since it is reported that the platforms require early EC accesses, this patch restores the simple ECDT quirks for them. For Bug 11880, since it is not reported that the platform requires early EC accesses and its ACPI tables contain correct ECAV, we choose an ECDT enumeration failure for this platform. Link 1: https://bugzilla.kernel.org/show_bug.cgi?id=9916 http://bugzilla.kernel.org/show_bug.cgi?id=10100 https://lkml.org/lkml/2008/2/25/282 Link 2: https://bugzilla.kernel.org/show_bug.cgi?id=9399 https://bugzilla.kernel.org/show_bug.cgi?id=12461 https://bugzilla.kernel.org/show_bug.cgi?id=11880 Link 3: https://bugzilla.kernel.org/show_bug.cgi?id=11884 https://bugzilla.kernel.org/show_bug.cgi?id=14081 https://bugzilla.kernel.org/show_bug.cgi?id=14086 https://bugzilla.kernel.org/show_bug.cgi?id=14446 Link 4: https://bugzilla.kernel.org/show_bug.cgi?id=112911 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Chris Bainbridge <chris.bainbridge@gmail.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-24 10:42:53 +08:00
static int EC_FLAGS_CORRECT_ECDT; /* Needs ECDT port address correction */
static int EC_FLAGS_IGNORE_DSDT_GPE; /* Needs ECDT GPE as correction setting */
/* --------------------------------------------------------------------------
* Logging/Debugging
* -------------------------------------------------------------------------- */
/*
* Splitters used by the developers to track the boundary of the EC
* handling processes.
*/
#ifdef DEBUG
#define EC_DBG_SEP " "
#define EC_DBG_DRV "+++++"
#define EC_DBG_STM "====="
#define EC_DBG_REQ "*****"
#define EC_DBG_EVT "#####"
#else
#define EC_DBG_SEP ""
#define EC_DBG_DRV
#define EC_DBG_STM
#define EC_DBG_REQ
#define EC_DBG_EVT
#endif
#define ec_log_raw(fmt, ...) \
pr_info(fmt "\n", ##__VA_ARGS__)
#define ec_dbg_raw(fmt, ...) \
pr_debug(fmt "\n", ##__VA_ARGS__)
#define ec_log(filter, fmt, ...) \
ec_log_raw(filter EC_DBG_SEP fmt EC_DBG_SEP filter, ##__VA_ARGS__)
#define ec_dbg(filter, fmt, ...) \
ec_dbg_raw(filter EC_DBG_SEP fmt EC_DBG_SEP filter, ##__VA_ARGS__)
#define ec_log_drv(fmt, ...) \
ec_log(EC_DBG_DRV, fmt, ##__VA_ARGS__)
#define ec_dbg_drv(fmt, ...) \
ec_dbg(EC_DBG_DRV, fmt, ##__VA_ARGS__)
#define ec_dbg_stm(fmt, ...) \
ec_dbg(EC_DBG_STM, fmt, ##__VA_ARGS__)
#define ec_dbg_req(fmt, ...) \
ec_dbg(EC_DBG_REQ, fmt, ##__VA_ARGS__)
#define ec_dbg_evt(fmt, ...) \
ec_dbg(EC_DBG_EVT, fmt, ##__VA_ARGS__)
#define ec_dbg_ref(ec, fmt, ...) \
ec_dbg_raw("%lu: " fmt, ec->reference_count, ## __VA_ARGS__)
/* --------------------------------------------------------------------------
* Device Flags
* -------------------------------------------------------------------------- */
static bool acpi_ec_started(struct acpi_ec *ec)
{
return test_bit(EC_FLAGS_STARTED, &ec->flags) &&
!test_bit(EC_FLAGS_STOPPED, &ec->flags);
}
static bool acpi_ec_event_enabled(struct acpi_ec *ec)
{
/*
* There is an OSPM early stage logic. During the early stages
* (boot/resume), OSPMs shouldn't enable the event handling, only
* the EC transactions are allowed to be performed.
*/
if (!test_bit(EC_FLAGS_QUERY_ENABLED, &ec->flags))
return false;
/*
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
* However, disabling the event handling is experimental for late
* stage (suspend), and is controlled by the boot parameter of
* "ec_freeze_events":
* 1. true: The EC event handling is disabled before entering
* the noirq stage.
* 2. false: The EC event handling is automatically disabled as
* soon as the EC driver is stopped.
*/
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
if (ec_freeze_events)
return acpi_ec_started(ec);
else
return test_bit(EC_FLAGS_STARTED, &ec->flags);
}
static bool acpi_ec_flushed(struct acpi_ec *ec)
{
return ec->reference_count == 1;
}
/* --------------------------------------------------------------------------
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
* EC Registers
* -------------------------------------------------------------------------- */
static inline u8 acpi_ec_read_status(struct acpi_ec *ec)
{
u8 x = inb(ec->command_addr);
ec_dbg_raw("EC_SC(R) = 0x%2.2x "
"SCI_EVT=%d BURST=%d CMD=%d IBF=%d OBF=%d",
x,
!!(x & ACPI_EC_FLAG_SCI),
!!(x & ACPI_EC_FLAG_BURST),
!!(x & ACPI_EC_FLAG_CMD),
!!(x & ACPI_EC_FLAG_IBF),
!!(x & ACPI_EC_FLAG_OBF));
return x;
}
static inline u8 acpi_ec_read_data(struct acpi_ec *ec)
{
u8 x = inb(ec->data_addr);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
ec->timestamp = jiffies;
ec_dbg_raw("EC_DATA(R) = 0x%2.2x", x);
return x;
}
static inline void acpi_ec_write_cmd(struct acpi_ec *ec, u8 command)
{
ec_dbg_raw("EC_SC(W) = 0x%2.2x", command);
outb(command, ec->command_addr);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
ec->timestamp = jiffies;
}
static inline void acpi_ec_write_data(struct acpi_ec *ec, u8 data)
{
ec_dbg_raw("EC_DATA(W) = 0x%2.2x", data);
outb(data, ec->data_addr);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
ec->timestamp = jiffies;
}
#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
static const char *acpi_ec_cmd_string(u8 cmd)
{
switch (cmd) {
case 0x80:
return "RD_EC";
case 0x81:
return "WR_EC";
case 0x82:
return "BE_EC";
case 0x83:
return "BD_EC";
case 0x84:
return "QR_EC";
}
return "UNKNOWN";
}
#else
#define acpi_ec_cmd_string(cmd) "UNDEF"
#endif
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
/* --------------------------------------------------------------------------
* GPE Registers
* -------------------------------------------------------------------------- */
static inline bool acpi_ec_is_gpe_raised(struct acpi_ec *ec)
{
acpi_event_status gpe_status = 0;
(void)acpi_get_gpe_status(NULL, ec->gpe, &gpe_status);
return (gpe_status & ACPI_EVENT_FLAG_STATUS_SET) ? true : false;
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
}
static inline void acpi_ec_enable_gpe(struct acpi_ec *ec, bool open)
{
if (open)
acpi_enable_gpe(NULL, ec->gpe);
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
else {
BUG_ON(ec->reference_count < 1);
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
acpi_set_gpe(NULL, ec->gpe, ACPI_GPE_ENABLE);
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
}
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
if (acpi_ec_is_gpe_raised(ec)) {
/*
* On some platforms, EN=1 writes cannot trigger GPE. So
* software need to manually trigger a pseudo GPE event on
* EN=1 writes.
*/
ec_dbg_raw("Polling quirk");
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
advance_transaction(ec);
}
}
static inline void acpi_ec_disable_gpe(struct acpi_ec *ec, bool close)
{
if (close)
acpi_disable_gpe(NULL, ec->gpe);
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
else {
BUG_ON(ec->reference_count < 1);
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
acpi_set_gpe(NULL, ec->gpe, ACPI_GPE_DISABLE);
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
}
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
}
static inline void acpi_ec_clear_gpe(struct acpi_ec *ec)
{
/*
* GPE STS is a W1C register, which means:
* 1. Software can clear it without worrying about clearing other
* GPEs' STS bits when the hardware sets them in parallel.
* 2. As long as software can ensure only clearing it when it is
* set, hardware won't set it in parallel.
* So software can clear GPE in any contexts.
* Warning: do not move the check into advance_transaction() as the
* EC commands will be sent without GPE raised.
*/
if (!acpi_ec_is_gpe_raised(ec))
return;
acpi_clear_gpe(NULL, ec->gpe);
}
/* --------------------------------------------------------------------------
* Transaction Management
* -------------------------------------------------------------------------- */
static void acpi_ec_submit_request(struct acpi_ec *ec)
{
ec->reference_count++;
if (test_bit(EC_FLAGS_GPE_HANDLER_INSTALLED, &ec->flags) &&
ec->reference_count == 1)
acpi_ec_enable_gpe(ec, true);
}
static void acpi_ec_complete_request(struct acpi_ec *ec)
{
bool flushed = false;
ec->reference_count--;
if (test_bit(EC_FLAGS_GPE_HANDLER_INSTALLED, &ec->flags) &&
ec->reference_count == 0)
acpi_ec_disable_gpe(ec, true);
flushed = acpi_ec_flushed(ec);
if (flushed)
wake_up(&ec->wait);
}
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
static void acpi_ec_set_storm(struct acpi_ec *ec, u8 flag)
{
if (!test_bit(flag, &ec->flags)) {
acpi_ec_disable_gpe(ec, false);
ec_dbg_drv("Polling enabled");
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
set_bit(flag, &ec->flags);
}
}
static void acpi_ec_clear_storm(struct acpi_ec *ec, u8 flag)
{
if (test_bit(flag, &ec->flags)) {
clear_bit(flag, &ec->flags);
acpi_ec_enable_gpe(ec, false);
ec_dbg_drv("Polling disabled");
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
}
}
/*
* acpi_ec_submit_flushable_request() - Increase the reference count unless
* the flush operation is not in
* progress
* @ec: the EC device
*
* This function must be used before taking a new action that should hold
* the reference count. If this function returns false, then the action
* must be discarded or it will prevent the flush operation from being
* completed.
*/
static bool acpi_ec_submit_flushable_request(struct acpi_ec *ec)
{
if (!acpi_ec_started(ec))
return false;
acpi_ec_submit_request(ec);
return true;
}
static void acpi_ec_submit_query(struct acpi_ec *ec)
ACPI / EC: Fix issues related to the SCI_EVT handling This patch fixes 2 issues related to the draining behavior. But it doesn't implement the draining support, it only cleans up code so that further draining support is possible. The draining behavior is expected by some platforms (for example, Samsung) where SCI_EVT is set only once for a set of events and might be cleared for the very first QR_EC command issued after SCI_EVT is set. EC firmware on such platforms will return 0x00 to indicate "no outstanding event". Thus after seeing an SCI_EVT indication, EC driver need to fetch events until 0x00 returned (see acpi_ec_clear()). Issue 1 - acpi_ec_submit_query(): It's reported on Samsung laptops that SCI_EVT isn't checked when the transactions are advanced in ec_poll(), which leads to SCI_EVT triggering source lost: If no EC GPE IRQs are arrived after that, EC driver cannot detect this event and handle it. See comment 244/247 for kernel bugzilla 44161. This patch fixes this issue by moving SCI_EVT checks into advance_transaction(). So that SCI_EVT is checked each time we are going to handle the EC firmware indications. And this check will happen for both IRQ context and task context. Since after doing that, SCI_EVT is also checked after completing a transaction, ec_check_sci() and ec_check_sci_sync() can be removed. Issue 2 - acpi_ec_complete_query(): We expect to clear EC_FLAGS_QUERY_PENDING to allow queuing another draining QR_EC after writing a QR_EC command and before reading the event. After reading the event, SCI_EVT might be cleared by the firmware, thus it may not be possible to queue such a draining QR_EC at that time. But putting the EC_FLAGS_QUERY_PENDING clearing code after start_transaction() is wrong as there are chances that after start_transaction(), QR_EC can fail to be sent. If this happens, EC_FLAG_QUERY_PENDING will be cleared earlier. As a consequence, the draining QR_EC will also be queued earlier than expected. This patch also moves this code into advance_transaction() where QR_EC is just sent (ACPI_EC_COMMAND_POLL flagged) to fix this issue. Notes: 1. After introducing the 2 SCI_EVT related handlings into advance_transaction(), a next QR_EC can be queued right after writing the current QR_EC command and before reading the event. But this still hasn't implemented the draining behavior as the draining support requires: If a previous returned event value isn't 0x00, a draining QR_EC need to be issued even when SCI_EVT isn't set. 2. In this patch, acpi_os_execute() is also converted into a seperate work item to avoid invoking kmalloc() in the atomic context. We can do this because of the previous global lock fix. 3. Originally, EC_FLAGS_EVENT_PENDING is also used to avoid queuing up multiple work items (created by acpi_os_execute()), this can be covered by only using a single work item. But this patch still keeps this flag as there are different usages in the driver initialization steps relying on this flag. Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-by: Kieran Clancy <clancy.kieran@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-01-14 19:28:47 +08:00
{
ACPI: EC: Fix an EC event IRQ storming issue The EC event IRQ (SCI_EVT) can only be handled by submitting QR_EC. As the EC driver handles SCI_EVT in a workqueue, after SCI_EVT is flagged and before QR_EC is submitted, there is a period risking IRQ storming. EC IRQ must be masked for this period but linux EC driver never does so. No end user notices the IRQ storming and no developer fixes this known issue because: 1. The EC IRQ is always edge triggered GPE, and 2. The kernel can execute no-op EC IRQ handler very fast. For edge-triggered EC GPE platforms, it is only reported of post-resume EC event lost issues, there won't be an IRQ storming. For level triggered EC GPE platforms, fortunately the kernel is always fast enough to execute such a no-op EC IRQ handler so that the IRQ handler won't be accumulated to starve the task contexts, causing a real IRQ storming. But the IRQ storming actually can still happen when: 1. The EC IRQ performs like level triggered GPE, and 2. The kernel EC debugging log is turned on but the console is slow enough. There are more and more platforms using EC GPE as wake GPE where the EC GPE is likely designed as level triggered. Then when EC debugging log is enabled, the EC IRQ handler is no longer a no-op but dumps IRQ status to the consoles. If the consoles are slow enough, the EC IRQs can arrive much faster than executing the handler. Finally the accumulated EC event IRQ handlers starve the task contexts, causing the IRQ storming to occur, and the kernel hangs can be observed during boot/resume. This patch fixes this issue by masking EC IRQ for this period: 1. Begins when there is an SCI_EVT IRQ pending, and 2. Ends when there is a QR_EC completed (SCI_EVT acknowledged). Tested-by: Wang Wendy <wendy.wang@intel.com> Tested-by: Feng Chenzhou <chenzhoux.feng@intel.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-06-14 13:59:09 +08:00
acpi_ec_set_storm(ec, EC_FLAGS_COMMAND_STORM);
if (!acpi_ec_event_enabled(ec))
return;
if (!test_and_set_bit(EC_FLAGS_QUERY_PENDING, &ec->flags)) {
ec_dbg_evt("Command(%s) submitted/blocked",
acpi_ec_cmd_string(ACPI_EC_COMMAND_QUERY));
ec->nr_pending_queries++;
ACPI / EC: Fix issues related to the SCI_EVT handling This patch fixes 2 issues related to the draining behavior. But it doesn't implement the draining support, it only cleans up code so that further draining support is possible. The draining behavior is expected by some platforms (for example, Samsung) where SCI_EVT is set only once for a set of events and might be cleared for the very first QR_EC command issued after SCI_EVT is set. EC firmware on such platforms will return 0x00 to indicate "no outstanding event". Thus after seeing an SCI_EVT indication, EC driver need to fetch events until 0x00 returned (see acpi_ec_clear()). Issue 1 - acpi_ec_submit_query(): It's reported on Samsung laptops that SCI_EVT isn't checked when the transactions are advanced in ec_poll(), which leads to SCI_EVT triggering source lost: If no EC GPE IRQs are arrived after that, EC driver cannot detect this event and handle it. See comment 244/247 for kernel bugzilla 44161. This patch fixes this issue by moving SCI_EVT checks into advance_transaction(). So that SCI_EVT is checked each time we are going to handle the EC firmware indications. And this check will happen for both IRQ context and task context. Since after doing that, SCI_EVT is also checked after completing a transaction, ec_check_sci() and ec_check_sci_sync() can be removed. Issue 2 - acpi_ec_complete_query(): We expect to clear EC_FLAGS_QUERY_PENDING to allow queuing another draining QR_EC after writing a QR_EC command and before reading the event. After reading the event, SCI_EVT might be cleared by the firmware, thus it may not be possible to queue such a draining QR_EC at that time. But putting the EC_FLAGS_QUERY_PENDING clearing code after start_transaction() is wrong as there are chances that after start_transaction(), QR_EC can fail to be sent. If this happens, EC_FLAG_QUERY_PENDING will be cleared earlier. As a consequence, the draining QR_EC will also be queued earlier than expected. This patch also moves this code into advance_transaction() where QR_EC is just sent (ACPI_EC_COMMAND_POLL flagged) to fix this issue. Notes: 1. After introducing the 2 SCI_EVT related handlings into advance_transaction(), a next QR_EC can be queued right after writing the current QR_EC command and before reading the event. But this still hasn't implemented the draining behavior as the draining support requires: If a previous returned event value isn't 0x00, a draining QR_EC need to be issued even when SCI_EVT isn't set. 2. In this patch, acpi_os_execute() is also converted into a seperate work item to avoid invoking kmalloc() in the atomic context. We can do this because of the previous global lock fix. 3. Originally, EC_FLAGS_EVENT_PENDING is also used to avoid queuing up multiple work items (created by acpi_os_execute()), this can be covered by only using a single work item. But this patch still keeps this flag as there are different usages in the driver initialization steps relying on this flag. Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-by: Kieran Clancy <clancy.kieran@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-01-14 19:28:47 +08:00
schedule_work(&ec->work);
}
}
static void acpi_ec_complete_query(struct acpi_ec *ec)
ACPI / EC: Fix issues related to the SCI_EVT handling This patch fixes 2 issues related to the draining behavior. But it doesn't implement the draining support, it only cleans up code so that further draining support is possible. The draining behavior is expected by some platforms (for example, Samsung) where SCI_EVT is set only once for a set of events and might be cleared for the very first QR_EC command issued after SCI_EVT is set. EC firmware on such platforms will return 0x00 to indicate "no outstanding event". Thus after seeing an SCI_EVT indication, EC driver need to fetch events until 0x00 returned (see acpi_ec_clear()). Issue 1 - acpi_ec_submit_query(): It's reported on Samsung laptops that SCI_EVT isn't checked when the transactions are advanced in ec_poll(), which leads to SCI_EVT triggering source lost: If no EC GPE IRQs are arrived after that, EC driver cannot detect this event and handle it. See comment 244/247 for kernel bugzilla 44161. This patch fixes this issue by moving SCI_EVT checks into advance_transaction(). So that SCI_EVT is checked each time we are going to handle the EC firmware indications. And this check will happen for both IRQ context and task context. Since after doing that, SCI_EVT is also checked after completing a transaction, ec_check_sci() and ec_check_sci_sync() can be removed. Issue 2 - acpi_ec_complete_query(): We expect to clear EC_FLAGS_QUERY_PENDING to allow queuing another draining QR_EC after writing a QR_EC command and before reading the event. After reading the event, SCI_EVT might be cleared by the firmware, thus it may not be possible to queue such a draining QR_EC at that time. But putting the EC_FLAGS_QUERY_PENDING clearing code after start_transaction() is wrong as there are chances that after start_transaction(), QR_EC can fail to be sent. If this happens, EC_FLAG_QUERY_PENDING will be cleared earlier. As a consequence, the draining QR_EC will also be queued earlier than expected. This patch also moves this code into advance_transaction() where QR_EC is just sent (ACPI_EC_COMMAND_POLL flagged) to fix this issue. Notes: 1. After introducing the 2 SCI_EVT related handlings into advance_transaction(), a next QR_EC can be queued right after writing the current QR_EC command and before reading the event. But this still hasn't implemented the draining behavior as the draining support requires: If a previous returned event value isn't 0x00, a draining QR_EC need to be issued even when SCI_EVT isn't set. 2. In this patch, acpi_os_execute() is also converted into a seperate work item to avoid invoking kmalloc() in the atomic context. We can do this because of the previous global lock fix. 3. Originally, EC_FLAGS_EVENT_PENDING is also used to avoid queuing up multiple work items (created by acpi_os_execute()), this can be covered by only using a single work item. But this patch still keeps this flag as there are different usages in the driver initialization steps relying on this flag. Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-by: Kieran Clancy <clancy.kieran@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-01-14 19:28:47 +08:00
{
ACPI: EC: Fix an EC event IRQ storming issue The EC event IRQ (SCI_EVT) can only be handled by submitting QR_EC. As the EC driver handles SCI_EVT in a workqueue, after SCI_EVT is flagged and before QR_EC is submitted, there is a period risking IRQ storming. EC IRQ must be masked for this period but linux EC driver never does so. No end user notices the IRQ storming and no developer fixes this known issue because: 1. The EC IRQ is always edge triggered GPE, and 2. The kernel can execute no-op EC IRQ handler very fast. For edge-triggered EC GPE platforms, it is only reported of post-resume EC event lost issues, there won't be an IRQ storming. For level triggered EC GPE platforms, fortunately the kernel is always fast enough to execute such a no-op EC IRQ handler so that the IRQ handler won't be accumulated to starve the task contexts, causing a real IRQ storming. But the IRQ storming actually can still happen when: 1. The EC IRQ performs like level triggered GPE, and 2. The kernel EC debugging log is turned on but the console is slow enough. There are more and more platforms using EC GPE as wake GPE where the EC GPE is likely designed as level triggered. Then when EC debugging log is enabled, the EC IRQ handler is no longer a no-op but dumps IRQ status to the consoles. If the consoles are slow enough, the EC IRQs can arrive much faster than executing the handler. Finally the accumulated EC event IRQ handlers starve the task contexts, causing the IRQ storming to occur, and the kernel hangs can be observed during boot/resume. This patch fixes this issue by masking EC IRQ for this period: 1. Begins when there is an SCI_EVT IRQ pending, and 2. Ends when there is a QR_EC completed (SCI_EVT acknowledged). Tested-by: Wang Wendy <wendy.wang@intel.com> Tested-by: Feng Chenzhou <chenzhoux.feng@intel.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-06-14 13:59:09 +08:00
if (test_and_clear_bit(EC_FLAGS_QUERY_PENDING, &ec->flags))
ec_dbg_evt("Command(%s) unblocked",
acpi_ec_cmd_string(ACPI_EC_COMMAND_QUERY));
ACPI: EC: Fix an EC event IRQ storming issue The EC event IRQ (SCI_EVT) can only be handled by submitting QR_EC. As the EC driver handles SCI_EVT in a workqueue, after SCI_EVT is flagged and before QR_EC is submitted, there is a period risking IRQ storming. EC IRQ must be masked for this period but linux EC driver never does so. No end user notices the IRQ storming and no developer fixes this known issue because: 1. The EC IRQ is always edge triggered GPE, and 2. The kernel can execute no-op EC IRQ handler very fast. For edge-triggered EC GPE platforms, it is only reported of post-resume EC event lost issues, there won't be an IRQ storming. For level triggered EC GPE platforms, fortunately the kernel is always fast enough to execute such a no-op EC IRQ handler so that the IRQ handler won't be accumulated to starve the task contexts, causing a real IRQ storming. But the IRQ storming actually can still happen when: 1. The EC IRQ performs like level triggered GPE, and 2. The kernel EC debugging log is turned on but the console is slow enough. There are more and more platforms using EC GPE as wake GPE where the EC GPE is likely designed as level triggered. Then when EC debugging log is enabled, the EC IRQ handler is no longer a no-op but dumps IRQ status to the consoles. If the consoles are slow enough, the EC IRQs can arrive much faster than executing the handler. Finally the accumulated EC event IRQ handlers starve the task contexts, causing the IRQ storming to occur, and the kernel hangs can be observed during boot/resume. This patch fixes this issue by masking EC IRQ for this period: 1. Begins when there is an SCI_EVT IRQ pending, and 2. Ends when there is a QR_EC completed (SCI_EVT acknowledged). Tested-by: Wang Wendy <wendy.wang@intel.com> Tested-by: Feng Chenzhou <chenzhoux.feng@intel.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-06-14 13:59:09 +08:00
acpi_ec_clear_storm(ec, EC_FLAGS_COMMAND_STORM);
ACPI / EC: Fix issues related to the SCI_EVT handling This patch fixes 2 issues related to the draining behavior. But it doesn't implement the draining support, it only cleans up code so that further draining support is possible. The draining behavior is expected by some platforms (for example, Samsung) where SCI_EVT is set only once for a set of events and might be cleared for the very first QR_EC command issued after SCI_EVT is set. EC firmware on such platforms will return 0x00 to indicate "no outstanding event". Thus after seeing an SCI_EVT indication, EC driver need to fetch events until 0x00 returned (see acpi_ec_clear()). Issue 1 - acpi_ec_submit_query(): It's reported on Samsung laptops that SCI_EVT isn't checked when the transactions are advanced in ec_poll(), which leads to SCI_EVT triggering source lost: If no EC GPE IRQs are arrived after that, EC driver cannot detect this event and handle it. See comment 244/247 for kernel bugzilla 44161. This patch fixes this issue by moving SCI_EVT checks into advance_transaction(). So that SCI_EVT is checked each time we are going to handle the EC firmware indications. And this check will happen for both IRQ context and task context. Since after doing that, SCI_EVT is also checked after completing a transaction, ec_check_sci() and ec_check_sci_sync() can be removed. Issue 2 - acpi_ec_complete_query(): We expect to clear EC_FLAGS_QUERY_PENDING to allow queuing another draining QR_EC after writing a QR_EC command and before reading the event. After reading the event, SCI_EVT might be cleared by the firmware, thus it may not be possible to queue such a draining QR_EC at that time. But putting the EC_FLAGS_QUERY_PENDING clearing code after start_transaction() is wrong as there are chances that after start_transaction(), QR_EC can fail to be sent. If this happens, EC_FLAG_QUERY_PENDING will be cleared earlier. As a consequence, the draining QR_EC will also be queued earlier than expected. This patch also moves this code into advance_transaction() where QR_EC is just sent (ACPI_EC_COMMAND_POLL flagged) to fix this issue. Notes: 1. After introducing the 2 SCI_EVT related handlings into advance_transaction(), a next QR_EC can be queued right after writing the current QR_EC command and before reading the event. But this still hasn't implemented the draining behavior as the draining support requires: If a previous returned event value isn't 0x00, a draining QR_EC need to be issued even when SCI_EVT isn't set. 2. In this patch, acpi_os_execute() is also converted into a seperate work item to avoid invoking kmalloc() in the atomic context. We can do this because of the previous global lock fix. 3. Originally, EC_FLAGS_EVENT_PENDING is also used to avoid queuing up multiple work items (created by acpi_os_execute()), this can be covered by only using a single work item. But this patch still keeps this flag as there are different usages in the driver initialization steps relying on this flag. Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-by: Kieran Clancy <clancy.kieran@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-01-14 19:28:47 +08:00
}
static inline void __acpi_ec_enable_event(struct acpi_ec *ec)
{
if (!test_and_set_bit(EC_FLAGS_QUERY_ENABLED, &ec->flags))
ec_log_drv("event unblocked");
if (!test_bit(EC_FLAGS_QUERY_PENDING, &ec->flags))
advance_transaction(ec);
}
static inline void __acpi_ec_disable_event(struct acpi_ec *ec)
{
if (test_and_clear_bit(EC_FLAGS_QUERY_ENABLED, &ec->flags))
ec_log_drv("event blocked");
}
static void acpi_ec_enable_event(struct acpi_ec *ec)
{
unsigned long flags;
spin_lock_irqsave(&ec->lock, flags);
if (acpi_ec_started(ec))
__acpi_ec_enable_event(ec);
spin_unlock_irqrestore(&ec->lock, flags);
}
#ifdef CONFIG_PM_SLEEP
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
static bool acpi_ec_query_flushed(struct acpi_ec *ec)
{
bool flushed;
unsigned long flags;
spin_lock_irqsave(&ec->lock, flags);
flushed = !ec->nr_pending_queries;
spin_unlock_irqrestore(&ec->lock, flags);
return flushed;
}
static void __acpi_ec_flush_event(struct acpi_ec *ec)
{
/*
* When ec_freeze_events is true, we need to flush events in
* the proper position before entering the noirq stage.
*/
wait_event(ec->wait, acpi_ec_query_flushed(ec));
if (ec_query_wq)
flush_workqueue(ec_query_wq);
}
static void acpi_ec_disable_event(struct acpi_ec *ec)
{
unsigned long flags;
spin_lock_irqsave(&ec->lock, flags);
__acpi_ec_disable_event(ec);
spin_unlock_irqrestore(&ec->lock, flags);
__acpi_ec_flush_event(ec);
}
void acpi_ec_flush_work(void)
{
if (first_ec)
__acpi_ec_flush_event(first_ec);
flush_scheduled_work();
}
#endif /* CONFIG_PM_SLEEP */
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
static bool acpi_ec_guard_event(struct acpi_ec *ec)
{
bool guarded = true;
unsigned long flags;
spin_lock_irqsave(&ec->lock, flags);
/*
* If firmware SCI_EVT clearing timing is "event", we actually
* don't know when the SCI_EVT will be cleared by firmware after
* evaluating _Qxx, so we need to re-check SCI_EVT after waiting an
* acceptable period.
*
* The guarding period begins when EC_FLAGS_QUERY_PENDING is
* flagged, which means SCI_EVT check has just been performed.
* But if the current transaction is ACPI_EC_COMMAND_QUERY, the
* guarding should have already been performed (via
* EC_FLAGS_QUERY_GUARDING) and should not be applied so that the
* ACPI_EC_COMMAND_QUERY transaction can be transitioned into
* ACPI_EC_COMMAND_POLL state immediately.
*/
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
if (ec_event_clearing == ACPI_EC_EVT_TIMING_STATUS ||
ec_event_clearing == ACPI_EC_EVT_TIMING_QUERY ||
!test_bit(EC_FLAGS_QUERY_PENDING, &ec->flags) ||
(ec->curr && ec->curr->command == ACPI_EC_COMMAND_QUERY))
guarded = false;
spin_unlock_irqrestore(&ec->lock, flags);
return guarded;
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
}
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
static int ec_transaction_polled(struct acpi_ec *ec)
{
unsigned long flags;
int ret = 0;
spin_lock_irqsave(&ec->lock, flags);
if (ec->curr && (ec->curr->flags & ACPI_EC_COMMAND_POLL))
ret = 1;
spin_unlock_irqrestore(&ec->lock, flags);
return ret;
}
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
static int ec_transaction_completed(struct acpi_ec *ec)
{
unsigned long flags;
int ret = 0;
spin_lock_irqsave(&ec->lock, flags);
if (ec->curr && (ec->curr->flags & ACPI_EC_COMMAND_COMPLETE))
ret = 1;
spin_unlock_irqrestore(&ec->lock, flags);
return ret;
}
static inline void ec_transaction_transition(struct acpi_ec *ec, unsigned long flag)
{
ec->curr->flags |= flag;
if (ec->curr->command == ACPI_EC_COMMAND_QUERY) {
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
if (ec_event_clearing == ACPI_EC_EVT_TIMING_STATUS &&
flag == ACPI_EC_COMMAND_POLL)
acpi_ec_complete_query(ec);
if (ec_event_clearing == ACPI_EC_EVT_TIMING_QUERY &&
flag == ACPI_EC_COMMAND_COMPLETE)
acpi_ec_complete_query(ec);
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
if (ec_event_clearing == ACPI_EC_EVT_TIMING_EVENT &&
flag == ACPI_EC_COMMAND_COMPLETE)
set_bit(EC_FLAGS_QUERY_GUARDING, &ec->flags);
}
}
static void advance_transaction(struct acpi_ec *ec)
{
struct transaction *t;
ACPI / EC: Avoid race condition related to advance_transaction() The advance_transaction() will be invoked from the IRQ context GPE handler and the task context ec_poll(). The handling of this function is locked so that the EC state machine are ensured to be advanced sequentially. But there is a problem. Before invoking advance_transaction(), EC_SC(R) is read. Then for advance_transaction(), there could be race condition around the lock from both contexts. The first one reading the register could fail this race and when it passes the stale register value to the state machine advancement code, the hardware condition is totally different from when the register is read. And the hardware accesses determined from the wrong hardware status can break the EC state machine. And there could be cases that the functionalities of the platform firmware are seriously affected. For example: 1. When 2 EC_DATA(W) writes compete the IBF=0, the 2nd EC_DATA(W) write may be invalid due to IBF=1 after the 1st EC_DATA(W) write. Then the hardware will either refuse to respond a next EC_SC(W) write of the next command or discard the current WR_EC command when it receives a EC_SC(W) write of the next command. 2. When 1 EC_SC(W) write and 1 EC_DATA(W) write compete the IBF=0, the EC_DATA(W) write may be invalid due to IBF=1 after the EC_SC(W) write. The next EC_DATA(R) could never be responded by the hardware. This is the root cause of the reported issue. Fix this issue by moving the EC_SC(R) access into the lock so that we can ensure that the state machine is advanced consistently. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:17 +08:00
u8 status;
bool wakeup = false;
ec_dbg_stm("%s (%d)", in_interrupt() ? "IRQ" : "TASK",
smp_processor_id());
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
/*
* By always clearing STS before handling all indications, we can
* ensure a hardware STS 0->1 change after this clearing can always
* trigger a GPE interrupt.
*/
acpi_ec_clear_gpe(ec);
ACPI / EC: Avoid race condition related to advance_transaction() The advance_transaction() will be invoked from the IRQ context GPE handler and the task context ec_poll(). The handling of this function is locked so that the EC state machine are ensured to be advanced sequentially. But there is a problem. Before invoking advance_transaction(), EC_SC(R) is read. Then for advance_transaction(), there could be race condition around the lock from both contexts. The first one reading the register could fail this race and when it passes the stale register value to the state machine advancement code, the hardware condition is totally different from when the register is read. And the hardware accesses determined from the wrong hardware status can break the EC state machine. And there could be cases that the functionalities of the platform firmware are seriously affected. For example: 1. When 2 EC_DATA(W) writes compete the IBF=0, the 2nd EC_DATA(W) write may be invalid due to IBF=1 after the 1st EC_DATA(W) write. Then the hardware will either refuse to respond a next EC_SC(W) write of the next command or discard the current WR_EC command when it receives a EC_SC(W) write of the next command. 2. When 1 EC_SC(W) write and 1 EC_DATA(W) write compete the IBF=0, the EC_DATA(W) write may be invalid due to IBF=1 after the EC_SC(W) write. The next EC_DATA(R) could never be responded by the hardware. This is the root cause of the reported issue. Fix this issue by moving the EC_SC(R) access into the lock so that we can ensure that the state machine is advanced consistently. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:17 +08:00
status = acpi_ec_read_status(ec);
t = ec->curr;
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
/*
* Another IRQ or a guarded polling mode advancement is detected,
* the next QR_EC submission is then allowed.
*/
if (!t || !(t->flags & ACPI_EC_COMMAND_POLL)) {
if (ec_event_clearing == ACPI_EC_EVT_TIMING_EVENT &&
(!ec->nr_pending_queries ||
test_bit(EC_FLAGS_QUERY_GUARDING, &ec->flags))) {
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
clear_bit(EC_FLAGS_QUERY_GUARDING, &ec->flags);
acpi_ec_complete_query(ec);
}
}
if (!t)
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
goto err;
if (t->flags & ACPI_EC_COMMAND_POLL) {
if (t->wlen > t->wi) {
if ((status & ACPI_EC_FLAG_IBF) == 0)
acpi_ec_write_data(ec, t->wdata[t->wi++]);
else
goto err;
} else if (t->rlen > t->ri) {
if ((status & ACPI_EC_FLAG_OBF) == 1) {
t->rdata[t->ri++] = acpi_ec_read_data(ec);
if (t->rlen == t->ri) {
ec_transaction_transition(ec, ACPI_EC_COMMAND_COMPLETE);
ACPI / EC: Add support to disallow QR_EC to be issued when SCI_EVT isn't set There is a platform refusing to respond QR_EC when SCI_EVT isn't set (Acer Aspire V5-573G). Currently, we rely on the behaviour that the EC firmware can respond something (for example, 0x00 to indicate "no outstanding events") to QR_EC even when SCI_EVT is not set, but the reporter has complained about AC/battery pluging/unpluging and video brightness change delay on that platform. This is because the work item that has issued QR_EC has to wait until timeout in this case, and the _Qxx method evaluation work item queued after QR_EC one is delayed. It sounds reasonable to fix this issue by: 1. Implementing SCI_EVT sanity check before issuing QR_EC in the EC driver's main state machine. 2. Moving QR_EC issuing out of the work queue used by _Qxx evaluation to a seperate IRQ handling thread. This patch fixes this issue using solution 1. By disallowing QR_EC to be issued when SCI_EVT isn't set, we are able to handle such platform in the EC driver's main state machine. This patch enhances the state machine in this way to survive with such malfunctioning EC firmware. Note that this patch can also fix CLEAR_ON_RESUME quirk which also relies on the assumption that the platforms are able to respond even when SCI_EVT isn't set. Fixes: c0d653412fc8 ACPI / EC: Fix race condition in ec_transaction_completed() Link: https://bugzilla.kernel.org/show_bug.cgi?id=82611 Reported-and-tested-by: Alexander Mezin <mezin.alexander@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: 3.16+ <stable@vger.kernel.org> # 3.16+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-08-21 14:41:13 +08:00
if (t->command == ACPI_EC_COMMAND_QUERY)
ec_dbg_evt("Command(%s) completed by hardware",
acpi_ec_cmd_string(ACPI_EC_COMMAND_QUERY));
wakeup = true;
}
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
} else
goto err;
} else if (t->wlen == t->wi &&
(status & ACPI_EC_FLAG_IBF) == 0) {
ec_transaction_transition(ec, ACPI_EC_COMMAND_COMPLETE);
wakeup = true;
}
goto out;
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
} else {
if (EC_FLAGS_QUERY_HANDSHAKE &&
!(status & ACPI_EC_FLAG_SCI) &&
ACPI / EC: Add support to disallow QR_EC to be issued when SCI_EVT isn't set There is a platform refusing to respond QR_EC when SCI_EVT isn't set (Acer Aspire V5-573G). Currently, we rely on the behaviour that the EC firmware can respond something (for example, 0x00 to indicate "no outstanding events") to QR_EC even when SCI_EVT is not set, but the reporter has complained about AC/battery pluging/unpluging and video brightness change delay on that platform. This is because the work item that has issued QR_EC has to wait until timeout in this case, and the _Qxx method evaluation work item queued after QR_EC one is delayed. It sounds reasonable to fix this issue by: 1. Implementing SCI_EVT sanity check before issuing QR_EC in the EC driver's main state machine. 2. Moving QR_EC issuing out of the work queue used by _Qxx evaluation to a seperate IRQ handling thread. This patch fixes this issue using solution 1. By disallowing QR_EC to be issued when SCI_EVT isn't set, we are able to handle such platform in the EC driver's main state machine. This patch enhances the state machine in this way to survive with such malfunctioning EC firmware. Note that this patch can also fix CLEAR_ON_RESUME quirk which also relies on the assumption that the platforms are able to respond even when SCI_EVT isn't set. Fixes: c0d653412fc8 ACPI / EC: Fix race condition in ec_transaction_completed() Link: https://bugzilla.kernel.org/show_bug.cgi?id=82611 Reported-and-tested-by: Alexander Mezin <mezin.alexander@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: 3.16+ <stable@vger.kernel.org> # 3.16+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-08-21 14:41:13 +08:00
(t->command == ACPI_EC_COMMAND_QUERY)) {
ec_transaction_transition(ec, ACPI_EC_COMMAND_POLL);
ACPI / EC: Add support to disallow QR_EC to be issued when SCI_EVT isn't set There is a platform refusing to respond QR_EC when SCI_EVT isn't set (Acer Aspire V5-573G). Currently, we rely on the behaviour that the EC firmware can respond something (for example, 0x00 to indicate "no outstanding events") to QR_EC even when SCI_EVT is not set, but the reporter has complained about AC/battery pluging/unpluging and video brightness change delay on that platform. This is because the work item that has issued QR_EC has to wait until timeout in this case, and the _Qxx method evaluation work item queued after QR_EC one is delayed. It sounds reasonable to fix this issue by: 1. Implementing SCI_EVT sanity check before issuing QR_EC in the EC driver's main state machine. 2. Moving QR_EC issuing out of the work queue used by _Qxx evaluation to a seperate IRQ handling thread. This patch fixes this issue using solution 1. By disallowing QR_EC to be issued when SCI_EVT isn't set, we are able to handle such platform in the EC driver's main state machine. This patch enhances the state machine in this way to survive with such malfunctioning EC firmware. Note that this patch can also fix CLEAR_ON_RESUME quirk which also relies on the assumption that the platforms are able to respond even when SCI_EVT isn't set. Fixes: c0d653412fc8 ACPI / EC: Fix race condition in ec_transaction_completed() Link: https://bugzilla.kernel.org/show_bug.cgi?id=82611 Reported-and-tested-by: Alexander Mezin <mezin.alexander@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: 3.16+ <stable@vger.kernel.org> # 3.16+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-08-21 14:41:13 +08:00
t->rdata[t->ri++] = 0x00;
ec_transaction_transition(ec, ACPI_EC_COMMAND_COMPLETE);
ec_dbg_evt("Command(%s) completed by software",
acpi_ec_cmd_string(ACPI_EC_COMMAND_QUERY));
ACPI / EC: Add support to disallow QR_EC to be issued when SCI_EVT isn't set There is a platform refusing to respond QR_EC when SCI_EVT isn't set (Acer Aspire V5-573G). Currently, we rely on the behaviour that the EC firmware can respond something (for example, 0x00 to indicate "no outstanding events") to QR_EC even when SCI_EVT is not set, but the reporter has complained about AC/battery pluging/unpluging and video brightness change delay on that platform. This is because the work item that has issued QR_EC has to wait until timeout in this case, and the _Qxx method evaluation work item queued after QR_EC one is delayed. It sounds reasonable to fix this issue by: 1. Implementing SCI_EVT sanity check before issuing QR_EC in the EC driver's main state machine. 2. Moving QR_EC issuing out of the work queue used by _Qxx evaluation to a seperate IRQ handling thread. This patch fixes this issue using solution 1. By disallowing QR_EC to be issued when SCI_EVT isn't set, we are able to handle such platform in the EC driver's main state machine. This patch enhances the state machine in this way to survive with such malfunctioning EC firmware. Note that this patch can also fix CLEAR_ON_RESUME quirk which also relies on the assumption that the platforms are able to respond even when SCI_EVT isn't set. Fixes: c0d653412fc8 ACPI / EC: Fix race condition in ec_transaction_completed() Link: https://bugzilla.kernel.org/show_bug.cgi?id=82611 Reported-and-tested-by: Alexander Mezin <mezin.alexander@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: 3.16+ <stable@vger.kernel.org> # 3.16+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-08-21 14:41:13 +08:00
wakeup = true;
} else if ((status & ACPI_EC_FLAG_IBF) == 0) {
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
acpi_ec_write_cmd(ec, t->command);
ec_transaction_transition(ec, ACPI_EC_COMMAND_POLL);
} else
goto err;
goto out;
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
}
err:
/*
* If SCI bit is set, then don't think it's a false IRQ
* otherwise will take a not handled IRQ as a false one.
*/
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
if (!(status & ACPI_EC_FLAG_SCI)) {
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
if (in_interrupt() && t) {
if (t->irq_count < ec_storm_threshold)
++t->irq_count;
/* Allow triggering on 0 threshold */
if (t->irq_count == ec_storm_threshold)
acpi_ec_set_storm(ec, EC_FLAGS_COMMAND_STORM);
}
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
}
out:
ACPI / EC: Fix issues related to the SCI_EVT handling This patch fixes 2 issues related to the draining behavior. But it doesn't implement the draining support, it only cleans up code so that further draining support is possible. The draining behavior is expected by some platforms (for example, Samsung) where SCI_EVT is set only once for a set of events and might be cleared for the very first QR_EC command issued after SCI_EVT is set. EC firmware on such platforms will return 0x00 to indicate "no outstanding event". Thus after seeing an SCI_EVT indication, EC driver need to fetch events until 0x00 returned (see acpi_ec_clear()). Issue 1 - acpi_ec_submit_query(): It's reported on Samsung laptops that SCI_EVT isn't checked when the transactions are advanced in ec_poll(), which leads to SCI_EVT triggering source lost: If no EC GPE IRQs are arrived after that, EC driver cannot detect this event and handle it. See comment 244/247 for kernel bugzilla 44161. This patch fixes this issue by moving SCI_EVT checks into advance_transaction(). So that SCI_EVT is checked each time we are going to handle the EC firmware indications. And this check will happen for both IRQ context and task context. Since after doing that, SCI_EVT is also checked after completing a transaction, ec_check_sci() and ec_check_sci_sync() can be removed. Issue 2 - acpi_ec_complete_query(): We expect to clear EC_FLAGS_QUERY_PENDING to allow queuing another draining QR_EC after writing a QR_EC command and before reading the event. After reading the event, SCI_EVT might be cleared by the firmware, thus it may not be possible to queue such a draining QR_EC at that time. But putting the EC_FLAGS_QUERY_PENDING clearing code after start_transaction() is wrong as there are chances that after start_transaction(), QR_EC can fail to be sent. If this happens, EC_FLAG_QUERY_PENDING will be cleared earlier. As a consequence, the draining QR_EC will also be queued earlier than expected. This patch also moves this code into advance_transaction() where QR_EC is just sent (ACPI_EC_COMMAND_POLL flagged) to fix this issue. Notes: 1. After introducing the 2 SCI_EVT related handlings into advance_transaction(), a next QR_EC can be queued right after writing the current QR_EC command and before reading the event. But this still hasn't implemented the draining behavior as the draining support requires: If a previous returned event value isn't 0x00, a draining QR_EC need to be issued even when SCI_EVT isn't set. 2. In this patch, acpi_os_execute() is also converted into a seperate work item to avoid invoking kmalloc() in the atomic context. We can do this because of the previous global lock fix. 3. Originally, EC_FLAGS_EVENT_PENDING is also used to avoid queuing up multiple work items (created by acpi_os_execute()), this can be covered by only using a single work item. But this patch still keeps this flag as there are different usages in the driver initialization steps relying on this flag. Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-by: Kieran Clancy <clancy.kieran@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-01-14 19:28:47 +08:00
if (status & ACPI_EC_FLAG_SCI)
acpi_ec_submit_query(ec);
if (wakeup && in_interrupt())
wake_up(&ec->wait);
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
}
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
static void start_transaction(struct acpi_ec *ec)
{
ec->curr->irq_count = ec->curr->wi = ec->curr->ri = 0;
ec->curr->flags = 0;
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
}
static int ec_guard(struct acpi_ec *ec)
{
unsigned long guard = usecs_to_jiffies(ec->polling_guard);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
unsigned long timeout = ec->timestamp + guard;
/* Ensure guarding period before polling EC status */
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
do {
if (ec->busy_polling) {
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
/* Perform busy polling */
if (ec_transaction_completed(ec))
return 0;
udelay(jiffies_to_usecs(guard));
} else {
/*
* Perform wait polling
* 1. Wait the transaction to be completed by the
* GPE handler after the transaction enters
* ACPI_EC_COMMAND_POLL state.
* 2. A special guarding logic is also required
* for event clearing mode "event" before the
* transaction enters ACPI_EC_COMMAND_POLL
* state.
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
*/
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
if (!ec_transaction_polled(ec) &&
!acpi_ec_guard_event(ec))
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
break;
if (wait_event_timeout(ec->wait,
ec_transaction_completed(ec),
guard))
return 0;
}
} while (time_before(jiffies, timeout));
return -ETIME;
}
static int ec_poll(struct acpi_ec *ec)
{
unsigned long flags;
int repeat = 5; /* number of command restarts */
while (repeat--) {
unsigned long delay = jiffies +
msecs_to_jiffies(ec_delay);
do {
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
if (!ec_guard(ec))
return 0;
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
spin_lock_irqsave(&ec->lock, flags);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
advance_transaction(ec);
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
spin_unlock_irqrestore(&ec->lock, flags);
} while (time_before(jiffies, delay));
pr_debug("controller reset, restart transaction\n");
spin_lock_irqsave(&ec->lock, flags);
start_transaction(ec);
spin_unlock_irqrestore(&ec->lock, flags);
}
return -ETIME;
}
static int acpi_ec_transaction_unlocked(struct acpi_ec *ec,
struct transaction *t)
{
unsigned long tmp;
int ret = 0;
/* start transaction */
spin_lock_irqsave(&ec->lock, tmp);
/* Enable GPE for command processing (IBF=0/OBF=1) */
if (!acpi_ec_submit_flushable_request(ec)) {
ret = -EINVAL;
goto unlock;
}
ec_dbg_ref(ec, "Increase command");
/* following two actions should be kept atomic */
ec->curr = t;
ec_dbg_req("Command(%s) started", acpi_ec_cmd_string(t->command));
start_transaction(ec);
Revert "ACPI / EC: Add support to disallow QR_EC to be issued before completing previous QR_EC" It is reported that the following commit breaks Samsung hardware: Commit: 558e4736f2e1b0e6323adf7a5e4df77ed6cfc1a4. Subject: ACPI / EC: Add support to disallow QR_EC to be issued before completing previous QR_EC Which means the Samsung behavior conflicts with the Acer behavior. 1. Samsung may behave like: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC read event [ -event 1 ] SCI_EVT clear Without the above commit, Samsung can work: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC CAN prepare next QR_EC as SCI_EVT=1 read event [ -event 1 ] SCI_EVT clear write QR_EC read event [ -event 2 ] SCI_EVT clear With the above commit, Samsung cannot work: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC read event [ -event 1 ] SCI_EVT clear CANNOT prepare next QR_EC as SCI_EVT=0 2. Acer may behave like: [ +event 1 ] SCI_EVT set [ +event 2 ] write QR_EC read event [ -event 1 ] SCI_EVT clear [ +event 2 ] SCI_EVT set Without the above commit, Acer cannot work when there is only 1 event: [ +event 1 ] SCI_EVT set write QR_EC can prepared next QR_EC as SCI_EVT=1 read event [ -event 1 ] SCI_EVT clear CANNOT write QR_EC as SCI_EVT=0 With the above commit, Acer can work: [ +event 1 ] SCI_EVT set [ +event 2 ] write QR_EC read event [ -event 1 ] SCI_EVT set can prepare next QR_EC because SCI_EVT=0 CAN write QR_EC as SCI_EVT=1 Since Acer can also work with only the following commit applied: Commit: 3afcf2ece453e1a8c2c6de19cdf06da3772a1b08 Subject: ACPI / EC: Add support to disallow QR_EC to be issued when SCI_EVT isn't set commit 558e4736f2e1b0e6323adf7a5e4df77ed6cfc1a4 can be reverted. Fixes: 558e4736f2e1 (ACPI / EC: Add support to disallow QR_EC to be issued ...) Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-and-tested-by: Ortwin Glück <odi@odi.ch> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: 3.17+ <stable@vger.kernel.org> # 3.17+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-10-29 11:33:43 +08:00
spin_unlock_irqrestore(&ec->lock, tmp);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
Revert "ACPI / EC: Add support to disallow QR_EC to be issued before completing previous QR_EC" It is reported that the following commit breaks Samsung hardware: Commit: 558e4736f2e1b0e6323adf7a5e4df77ed6cfc1a4. Subject: ACPI / EC: Add support to disallow QR_EC to be issued before completing previous QR_EC Which means the Samsung behavior conflicts with the Acer behavior. 1. Samsung may behave like: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC read event [ -event 1 ] SCI_EVT clear Without the above commit, Samsung can work: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC CAN prepare next QR_EC as SCI_EVT=1 read event [ -event 1 ] SCI_EVT clear write QR_EC read event [ -event 2 ] SCI_EVT clear With the above commit, Samsung cannot work: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC read event [ -event 1 ] SCI_EVT clear CANNOT prepare next QR_EC as SCI_EVT=0 2. Acer may behave like: [ +event 1 ] SCI_EVT set [ +event 2 ] write QR_EC read event [ -event 1 ] SCI_EVT clear [ +event 2 ] SCI_EVT set Without the above commit, Acer cannot work when there is only 1 event: [ +event 1 ] SCI_EVT set write QR_EC can prepared next QR_EC as SCI_EVT=1 read event [ -event 1 ] SCI_EVT clear CANNOT write QR_EC as SCI_EVT=0 With the above commit, Acer can work: [ +event 1 ] SCI_EVT set [ +event 2 ] write QR_EC read event [ -event 1 ] SCI_EVT set can prepare next QR_EC because SCI_EVT=0 CAN write QR_EC as SCI_EVT=1 Since Acer can also work with only the following commit applied: Commit: 3afcf2ece453e1a8c2c6de19cdf06da3772a1b08 Subject: ACPI / EC: Add support to disallow QR_EC to be issued when SCI_EVT isn't set commit 558e4736f2e1b0e6323adf7a5e4df77ed6cfc1a4 can be reverted. Fixes: 558e4736f2e1 (ACPI / EC: Add support to disallow QR_EC to be issued ...) Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-and-tested-by: Ortwin Glück <odi@odi.ch> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: 3.17+ <stable@vger.kernel.org> # 3.17+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-10-29 11:33:43 +08:00
ret = ec_poll(ec);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
Revert "ACPI / EC: Add support to disallow QR_EC to be issued before completing previous QR_EC" It is reported that the following commit breaks Samsung hardware: Commit: 558e4736f2e1b0e6323adf7a5e4df77ed6cfc1a4. Subject: ACPI / EC: Add support to disallow QR_EC to be issued before completing previous QR_EC Which means the Samsung behavior conflicts with the Acer behavior. 1. Samsung may behave like: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC read event [ -event 1 ] SCI_EVT clear Without the above commit, Samsung can work: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC CAN prepare next QR_EC as SCI_EVT=1 read event [ -event 1 ] SCI_EVT clear write QR_EC read event [ -event 2 ] SCI_EVT clear With the above commit, Samsung cannot work: [ +event 1 ] SCI_EVT set [ +event 2 ] SCI_EVT set write QR_EC read event [ -event 1 ] SCI_EVT clear CANNOT prepare next QR_EC as SCI_EVT=0 2. Acer may behave like: [ +event 1 ] SCI_EVT set [ +event 2 ] write QR_EC read event [ -event 1 ] SCI_EVT clear [ +event 2 ] SCI_EVT set Without the above commit, Acer cannot work when there is only 1 event: [ +event 1 ] SCI_EVT set write QR_EC can prepared next QR_EC as SCI_EVT=1 read event [ -event 1 ] SCI_EVT clear CANNOT write QR_EC as SCI_EVT=0 With the above commit, Acer can work: [ +event 1 ] SCI_EVT set [ +event 2 ] write QR_EC read event [ -event 1 ] SCI_EVT set can prepare next QR_EC because SCI_EVT=0 CAN write QR_EC as SCI_EVT=1 Since Acer can also work with only the following commit applied: Commit: 3afcf2ece453e1a8c2c6de19cdf06da3772a1b08 Subject: ACPI / EC: Add support to disallow QR_EC to be issued when SCI_EVT isn't set commit 558e4736f2e1b0e6323adf7a5e4df77ed6cfc1a4 can be reverted. Fixes: 558e4736f2e1 (ACPI / EC: Add support to disallow QR_EC to be issued ...) Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-and-tested-by: Ortwin Glück <odi@odi.ch> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: 3.17+ <stable@vger.kernel.org> # 3.17+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-10-29 11:33:43 +08:00
spin_lock_irqsave(&ec->lock, tmp);
ACPI / EC: Refine command storm prevention support This patch refines EC command storm prevention support. Current command storming code is wrong, when the storming condition is detected, it only flags the condition without doing anything for the current command but performing storming prevention for the follow-up commands. So: 1. The first command which suffers from the storming still suffers from storming. 2. The follow-up commands which may not suffer from the storming are unconditionally forced into the storming prevention mode. Ideally, we should only enable storm prevention immediately after detection for the current command so that the next command can try the power/performance efficient interrupt mode again. This patch improves the command storm prevention by disabling GPE right after the detection and re-enabling it right before completing the command transaction using the GPE storming prevention APIs. This thus deploys the following GPE handling model: 1. acpi_enable_gpe()/acpi_disable_gpe() for reference count changes: This set of APIs are used for EC usage reference counting. 2. acpi_set_gpe(ACPI_GPE_ENABLE)/acpi_set_gpe(ACPI_GPE_DISABLE): This set of APIs are used for preventing GPE storm. They must be invoked when the reference count > 0. Note that as the storming prevention should always happen when there is an outstanding request, or GPE enabling value will be messed up by the races. This patch also adds BUG_ON() to enforces this rule to prevent future bugs. The msleep(1) used after completing a transaction is useless now as this sounds like a guard time only useful for platforms that need the EC_FLAGS_MSI quirks while we have fixed GPE race issues using the previous raw handler mode enabling. It is kept to avoid regressions. A seperate patch which deletes EC_FLAGS_MSI quirks should take care of deleting it. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-06 08:58:05 +08:00
if (t->irq_count == ec_storm_threshold)
acpi_ec_clear_storm(ec, EC_FLAGS_COMMAND_STORM);
ec_dbg_req("Command(%s) stopped", acpi_ec_cmd_string(t->command));
ec->curr = NULL;
/* Disable GPE for command processing (IBF=0/OBF=1) */
acpi_ec_complete_request(ec);
ec_dbg_ref(ec, "Decrease command");
unlock:
spin_unlock_irqrestore(&ec->lock, tmp);
return ret;
}
static int acpi_ec_transaction(struct acpi_ec *ec, struct transaction *t)
{
int status;
u32 glk;
if (!ec || (!t) || (t->wlen && !t->wdata) || (t->rlen && !t->rdata))
return -EINVAL;
if (t->rdata)
memset(t->rdata, 0, t->rlen);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
mutex_lock(&ec->mutex);
if (ec->global_lock) {
status = acpi_acquire_global_lock(ACPI_EC_UDELAY_GLK, &glk);
if (ACPI_FAILURE(status)) {
status = -ENODEV;
goto unlock;
}
}
status = acpi_ec_transaction_unlocked(ec, t);
if (ec->global_lock)
acpi_release_global_lock(glk);
unlock:
mutex_unlock(&ec->mutex);
return status;
}
static int acpi_ec_burst_enable(struct acpi_ec *ec)
{
u8 d;
struct transaction t = {.command = ACPI_EC_BURST_ENABLE,
.wdata = NULL, .rdata = &d,
.wlen = 0, .rlen = 1};
return acpi_ec_transaction(ec, &t);
}
static int acpi_ec_burst_disable(struct acpi_ec *ec)
{
struct transaction t = {.command = ACPI_EC_BURST_DISABLE,
.wdata = NULL, .rdata = NULL,
.wlen = 0, .rlen = 0};
return (acpi_ec_read_status(ec) & ACPI_EC_FLAG_BURST) ?
acpi_ec_transaction(ec, &t) : 0;
}
static int acpi_ec_read(struct acpi_ec *ec, u8 address, u8 *data)
{
int result;
u8 d;
struct transaction t = {.command = ACPI_EC_COMMAND_READ,
.wdata = &address, .rdata = &d,
.wlen = 1, .rlen = 1};
result = acpi_ec_transaction(ec, &t);
*data = d;
return result;
}
static int acpi_ec_write(struct acpi_ec *ec, u8 address, u8 data)
{
u8 wdata[2] = { address, data };
struct transaction t = {.command = ACPI_EC_COMMAND_WRITE,
.wdata = wdata, .rdata = NULL,
.wlen = 2, .rlen = 0};
return acpi_ec_transaction(ec, &t);
}
int ec_read(u8 addr, u8 *val)
{
int err;
u8 temp_data;
if (!first_ec)
return -ENODEV;
err = acpi_ec_read(first_ec, addr, &temp_data);
if (!err) {
*val = temp_data;
return 0;
}
return err;
}
EXPORT_SYMBOL(ec_read);
int ec_write(u8 addr, u8 val)
{
int err;
if (!first_ec)
return -ENODEV;
err = acpi_ec_write(first_ec, addr, val);
return err;
}
EXPORT_SYMBOL(ec_write);
int ec_transaction(u8 command,
const u8 *wdata, unsigned wdata_len,
u8 *rdata, unsigned rdata_len)
{
struct transaction t = {.command = command,
.wdata = wdata, .rdata = rdata,
.wlen = wdata_len, .rlen = rdata_len};
if (!first_ec)
return -ENODEV;
return acpi_ec_transaction(first_ec, &t);
}
EXPORT_SYMBOL(ec_transaction);
/* Get the handle to the EC device */
acpi_handle ec_get_handle(void)
{
if (!first_ec)
return NULL;
return first_ec->handle;
}
EXPORT_SYMBOL(ec_get_handle);
static void acpi_ec_start(struct acpi_ec *ec, bool resuming)
{
unsigned long flags;
spin_lock_irqsave(&ec->lock, flags);
if (!test_and_set_bit(EC_FLAGS_STARTED, &ec->flags)) {
ec_dbg_drv("Starting EC");
/* Enable GPE for event processing (SCI_EVT=1) */
if (!resuming) {
acpi_ec_submit_request(ec);
ec_dbg_ref(ec, "Increase driver");
ACPI / EC: Add PM operations to improve event handling for resume process This patch makes 2 changes: 1. Restore old behavior Originally, EC driver stops handling both events and transactions in acpi_ec_block_transactions(), and restarts to handle transactions in acpi_ec_unblock_transactions_early(), restarts to handle both events and transactions in acpi_ec_unblock_transactions(). While currently, EC driver still stops handling both events and transactions in acpi_ec_block_transactions(), but restarts to handle both events and transactions in acpi_ec_unblock_transactions_early(). This patch tries to restore the old behavior by dropping __acpi_ec_enable_event() from acpi_unblock_transactions_early(). 2. Improve old behavior However this still cannot fix the real issue as both of the acpi_ec_unblock_xxx() functions are invoked in the noirq stage. Since the EC driver actually doesn't implement the event handling in the polling mode, re-enabling the event handling too early in the noirq stage could result in the problem that if there is no triggering source causing advance_transaction() to be invoked, pending SCI_EVT cannot be detected by the EC driver and _Qxx cannot be triggered. It actually makes sense to restart the event handling in any point during resuming after the noirq stage. Just like the boot stage where the event handling is enabled in .add(), this patch further moves acpi_ec_enable_event() to .resume(). After doing that, the following 2 functions can be combined: acpi_ec_unblock_transactions_early()/acpi_ec_unblock_transactions(). The differences of the event handling availability between the old behavior (this patch isn't applied) and the new behavior (this patch is applied) are as follows: !Applied Applied before suspend Y Y suspend before EC Y Y suspend after EC Y Y suspend_late Y Y suspend_noirq Y (actually N) Y (actually N) resume_noirq Y (actually N) Y (actually N) resume_late Y (actually N) Y (actually N) resume before EC Y (actually N) Y (actually N) resume after EC Y (actually N) Y after resume Y (actually N) Y Where "actually N" means if there is no triggering source, the EC driver is actually not able to notice the pending SCI_EVT occurred in the noirq stage. So we can clearly see that this patch has improved the situation. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:36 +08:00
}
ec_log_drv("EC started");
}
spin_unlock_irqrestore(&ec->lock, flags);
}
static bool acpi_ec_stopped(struct acpi_ec *ec)
{
unsigned long flags;
bool flushed;
spin_lock_irqsave(&ec->lock, flags);
flushed = acpi_ec_flushed(ec);
spin_unlock_irqrestore(&ec->lock, flags);
return flushed;
}
static void acpi_ec_stop(struct acpi_ec *ec, bool suspending)
{
unsigned long flags;
spin_lock_irqsave(&ec->lock, flags);
if (acpi_ec_started(ec)) {
ec_dbg_drv("Stopping EC");
set_bit(EC_FLAGS_STOPPED, &ec->flags);
spin_unlock_irqrestore(&ec->lock, flags);
wait_event(ec->wait, acpi_ec_stopped(ec));
spin_lock_irqsave(&ec->lock, flags);
/* Disable GPE for event processing (SCI_EVT=1) */
if (!suspending) {
acpi_ec_complete_request(ec);
ec_dbg_ref(ec, "Decrease driver");
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
} else if (!ec_freeze_events)
__acpi_ec_disable_event(ec);
clear_bit(EC_FLAGS_STARTED, &ec->flags);
clear_bit(EC_FLAGS_STOPPED, &ec->flags);
ec_log_drv("EC stopped");
}
spin_unlock_irqrestore(&ec->lock, flags);
}
static void acpi_ec_enter_noirq(struct acpi_ec *ec)
{
unsigned long flags;
spin_lock_irqsave(&ec->lock, flags);
ec->busy_polling = true;
ec->polling_guard = 0;
ec_log_drv("interrupt blocked");
spin_unlock_irqrestore(&ec->lock, flags);
}
static void acpi_ec_leave_noirq(struct acpi_ec *ec)
{
unsigned long flags;
spin_lock_irqsave(&ec->lock, flags);
ec->busy_polling = ec_busy_polling;
ec->polling_guard = ec_polling_guard;
ec_log_drv("interrupt unblocked");
spin_unlock_irqrestore(&ec->lock, flags);
}
void acpi_ec_block_transactions(void)
{
struct acpi_ec *ec = first_ec;
if (!ec)
return;
mutex_lock(&ec->mutex);
/* Prevent transactions from being carried out */
acpi_ec_stop(ec, true);
mutex_unlock(&ec->mutex);
}
void acpi_ec_unblock_transactions(void)
{
/*
* Allow transactions to happen again (this function is called from
* atomic context during wakeup, so we don't need to acquire the mutex).
*/
if (first_ec)
acpi_ec_start(first_ec, true);
}
/* --------------------------------------------------------------------------
Event Management
-------------------------------------------------------------------------- */
static struct acpi_ec_query_handler *
acpi_ec_get_query_handler(struct acpi_ec_query_handler *handler)
{
if (handler)
kref_get(&handler->kref);
return handler;
}
static struct acpi_ec_query_handler *
acpi_ec_get_query_handler_by_value(struct acpi_ec *ec, u8 value)
{
struct acpi_ec_query_handler *handler;
bool found = false;
mutex_lock(&ec->mutex);
list_for_each_entry(handler, &ec->list, node) {
if (value == handler->query_bit) {
found = true;
break;
}
}
mutex_unlock(&ec->mutex);
return found ? acpi_ec_get_query_handler(handler) : NULL;
}
static void acpi_ec_query_handler_release(struct kref *kref)
{
struct acpi_ec_query_handler *handler =
container_of(kref, struct acpi_ec_query_handler, kref);
kfree(handler);
}
static void acpi_ec_put_query_handler(struct acpi_ec_query_handler *handler)
{
kref_put(&handler->kref, acpi_ec_query_handler_release);
}
int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit,
acpi_handle handle, acpi_ec_query_func func,
void *data)
{
struct acpi_ec_query_handler *handler =
kzalloc(sizeof(struct acpi_ec_query_handler), GFP_KERNEL);
if (!handler)
return -ENOMEM;
handler->query_bit = query_bit;
handler->handle = handle;
handler->func = func;
handler->data = data;
mutex_lock(&ec->mutex);
kref_init(&handler->kref);
list_add(&handler->node, &ec->list);
mutex_unlock(&ec->mutex);
return 0;
}
EXPORT_SYMBOL_GPL(acpi_ec_add_query_handler);
static void acpi_ec_remove_query_handlers(struct acpi_ec *ec,
bool remove_all, u8 query_bit)
{
struct acpi_ec_query_handler *handler, *tmp;
LIST_HEAD(free_list);
mutex_lock(&ec->mutex);
list_for_each_entry_safe(handler, tmp, &ec->list, node) {
if (remove_all || query_bit == handler->query_bit) {
list_del_init(&handler->node);
list_add(&handler->node, &free_list);
}
}
mutex_unlock(&ec->mutex);
list_for_each_entry_safe(handler, tmp, &free_list, node)
acpi_ec_put_query_handler(handler);
}
void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit)
{
acpi_ec_remove_query_handlers(ec, false, query_bit);
}
EXPORT_SYMBOL_GPL(acpi_ec_remove_query_handler);
static struct acpi_ec_query *acpi_ec_create_query(u8 *pval)
{
struct acpi_ec_query *q;
struct transaction *t;
q = kzalloc(sizeof (struct acpi_ec_query), GFP_KERNEL);
if (!q)
return NULL;
INIT_WORK(&q->work, acpi_ec_event_processor);
t = &q->transaction;
t->command = ACPI_EC_COMMAND_QUERY;
t->rdata = pval;
t->rlen = 1;
return q;
}
static void acpi_ec_delete_query(struct acpi_ec_query *q)
{
if (q) {
if (q->handler)
acpi_ec_put_query_handler(q->handler);
kfree(q);
}
}
static void acpi_ec_event_processor(struct work_struct *work)
{
struct acpi_ec_query *q = container_of(work, struct acpi_ec_query, work);
struct acpi_ec_query_handler *handler = q->handler;
ec_dbg_evt("Query(0x%02x) started", handler->query_bit);
if (handler->func)
handler->func(handler->data);
else if (handler->handle)
acpi_evaluate_object(handler->handle, NULL, NULL, NULL);
ec_dbg_evt("Query(0x%02x) stopped", handler->query_bit);
acpi_ec_delete_query(q);
}
static int acpi_ec_query(struct acpi_ec *ec, u8 *data)
{
u8 value = 0;
int result;
struct acpi_ec_query *q;
q = acpi_ec_create_query(&value);
if (!q)
return -ENOMEM;
ACPI / EC: Process rather than discard events in acpi_ec_clear Address a regression caused by commit ad332c8a4533: (ACPI / EC: Clear stale EC events on Samsung systems) After the earlier patch, there was found to be a race condition on some earlier Samsung systems (N150/N210/N220). The function acpi_ec_clear was sometimes discarding a new EC event before its GPE was triggered by the system. In the case of these systems, this meant that the "lid open" event was not registered on resume if that was the cause of the wake, leading to problems when attempting to close the lid to suspend again. After testing on a number of Samsung systems, both those affected by the previous EC bug and those affected by the race condition, it seemed that the best course of action was to process rather than discard the events. On Samsung systems which accumulate stale EC events, there does not seem to be any adverse side-effects of running the associated _Q methods. This patch adds an argument to the static function acpi_ec_sync_query so that it may be used within the acpi_ec_clear loop in place of acpi_ec_query_unlocked which was used previously. With thanks to Stefan Biereigel for reporting the issue, and for all the people who helped test the new patch on affected systems. Fixes: ad332c8a4533 (ACPI / EC: Clear stale EC events on Samsung systems) References: https://lkml.kernel.org/r/532FE3B2.9060808@biereigel-wb.de References: https://bugzilla.kernel.org/show_bug.cgi?id=44161#c173 Reported-by: Stefan Biereigel <stefan@biereigel.de> Signed-off-by: Kieran Clancy <clancy.kieran@gmail.com> Tested-by: Stefan Biereigel <stefan@biereigel.de> Tested-by: Dennis Jansen <dennis.jansen@web.de> Tested-by: Nicolas Porcel <nicolasporcel06@gmail.com> Tested-by: Maurizio D'Addona <mauritiusdadd@gmail.com> Tested-by: Juan Manuel Cabo <juanmanuel.cabo@gmail.com> Tested-by: Giannis Koutsou <giannis.koutsou@gmail.com> Tested-by: Kieran Clancy <clancy.kieran@gmail.com> Cc: 3.14+ <stable@vger.kernel.org> # 3.14+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-04-29 22:51:20 +08:00
/*
* Query the EC to find out which _Qxx method we need to evaluate.
* Note that successful completion of the query causes the ACPI_EC_SCI
* bit to be cleared (and thus clearing the interrupt source).
*/
result = acpi_ec_transaction(ec, &q->transaction);
if (!value)
result = -ENODATA;
if (result)
goto err_exit;
ACPI / EC: Process rather than discard events in acpi_ec_clear Address a regression caused by commit ad332c8a4533: (ACPI / EC: Clear stale EC events on Samsung systems) After the earlier patch, there was found to be a race condition on some earlier Samsung systems (N150/N210/N220). The function acpi_ec_clear was sometimes discarding a new EC event before its GPE was triggered by the system. In the case of these systems, this meant that the "lid open" event was not registered on resume if that was the cause of the wake, leading to problems when attempting to close the lid to suspend again. After testing on a number of Samsung systems, both those affected by the previous EC bug and those affected by the race condition, it seemed that the best course of action was to process rather than discard the events. On Samsung systems which accumulate stale EC events, there does not seem to be any adverse side-effects of running the associated _Q methods. This patch adds an argument to the static function acpi_ec_sync_query so that it may be used within the acpi_ec_clear loop in place of acpi_ec_query_unlocked which was used previously. With thanks to Stefan Biereigel for reporting the issue, and for all the people who helped test the new patch on affected systems. Fixes: ad332c8a4533 (ACPI / EC: Clear stale EC events on Samsung systems) References: https://lkml.kernel.org/r/532FE3B2.9060808@biereigel-wb.de References: https://bugzilla.kernel.org/show_bug.cgi?id=44161#c173 Reported-by: Stefan Biereigel <stefan@biereigel.de> Signed-off-by: Kieran Clancy <clancy.kieran@gmail.com> Tested-by: Stefan Biereigel <stefan@biereigel.de> Tested-by: Dennis Jansen <dennis.jansen@web.de> Tested-by: Nicolas Porcel <nicolasporcel06@gmail.com> Tested-by: Maurizio D'Addona <mauritiusdadd@gmail.com> Tested-by: Juan Manuel Cabo <juanmanuel.cabo@gmail.com> Tested-by: Giannis Koutsou <giannis.koutsou@gmail.com> Tested-by: Kieran Clancy <clancy.kieran@gmail.com> Cc: 3.14+ <stable@vger.kernel.org> # 3.14+ Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-04-29 22:51:20 +08:00
q->handler = acpi_ec_get_query_handler_by_value(ec, value);
if (!q->handler) {
result = -ENODATA;
goto err_exit;
}
/*
* It is reported that _Qxx are evaluated in a parallel way on
* Windows:
* https://bugzilla.kernel.org/show_bug.cgi?id=94411
*
* Put this log entry before schedule_work() in order to make
* it appearing before any other log entries occurred during the
* work queue execution.
*/
ec_dbg_evt("Query(0x%02x) scheduled", value);
if (!queue_work(ec_query_wq, &q->work)) {
ec_dbg_evt("Query(0x%02x) overlapped", value);
result = -EBUSY;
}
err_exit:
if (result)
acpi_ec_delete_query(q);
if (data)
*data = value;
return result;
}
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
static void acpi_ec_check_event(struct acpi_ec *ec)
{
unsigned long flags;
if (ec_event_clearing == ACPI_EC_EVT_TIMING_EVENT) {
if (ec_guard(ec)) {
spin_lock_irqsave(&ec->lock, flags);
/*
* Take care of the SCI_EVT unless no one else is
* taking care of it.
*/
if (!ec->curr)
advance_transaction(ec);
spin_unlock_irqrestore(&ec->lock, flags);
}
}
}
static void acpi_ec_event_handler(struct work_struct *work)
{
unsigned long flags;
ACPI / EC: Fix issues related to the SCI_EVT handling This patch fixes 2 issues related to the draining behavior. But it doesn't implement the draining support, it only cleans up code so that further draining support is possible. The draining behavior is expected by some platforms (for example, Samsung) where SCI_EVT is set only once for a set of events and might be cleared for the very first QR_EC command issued after SCI_EVT is set. EC firmware on such platforms will return 0x00 to indicate "no outstanding event". Thus after seeing an SCI_EVT indication, EC driver need to fetch events until 0x00 returned (see acpi_ec_clear()). Issue 1 - acpi_ec_submit_query(): It's reported on Samsung laptops that SCI_EVT isn't checked when the transactions are advanced in ec_poll(), which leads to SCI_EVT triggering source lost: If no EC GPE IRQs are arrived after that, EC driver cannot detect this event and handle it. See comment 244/247 for kernel bugzilla 44161. This patch fixes this issue by moving SCI_EVT checks into advance_transaction(). So that SCI_EVT is checked each time we are going to handle the EC firmware indications. And this check will happen for both IRQ context and task context. Since after doing that, SCI_EVT is also checked after completing a transaction, ec_check_sci() and ec_check_sci_sync() can be removed. Issue 2 - acpi_ec_complete_query(): We expect to clear EC_FLAGS_QUERY_PENDING to allow queuing another draining QR_EC after writing a QR_EC command and before reading the event. After reading the event, SCI_EVT might be cleared by the firmware, thus it may not be possible to queue such a draining QR_EC at that time. But putting the EC_FLAGS_QUERY_PENDING clearing code after start_transaction() is wrong as there are chances that after start_transaction(), QR_EC can fail to be sent. If this happens, EC_FLAG_QUERY_PENDING will be cleared earlier. As a consequence, the draining QR_EC will also be queued earlier than expected. This patch also moves this code into advance_transaction() where QR_EC is just sent (ACPI_EC_COMMAND_POLL flagged) to fix this issue. Notes: 1. After introducing the 2 SCI_EVT related handlings into advance_transaction(), a next QR_EC can be queued right after writing the current QR_EC command and before reading the event. But this still hasn't implemented the draining behavior as the draining support requires: If a previous returned event value isn't 0x00, a draining QR_EC need to be issued even when SCI_EVT isn't set. 2. In this patch, acpi_os_execute() is also converted into a seperate work item to avoid invoking kmalloc() in the atomic context. We can do this because of the previous global lock fix. 3. Originally, EC_FLAGS_EVENT_PENDING is also used to avoid queuing up multiple work items (created by acpi_os_execute()), this can be covered by only using a single work item. But this patch still keeps this flag as there are different usages in the driver initialization steps relying on this flag. Link: https://bugzilla.kernel.org/show_bug.cgi?id=44161 Reported-by: Kieran Clancy <clancy.kieran@gmail.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-01-14 19:28:47 +08:00
struct acpi_ec *ec = container_of(work, struct acpi_ec, work);
ec_dbg_evt("Event started");
spin_lock_irqsave(&ec->lock, flags);
while (ec->nr_pending_queries) {
spin_unlock_irqrestore(&ec->lock, flags);
(void)acpi_ec_query(ec, NULL);
spin_lock_irqsave(&ec->lock, flags);
ec->nr_pending_queries--;
/*
* Before exit, make sure that this work item can be
* scheduled again. There might be QR_EC failures, leaving
* EC_FLAGS_QUERY_PENDING uncleared and preventing this work
* item from being scheduled again.
*/
if (!ec->nr_pending_queries) {
if (ec_event_clearing == ACPI_EC_EVT_TIMING_STATUS ||
ec_event_clearing == ACPI_EC_EVT_TIMING_QUERY)
acpi_ec_complete_query(ec);
}
}
spin_unlock_irqrestore(&ec->lock, flags);
ec_dbg_evt("Event stopped");
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
acpi_ec_check_event(ec);
}
static u32 acpi_ec_gpe_handler(acpi_handle gpe_device,
u32 gpe_number, void *data)
{
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
unsigned long flags;
struct acpi_ec *ec = data;
ACPI / EC: Add asynchronous command byte write support Move the first command byte write into advance_transaction() so that all EC register accesses that can affect the command processing state machine can happen in this asynchronous state machine advancement function. The advance_transaction() function then can be a complete implementation of an asyncrhonous transaction for a single command so that: 1. The first command byte can be written in the interrupt context; 2. The command completion waiter can also be used to wait the first command byte's timeout; 3. In BURST mode, the follow-up command bytes can be written in the interrupt context directly, so that it doesn't need to return to the task context. Returning to the task context reduces the throughput of the BURST mode and in the worst cases where the system workload is very high, this leads to the hardware driven automatic BURST mode exit. In order not to increase memory consumption, convert 'done' into 'flags' to contain multiple indications: 1. ACPI_EC_COMMAND_COMPLETE: converting from original 'done' condition, indicating the completion of the command transaction. 2. ACPI_EC_COMMAND_POLL: indicating the availability of writing the first command byte. A new command can utilize this flag to compete for the right of accessing the underlying hardware. There is a follow-up bug fix that has utilized this new flag. The 2 flags are important because it also reflects a key concept of IO programs' design used in the system softwares. Normally an IO program running in the kernel should first be implemented in the asynchronous way. And the 2 flags are the most common way to implement its synchronous operations on top of the asynchronous operations: 1. POLL: This flag can be used to block until the asynchronous operations can happen. 2. COMPLETE: This flag can be used to block until the asynchronous operations have completed. By constructing code cleanly in this way, many difficult problems can be solved smoothly. Link: https://bugzilla.kernel.org/show_bug.cgi?id=70891 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63931 Link: https://bugzilla.kernel.org/show_bug.cgi?id=59911 Reported-and-tested-by: Gareth Williams <gareth@garethwilliams.me.uk> Reported-and-tested-by: Hans de Goede <jwrdegoede@fedoraproject.org> Reported-by: Barton Xu <tank.xuhan@gmail.com> Tested-by: Steffen Weber <steffen.weber@gmail.com> Tested-by: Arthur Chen <axchen@nvidia.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Cc: All applicable <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-15 08:41:35 +08:00
spin_lock_irqsave(&ec->lock, flags);
advance_transaction(ec);
spin_unlock_irqrestore(&ec->lock, flags);
ACPI / EC: Fix several GPE handling issues by deploying ACPI_GPE_DISPATCH_RAW_HANDLER mode. This patch switches EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode where the GPE lock is not held for acpi_ec_gpe_handler() and the ACPICA internal GPE enabling/disabling/clearing operations are bypassed so that further improvements are possible with the GPE APIs. There are 2 strong reasons for deploying raw GPE handler mode in the EC driver: 1. Some hardware logics can control their interrupts via their own registers, so their interrupts can be disabled/enabled/acknowledged without using the super IRQ controller provided functions. While there is no mean (EC commands) for the EC driver to achieve this. 2. During suspending, the EC driver is still working for a while to complete the platform firmware provided functionailities using ec_poll() after all GPEs are disabled (see acpi_ec_block_transactions()), which means the EC driver will drive the EC GPE out of the GPE core's control. Without deploying the raw GPE handler mode, we can see many races between the EC driver and the GPE core due to the above restrictions: 1. There is a race condition due to ACPICA internal GPE disabling/clearing/enabling logics in acpi_ev_gpe_dispatch(): Orignally EC GPE is disabled (EN=0), cleared (STS=0) before invoking a GPE handler and re-enabled (EN=1) after invoking a GPE handler in acpi_ev_gpe_dispatch(). When re-enabling appears, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() ec_poll() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** EN=1 This race condition is the root cause of different issues on different silicon variations. A. Silicon variation A: On some platforms, GPE will be triggered due to "writing 1 to EN when STS=1". This is because both EN and STS lines are wired to the GPE trigger line. 1. Issue 1: We can see no-op acpi_ec_gpe_handler() invoked on such platforms. This is because: a. event pending B: An event can arrive after ACPICA's GPE clearing performed in acpi_ev_gpe_dispatch(), this event may fail to be detected by EC_SC read that is performed before its arrival; b. event handling B: The event can be handled in ec_poll() because EC lock is released after acpi_ec_gpe_handler() invocation; c. There is no code in ec_poll() to clear STS but the GPE can still be triggered by the EN=1 write performed in acpi_ev_finish_gpe(), this leads to a no-op EC GPE handler invocation; d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 1: If we removed GPE disabling/enabling code from acpi_ev_gpe_dispatch(), we could still see no-op GPE handlers triggered by the event arriving after the GPE clearing and before the GPE handling on both silicon variation A and B. This can only occur if the CPU is very slow (timing slice between STS=0 write and EC_SC read should be short enough before hardware sets another GPE indication). Thus this is very rare and is not what we need to fix. B. Silicon variation B: On other platforms, GPE may not be triggered due to "writing 1 to EN when STS=1". This is because only STS line is wired to the GPE trigger line. 2. Issue 2: We can see GPE loss on such platforms. This is because: a. event pending B vs. event handling A: An event can arrive after ACPICA's GPE handling performed in acpi_ev_gpe_dispatch(), or event pending C vs. event handling B: An event can arrive after Linux's GPE handling performed in ec_poll(), these events may fail to be detected by EC_SC read that is performed before their arrival; b. The GPE cannot be triggered by EN=1 write performed in acpi_ev_finish_gpe(); c. If no polling mechanism is implemented in the driver for the pending event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 2: On most platforms, there might be another rule that GPE may not be triggered due to "writing 1 to STS when STS=1 and EN=1". Then on silicon variation B, an even worse case is if the issue 2 event loss happens, further events may never trigger GPE again on such platforms due to being blocked by the current STS=1. Unless someone clears STS, all events have to be polled. 2. There is a race condition due to lacking in GPE status checking in EC driver: Originally, GPE status is checked in ACPICA core but not checked in the GPE handler. Thus since the status checking and handling is not locked, it can be interrupted by another handling path. ================================================================= (event pending A) ================================================================= acpi_ev_gpe_detect() ec_poll() if (EN==1 && STS==1) ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** acpi_ev_gpe_dispatch() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling B) Lock(EC) advance_transaction() EC_SC read Unlock(EC) ***************************************************************** 3. Issue 3: We can see no-op acpi_ec_gpe_handler() invoked on both silicon variation A and B. This is because: a. event pending A: An event can arrive to trigger an EC GPE and ACPICA checks it and is about to invoke the EC GPE handler; b. event handling A: The event can be handled in ec_poll() because EC lock is not held after the GPE status checking; c. event handling B: Then when the EC GPE handler is invoked, it becomes a no-op GPE handler invocation. d. As no-op GPE handler invocations are counted by the EC driver to trigger the command storming conditions, the wrong no-op GPE handler invocations thus can easily trigger wrong command storming conditions. Note 3: This no-op GPE handler invocation is rare because the time between the IRQ arrival and the acpi_ec_gpe_handler() invocation is less than the timeout value waited in ec_poll(). So most of the no-op GPE handler invocations are caused by the reason described in issue 1. 3. There is a race condition due to ACPICA internal GPE clearing logic in acpi_enable_gpe(): During runtime, acpi_enable_gpe() can be invoked by the EC storming prevention code. When it is invoked, GPE may be flagged (STS=1). ================================================================= (event pending A) ================================================================= acpi_ev_gpe_dispatch() acpi_ec_transaction() EN=0 STS=0 acpi_ec_gpe_handler() ***************************************************************** (event handling A) Lock(EC) advance_transaction() EC_SC read EC_SC handled Unlock(EC) ***************************************************************** EN=1 ? Lock(EC) Unlock(EC) ================================================================= (event pending B) ================================================================= acpi_enable_gpe() STS=0 EN=1 4. Issue 4: We can see GPE loss on both silicon variation A and B platforms. This is because: a. event pending B: An event can arrive right before ACPICA's GPE clearing performed in acpi_enable_gpe(); b. If the GPE is cleared when GPE is disabled, then EN=1 write in acpi_enable_gpe() cannot trigger this GPE; c. If no polling mechanism is implemented in the driver for this event (for example, SCI_EVT), this event is lost due to no GPE being triggered. Note 4: Currently we don't have this issue, but after we switch the EC driver into ACPI_GPE_DISPATCH_RAW_HANDLER mode, we need to take care of handling this because the EN=1 write in acpi_ev_gpe_dispatch() will be abandoned. There might be more race issues for the current GPE handler usages. This is because the EC IRQ's enabling/disabling/checking/clearing/handling operations should be locked by a single lock that is under the EC driver's control to achieve the serialization. Which means we need to invoke GPE APIs with EC driver's lock held and all ACPICA internal GPE operations related to the GPE handler should be abandoned. Invoking GPE APIs inside of the EC driver lock and bypassing ACPICA internal GPE operations requires the ACPI_GPE_DISPATCH_RAW_HANDLER mode where the same lock used by the APIs are released prior than invoking the handlers. Otherwise, we can see dead locks due to circular locking dependencies (see Reference below). This patch then switches the EC driver into the ACPI_GPE_DISPATCH_RAW_HANDLER mode so that it can perform correct GPE operations using the GPE APIs: 1. Bypasses EN modifications performed in acpi_ev_gpe_dispatch() by using acpi_install_gpe_raw_handler() and invoking all GPE APIs with EC spin lock held. This can fix issue 1 as it makes a non frequent GPE enabling/disabling environment. 2. Bypasses STS clearing performed in acpi_enable_gpe() by replacing acpi_enable_gpe()/acpi_disable_gpe() with acpi_set_gpe(). This can fix issue 4. And this can also help to fix issue 1 as it makes a no sudden GPE clearing environment when GPE is frequently enabled/disabled. 3. Ensures STS acknowledged before handling by invoking acpi_clear_gpe() in advance_transaction(). This can finally fix issue 1 even in a frequent GPE enabling/disabling environment. And this can also finally fix issue 3 when issue 2 is fixed. Note 3: GPE clearing is edge triggered W1C, which means we can clear it right before handling it. Since all EC GPE indications are handled in advance_transaction() by previous commits, we can now move GPE clearing into it to implement the correct GPE clearing. Note 4: We can use acpi_set_gpe() which is not shared GPE safer instead of acpi_enable_gpe()/acpi_disable_gpe() because EC GPE is not shared by other hardware, which is mentioned in the ACPI specification 5.0, 12.6 Interrupt Model: "OSPM driver treats this as an edge event (the EC SCI cannot be shared)". So we can stop using shared GPE safer APIs acpi_enable_gpe()/acpi_disable_gpe() in the EC driver. Otherwise cleanups need to be made in acpi_ev_enable_gpe() to bypass the GPE clearing logic before keeping acpi_enable_gpe(). This patch also invokes advance_transaction() when GPE is re-enabled in the task context which: 1. Ensures EN=1 can trigger GPE by checking and handling EC status register right after EN=1 writes. This can fix issue 2. After applying this patch, without frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() ec_poll() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 1 (event pending B) can arrive as a next GPE due to the previous IRQ context STS=0 write. And if it is handled by ec_poll() (event handling B), as it is also acknowledged by ec_poll(), the event pending for issue 2 (event pending C) can properly arrive as a next GPE after the task context STS=0 write. So no GPE will be lost and never triggered due to GPE clearing performed in the wrong position. And since all GPE handling is performed after a locked GPE status checking, we can hardly see no-op GPE handler invocations due to issue 1 and 3. We may still see no-op GPE handler invocations due to "Note 1", but as it is inevitable, it needn't be fixed. After applying this patch, with frequent GPE enablings considered: ================================================================= (event pending A) ================================================================= acpi_ec_gpe_handler() acpi_ec_transaction() ***************************************************************** (event handling A) Lock(EC) advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending B) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** ***************************************************************** (event handling B) Lock(EC) EN=1 if STS==1 advance_transaction() if STS==1 STS=0 EC_SC read ================================================================= (event pending C) ================================================================= EC_SC handled Unlock(EC) ***************************************************************** The event pending for issue 2 can be manually handled by advance_transaction(). And after the STS=0 write performed in the manual triggered advance_transaction(), GPE can always arrive. So no GPE will be lost due to frequent GPE disabling/enabling performed in the driver like issue 4. Note 5: It's ideally when EN=1 write occurred, an IRQ thread should be woken up to handle the GPE when the GPE was raised. But this requires the IRQ thread to contain the poller code for all EC GPE indications, while currently some of the indications are handled in the user tasks. It then is very hard for the code to determine whether a user task should be invoked or the poller work item should be scheduled. So we have to invoke advance_transaction() directly now and it leaves us such a restriction for the GPE re-enabling: it must be performed in the task context to avoid starving the GPEs. As a conclusion: we can see the EC GPE is always handled in serial after deploying the raw GPE handler mode: Lock(EC) if (STS==1) STS=0 EC_SC read EC_SC handled Unlock(EC) The EC driver specific lock is responsible to make the EC GPE handling processes serialized so that EC can handle its GPE from both IRQ and task contexts and the next IRQ can be ensured to arrive after this process. Note 6: We have many EC_FLAGS_MSI qurik users in the current driver. They all seem to be suffering from unexpected GPE triggering source lost. And they are false root caused to a timing issue. Since EC communication protocol has already flow control defined, timing shouldn't be the root cause, while this fix might be fixing the root cause of the old bugs. Link: https://lkml.org/lkml/2014/11/4/974 Link: https://lkml.org/lkml/2014/11/18/316 Link: https://www.spinics.net/lists/linux-acpi/msg54340.html Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-02-05 16:27:22 +08:00
return ACPI_INTERRUPT_HANDLED;
}
/* --------------------------------------------------------------------------
* Address Space Management
* -------------------------------------------------------------------------- */
static acpi_status
acpi_ec_space_handler(u32 function, acpi_physical_address address,
u32 bits, u64 *value64,
void *handler_context, void *region_context)
{
struct acpi_ec *ec = handler_context;
int result = 0, i, bytes = bits / 8;
u8 *value = (u8 *)value64;
if ((address > 0xFF) || !value || !handler_context)
return AE_BAD_PARAMETER;
if (function != ACPI_READ && function != ACPI_WRITE)
return AE_BAD_PARAMETER;
if (ec->busy_polling || bits > 8)
acpi_ec_burst_enable(ec);
for (i = 0; i < bytes; ++i, ++address, ++value)
result = (function == ACPI_READ) ?
acpi_ec_read(ec, address, value) :
acpi_ec_write(ec, address, *value);
if (ec->busy_polling || bits > 8)
acpi_ec_burst_disable(ec);
switch (result) {
case -EINVAL:
return AE_BAD_PARAMETER;
case -ENODEV:
return AE_NOT_FOUND;
case -ETIME:
return AE_TIME;
default:
return AE_OK;
}
}
/* --------------------------------------------------------------------------
* Driver Interface
* -------------------------------------------------------------------------- */
static acpi_status
ec_parse_io_ports(struct acpi_resource *resource, void *context);
static void acpi_ec_free(struct acpi_ec *ec)
{
if (first_ec == ec)
first_ec = NULL;
if (boot_ec == ec)
boot_ec = NULL;
kfree(ec);
}
static struct acpi_ec *acpi_ec_alloc(void)
{
struct acpi_ec *ec = kzalloc(sizeof(struct acpi_ec), GFP_KERNEL);
if (!ec)
return NULL;
mutex_init(&ec->mutex);
init_waitqueue_head(&ec->wait);
INIT_LIST_HEAD(&ec->list);
spin_lock_init(&ec->lock);
INIT_WORK(&ec->work, acpi_ec_event_handler);
ACPI / EC: Fix and clean up register access guarding logics. In the polling mode, EC driver shouldn't access the EC registers too frequently. Though this statement is concluded from the non-root caused bugs (see links below), we've maintained the register access guarding logics in the current EC driver. The guarding logics can be found here and there, makes it hard to root cause real timing issues. This patch collects the guarding logics into one single function so that all hidden logics related to this can be seen clearly. The current guarding related code also has several issues: 1. Per-transaction timestamp prevents inter-transaction guarding from being implemented in the same place. We have an inter-transaction udelay() in acpi_ec_transaction_unblocked(), this logic can be merged into ec_poll() if we can use per-device timestamp. This patch completes such merge to form a new ec_guard() function and collects all guarding related hidden logics in it. One hidden logic is: there is no inter-transaction guarding performed for non MSI quirk (wait polling mode), this patch skips inter-transaction guarding before wait_event_timeout() for the wait polling mode to reveal the hidden logic. The other hidden logic is: there is msleep() inter-transaction guarding performed when the GPE storming is observed. As after merging this commit: Commit: e1d4d90fc0313d3d58cbd7912c90f8ef24df45ff Subject: ACPI / EC: Refine command storm prevention support EC_FLAGS_COMMAND_STORM is ensured to be cleared after invoking acpi_ec_transaction_unlocked(), the msleep() guard logic will never happen now. Since no one complains such change, this logic is likely added during the old times where the EC race issues are not fixed and the bugs are false root-caused to the timing issue. This patch simply removes the out-dated logic. We can restore it by stop skipping inter-transaction guarding for wait polling mode. Two different delay values are defined for msleep() and udelay() while they are merged in this patch to 550us. 2. time_after() causes additional delay in the polling mode (can only be observed in noirq suspend/resume processes where polling mode is always used) before advance_transaction() is invoked ("wait polling" log is added before wait_event_timeout()). We can see 2 wait_event_timeout() invocations. This is because time_after() ensures a ">" validation while we only need a ">=" validation here: [ 86.739909] ACPI: Waking up from system sleep state S3 [ 86.742857] ACPI : EC: 2: Increase command [ 86.742859] ACPI : EC: ***** Command(RD_EC) started ***** [ 86.742861] ACPI : EC: ===== TASK (0) ===== [ 86.742871] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.742873] ACPI : EC: EC_SC(W) = 0x80 [ 86.742876] ACPI : EC: ***** Event started ***** [ 86.742880] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.743972] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.747966] ACPI : EC: ===== TASK (0) ===== [ 86.747977] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 86.747978] ACPI : EC: EC_DATA(W) = 0x06 [ 86.747981] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.751971] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755969] ACPI : EC: ===== TASK (0) ===== [ 86.755991] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 86.755993] ACPI : EC: EC_DATA(R) = 0x03 [ 86.755994] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 86.755995] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 86.755996] ACPI : EC: 1: Decrease command This patch corrects this by using time_before() instead in ec_guard(): [ 54.283146] ACPI: Waking up from system sleep state S3 [ 54.285414] ACPI : EC: 2: Increase command [ 54.285415] ACPI : EC: ***** Command(RD_EC) started ***** [ 54.285416] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.285417] ACPI : EC: ===== TASK (0) ===== [ 54.285424] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.285425] ACPI : EC: EC_SC(W) = 0x80 [ 54.285427] ACPI : EC: ***** Event started ***** [ 54.285429] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.287209] ACPI : EC: ===== TASK (0) ===== [ 54.287218] ACPI : EC: EC_SC(R) = 0x20 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=0 [ 54.287219] ACPI : EC: EC_DATA(W) = 0x06 [ 54.287222] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291190] ACPI : EC: ===== TASK (0) ===== [ 54.291210] ACPI : EC: EC_SC(R) = 0x21 SCI_EVT=1 BURST=0 CMD=0 IBF=0 OBF=1 [ 54.291213] ACPI : EC: EC_DATA(R) = 0x03 [ 54.291214] ACPI : EC: ~~~~~ wait polling ~~~~~ [ 54.291215] ACPI : EC: ***** Command(RD_EC) stopped ***** [ 54.291216] ACPI : EC: 1: Decrease command After cleaning up all guarding logics, we have one single function ec_guard() collecting all old, non-root-caused, hidden logics. Then we can easily tune the logics in one place to respond to the bug reports. Except the time_before() change, all other changes do not change the behavior of the EC driver. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:16:42 +08:00
ec->timestamp = jiffies;
ec->busy_polling = true;
ec->polling_guard = 0;
return ec;
}
static acpi_status
acpi_ec_register_query_methods(acpi_handle handle, u32 level,
void *context, void **return_value)
{
char node_name[5];
struct acpi_buffer buffer = { sizeof(node_name), node_name };
struct acpi_ec *ec = context;
int value = 0;
acpi_status status;
status = acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer);
if (ACPI_SUCCESS(status) && sscanf(node_name, "_Q%x", &value) == 1)
acpi_ec_add_query_handler(ec, value, handle, NULL, NULL);
return AE_OK;
}
static acpi_status
ec_parse_device(acpi_handle handle, u32 Level, void *context, void **retval)
{
acpi_status status;
unsigned long long tmp = 0;
struct acpi_ec *ec = context;
/* clear addr values, ec_parse_io_ports depend on it */
ec->command_addr = ec->data_addr = 0;
status = acpi_walk_resources(handle, METHOD_NAME__CRS,
ec_parse_io_ports, ec);
if (ACPI_FAILURE(status))
return status;
if (ec->data_addr == 0 || ec->command_addr == 0)
return AE_OK;
if (boot_ec && boot_ec_is_ecdt && EC_FLAGS_IGNORE_DSDT_GPE) {
/*
* Always inherit the GPE number setting from the ECDT
* EC.
*/
ec->gpe = boot_ec->gpe;
} else {
/* Get GPE bit assignment (EC events). */
/* TODO: Add support for _GPE returning a package */
status = acpi_evaluate_integer(handle, "_GPE", NULL, &tmp);
if (ACPI_FAILURE(status))
return status;
ec->gpe = tmp;
}
/* Use the global lock for all EC transactions? */
tmp = 0;
acpi_evaluate_integer(handle, "_GLK", NULL, &tmp);
ec->global_lock = tmp;
ec->handle = handle;
return AE_CTRL_TERMINATE;
}
/*
* Note: This function returns an error code only when the address space
* handler is not installed, which means "not able to handle
* transactions".
*/
static int ec_install_handlers(struct acpi_ec *ec, bool handle_events)
{
acpi_status status;
acpi_ec_start(ec, false);
if (!test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) {
acpi_ec_enter_noirq(ec);
status = acpi_install_address_space_handler(ec->handle,
ACPI_ADR_SPACE_EC,
&acpi_ec_space_handler,
NULL, ec);
if (ACPI_FAILURE(status)) {
if (status == AE_NOT_FOUND) {
/*
* Maybe OS fails in evaluating the _REG
* object. The AE_NOT_FOUND error will be
* ignored and OS * continue to initialize
* EC.
*/
pr_err("Fail in evaluating the _REG object"
" of EC device. Broken bios is suspected.\n");
} else {
acpi_ec_stop(ec, false);
return -ENODEV;
}
}
set_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags);
}
if (!handle_events)
return 0;
if (!test_bit(EC_FLAGS_EVT_HANDLER_INSTALLED, &ec->flags)) {
/* Find and register all query methods */
acpi_walk_namespace(ACPI_TYPE_METHOD, ec->handle, 1,
acpi_ec_register_query_methods,
NULL, ec, NULL);
set_bit(EC_FLAGS_EVT_HANDLER_INSTALLED, &ec->flags);
}
if (!test_bit(EC_FLAGS_GPE_HANDLER_INSTALLED, &ec->flags)) {
status = acpi_install_gpe_raw_handler(NULL, ec->gpe,
ACPI_GPE_EDGE_TRIGGERED,
&acpi_ec_gpe_handler, ec);
/* This is not fatal as we can poll EC events */
if (ACPI_SUCCESS(status)) {
set_bit(EC_FLAGS_GPE_HANDLER_INSTALLED, &ec->flags);
acpi_ec_leave_noirq(ec);
if (test_bit(EC_FLAGS_STARTED, &ec->flags) &&
ec->reference_count >= 1)
acpi_ec_enable_gpe(ec, true);
/* EC is fully operational, allow queries */
acpi_ec_enable_event(ec);
}
}
return 0;
}
static void ec_remove_handlers(struct acpi_ec *ec)
{
if (test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) {
if (ACPI_FAILURE(acpi_remove_address_space_handler(ec->handle,
ACPI_ADR_SPACE_EC, &acpi_ec_space_handler)))
pr_err("failed to remove space handler\n");
clear_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags);
}
/*
* Stops handling the EC transactions after removing the operation
* region handler. This is required because _REG(DISCONNECT)
* invoked during the removal can result in new EC transactions.
*
* Flushes the EC requests and thus disables the GPE before
* removing the GPE handler. This is required by the current ACPICA
* GPE core. ACPICA GPE core will automatically disable a GPE when
* it is indicated but there is no way to handle it. So the drivers
* must disable the GPEs prior to removing the GPE handlers.
*/
acpi_ec_stop(ec, false);
if (test_bit(EC_FLAGS_GPE_HANDLER_INSTALLED, &ec->flags)) {
if (ACPI_FAILURE(acpi_remove_gpe_handler(NULL, ec->gpe,
&acpi_ec_gpe_handler)))
pr_err("failed to remove gpe handler\n");
clear_bit(EC_FLAGS_GPE_HANDLER_INSTALLED, &ec->flags);
}
if (test_bit(EC_FLAGS_EVT_HANDLER_INSTALLED, &ec->flags)) {
acpi_ec_remove_query_handlers(ec, true, 0);
clear_bit(EC_FLAGS_EVT_HANDLER_INSTALLED, &ec->flags);
}
}
static int acpi_ec_setup(struct acpi_ec *ec, bool handle_events)
{
int ret;
ret = ec_install_handlers(ec, handle_events);
if (ret)
return ret;
/* First EC capable of handling transactions */
if (!first_ec) {
first_ec = ec;
acpi_handle_info(first_ec->handle, "Used as first EC\n");
}
acpi_handle_info(ec->handle,
"GPE=0x%lx, EC_CMD/EC_SC=0x%lx, EC_DATA=0x%lx\n",
ec->gpe, ec->command_addr, ec->data_addr);
return ret;
}
static int acpi_config_boot_ec(struct acpi_ec *ec, acpi_handle handle,
bool handle_events, bool is_ecdt)
{
int ret;
/*
* Changing the ACPI handle results in a re-configuration of the
* boot EC. And if it happens after the namespace initialization,
* it causes _REG evaluations.
*/
if (boot_ec && boot_ec->handle != handle)
ec_remove_handlers(boot_ec);
/* Unset old boot EC */
if (boot_ec != ec)
acpi_ec_free(boot_ec);
/*
* ECDT device creation is split into acpi_ec_ecdt_probe() and
* acpi_ec_ecdt_start(). This function takes care of completing the
* ECDT parsing logic as the handle update should be performed
* between the installation/uninstallation of the handlers.
*/
if (ec->handle != handle)
ec->handle = handle;
ret = acpi_ec_setup(ec, handle_events);
if (ret)
return ret;
/* Set new boot EC */
if (!boot_ec) {
boot_ec = ec;
boot_ec_is_ecdt = is_ecdt;
}
acpi_handle_info(boot_ec->handle,
"Used as boot %s EC to handle transactions%s\n",
is_ecdt ? "ECDT" : "DSDT",
handle_events ? " and events" : "");
return ret;
}
static bool acpi_ec_ecdt_get_handle(acpi_handle *phandle)
{
struct acpi_table_ecdt *ecdt_ptr;
acpi_status status;
acpi_handle handle;
status = acpi_get_table(ACPI_SIG_ECDT, 1,
(struct acpi_table_header **)&ecdt_ptr);
if (ACPI_FAILURE(status))
return false;
status = acpi_get_handle(NULL, ecdt_ptr->id, &handle);
if (ACPI_FAILURE(status))
return false;
*phandle = handle;
return true;
}
static bool acpi_is_boot_ec(struct acpi_ec *ec)
{
if (!boot_ec)
return false;
if (ec->handle == boot_ec->handle &&
ec->gpe == boot_ec->gpe &&
ec->command_addr == boot_ec->command_addr &&
ec->data_addr == boot_ec->data_addr)
return true;
return false;
}
static int acpi_ec_add(struct acpi_device *device)
{
struct acpi_ec *ec = NULL;
int ret;
strcpy(acpi_device_name(device), ACPI_EC_DEVICE_NAME);
strcpy(acpi_device_class(device), ACPI_EC_CLASS);
ec = acpi_ec_alloc();
if (!ec)
return -ENOMEM;
if (ec_parse_device(device->handle, 0, ec, NULL) !=
AE_CTRL_TERMINATE) {
ret = -EINVAL;
goto err_alloc;
}
if (acpi_is_boot_ec(ec)) {
boot_ec_is_ecdt = false;
acpi_handle_debug(ec->handle, "duplicated.\n");
acpi_ec_free(ec);
ec = boot_ec;
ret = acpi_config_boot_ec(ec, ec->handle, true, false);
} else
ret = acpi_ec_setup(ec, true);
if (ret)
goto err_query;
device->driver_data = ec;
ret = !!request_region(ec->data_addr, 1, "EC data");
WARN(!ret, "Could not request EC data io port 0x%lx", ec->data_addr);
ret = !!request_region(ec->command_addr, 1, "EC cmd");
WARN(!ret, "Could not request EC cmd io port 0x%lx", ec->command_addr);
/* Reprobe devices depending on the EC */
acpi_walk_dep_device_list(ec->handle);
acpi_handle_debug(ec->handle, "enumerated.\n");
return 0;
err_query:
if (ec != boot_ec)
acpi_ec_remove_query_handlers(ec, true, 0);
err_alloc:
if (ec != boot_ec)
acpi_ec_free(ec);
return ret;
}
static int acpi_ec_remove(struct acpi_device *device)
{
struct acpi_ec *ec;
if (!device)
return -EINVAL;
ec = acpi_driver_data(device);
release_region(ec->data_addr, 1);
release_region(ec->command_addr, 1);
device->driver_data = NULL;
if (ec != boot_ec) {
ec_remove_handlers(ec);
acpi_ec_free(ec);
}
return 0;
}
static acpi_status
ec_parse_io_ports(struct acpi_resource *resource, void *context)
{
struct acpi_ec *ec = context;
if (resource->type != ACPI_RESOURCE_TYPE_IO)
return AE_OK;
/*
* The first address region returned is the data port, and
* the second address region returned is the status/command
* port.
*/
if (ec->data_addr == 0)
ec->data_addr = resource->data.io.minimum;
else if (ec->command_addr == 0)
ec->command_addr = resource->data.io.minimum;
else
return AE_CTRL_TERMINATE;
return AE_OK;
}
static const struct acpi_device_id ec_device_ids[] = {
{"PNP0C09", 0},
{"", 0},
};
ACPI / EC: Add support to skip boot stage DSDT probe We prepared _INI/_STA methods for \_SB, \_SB.PCI0, \_SB.LID0 and \_SB.EC, _HID(PNP0C09)/_CRS/_GPE for \_SB.EC to poke Windows behavior with qemu, we got the following execution sequence: \_SB._INI \_SB.PCI0._STA \_SB.LID0._STA \_SB.EC._STA \_SB.PCI0._INI \_SB.LID0._INI \_SB.EC._INI There is no extra DSDT EC device enumeration process occurring before the main ACPI device enumeration process. That means acpi_ec_dsdt_probe() is not Windows-compatible. Tracking back, it was added by the following commit: Commit: c5279dee26c0e8d7c4200993bfc4b540d2469598 Subject: ACPI: EC: Add some basic check for ECDT data but that commit was misguided. Why we shouldn't enumerate DSDT EC before the main ACPI device enumeration? The only way to know if the DSDT EC is valid would be to evaluate its _STA control method, but it's not safe to evaluate this control method that early and out of the ACPI enumeration process, because _STA may refer to entities (such as resources or ACPI device objects) that may not have been initialized before OSPM starts to enumerate them via the main ACPI device enumeration. But after we had reverted back to the expected behavior, a regression was reported. On that platform, there is no ECDT, but the platform control methods access EC operation region earlier than Linux expects causing some ACPI method execution errors. For this reason, we just go back to old behavior to still probe DSDT EC as the boot EC. However, that turns out to lead to yet another functional breakage and in order to work around all of the problems, we skip boot stage DSDT probe when the ECDT exists so that a later quirk can always use correct ECDT GPE setting. Link: http://bugzilla.kernel.org/show_bug.cgi?id=11880 Link: http://bugzilla.kernel.org/show_bug.cgi?id=119261 Link: http://bugzilla.kernel.org/show_bug.cgi?id=195651 Tested-by: Daniel Drake <drake@endlessm.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> [ rjw: Changelog & comments massage ] Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-06-15 09:41:41 +08:00
/*
* This function is not Windows-compatible as Windows never enumerates the
* namespace EC before the main ACPI device enumeration process. It is
* retained for historical reason and will be deprecated in the future.
*/
int __init acpi_ec_dsdt_probe(void)
{
acpi_status status;
struct acpi_ec *ec;
int ret;
ACPI / EC: Add support to skip boot stage DSDT probe We prepared _INI/_STA methods for \_SB, \_SB.PCI0, \_SB.LID0 and \_SB.EC, _HID(PNP0C09)/_CRS/_GPE for \_SB.EC to poke Windows behavior with qemu, we got the following execution sequence: \_SB._INI \_SB.PCI0._STA \_SB.LID0._STA \_SB.EC._STA \_SB.PCI0._INI \_SB.LID0._INI \_SB.EC._INI There is no extra DSDT EC device enumeration process occurring before the main ACPI device enumeration process. That means acpi_ec_dsdt_probe() is not Windows-compatible. Tracking back, it was added by the following commit: Commit: c5279dee26c0e8d7c4200993bfc4b540d2469598 Subject: ACPI: EC: Add some basic check for ECDT data but that commit was misguided. Why we shouldn't enumerate DSDT EC before the main ACPI device enumeration? The only way to know if the DSDT EC is valid would be to evaluate its _STA control method, but it's not safe to evaluate this control method that early and out of the ACPI enumeration process, because _STA may refer to entities (such as resources or ACPI device objects) that may not have been initialized before OSPM starts to enumerate them via the main ACPI device enumeration. But after we had reverted back to the expected behavior, a regression was reported. On that platform, there is no ECDT, but the platform control methods access EC operation region earlier than Linux expects causing some ACPI method execution errors. For this reason, we just go back to old behavior to still probe DSDT EC as the boot EC. However, that turns out to lead to yet another functional breakage and in order to work around all of the problems, we skip boot stage DSDT probe when the ECDT exists so that a later quirk can always use correct ECDT GPE setting. Link: http://bugzilla.kernel.org/show_bug.cgi?id=11880 Link: http://bugzilla.kernel.org/show_bug.cgi?id=119261 Link: http://bugzilla.kernel.org/show_bug.cgi?id=195651 Tested-by: Daniel Drake <drake@endlessm.com> Signed-off-by: Lv Zheng <lv.zheng@intel.com> [ rjw: Changelog & comments massage ] Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-06-15 09:41:41 +08:00
/*
* If a platform has ECDT, there is no need to proceed as the
* following probe is not a part of the ACPI device enumeration,
* executing _STA is not safe, and thus this probe may risk of
* picking up an invalid EC device.
*/
if (boot_ec)
return -ENODEV;
ec = acpi_ec_alloc();
if (!ec)
return -ENOMEM;
/*
* At this point, the namespace is initialized, so start to find
* the namespace objects.
*/
status = acpi_get_devices(ec_device_ids[0].id,
ec_parse_device, ec, NULL);
if (ACPI_FAILURE(status) || !ec->handle) {
ret = -ENODEV;
goto error;
}
/*
* When the DSDT EC is available, always re-configure boot EC to
* have _REG evaluated. _REG can only be evaluated after the
* namespace initialization.
* At this point, the GPE is not fully initialized, so do not to
* handle the events.
*/
ret = acpi_config_boot_ec(ec, ec->handle, false, false);
error:
if (ret)
acpi_ec_free(ec);
return ret;
}
/*
* If the DSDT EC is not functioning, we still need to prepare a fully
* functioning ECDT EC first in order to handle the events.
* https://bugzilla.kernel.org/show_bug.cgi?id=115021
*/
static int __init acpi_ec_ecdt_start(void)
{
acpi_handle handle;
if (!boot_ec)
return -ENODEV;
/*
* The DSDT EC should have already been started in
* acpi_ec_add().
*/
if (!boot_ec_is_ecdt)
return -ENODEV;
/*
* At this point, the namespace and the GPE is initialized, so
* start to find the namespace objects and handle the events.
*/
if (!acpi_ec_ecdt_get_handle(&handle))
return -ENODEV;
return acpi_config_boot_ec(boot_ec, handle, true, true);
}
ACPI / EC: Fix EC_FLAGS_QUERY_HANDSHAKE platforms using new event clearing timing. It is reported that on several platforms, EC firmware will not respond non-expected QR_EC (see EC_FLAGS_QUERY_HANDSHAKE, only write QR_EC when SCI_EVT is set). Unfortunately, ACPI specification doesn't define when the SCI_EVT should be cleared by the firmware, thus the original implementation queued up second QR_EC right after writing QR_EC command and before reading the returned event value as at that time the SCI_EVT is ensured not cleared. This behavior is also based on the assumption that the firmware should be able to return 0x00 to indicate "no outstanding event". This behavior did fix issues on Samsung platforms where the spurious query value of 0x00 is supported and didn't break platforms in my test queue. But recently, specific Acer, Asus, Lenovo platforms keep on blaming this change. This patch changes the behavior to re-check the SCI_EVT a bit later and removes EC_FLAGS_QUERY_HANDSHAKE quirks, hoping this is the Windows compliant EC driver behavior. In order to be robust to the possible regressions, instead of removing the quirk directly, this patch keeps the quirk code, removes the quirk users and keeps old behavior for Samsung platforms. Cc: 3.16+ <stable@vger.kernel.org> # 3.16+ Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:45 +08:00
#if 0
/*
ACPI / EC: Fix EC_FLAGS_QUERY_HANDSHAKE platforms using new event clearing timing. It is reported that on several platforms, EC firmware will not respond non-expected QR_EC (see EC_FLAGS_QUERY_HANDSHAKE, only write QR_EC when SCI_EVT is set). Unfortunately, ACPI specification doesn't define when the SCI_EVT should be cleared by the firmware, thus the original implementation queued up second QR_EC right after writing QR_EC command and before reading the returned event value as at that time the SCI_EVT is ensured not cleared. This behavior is also based on the assumption that the firmware should be able to return 0x00 to indicate "no outstanding event". This behavior did fix issues on Samsung platforms where the spurious query value of 0x00 is supported and didn't break platforms in my test queue. But recently, specific Acer, Asus, Lenovo platforms keep on blaming this change. This patch changes the behavior to re-check the SCI_EVT a bit later and removes EC_FLAGS_QUERY_HANDSHAKE quirks, hoping this is the Windows compliant EC driver behavior. In order to be robust to the possible regressions, instead of removing the quirk directly, this patch keeps the quirk code, removes the quirk users and keeps old behavior for Samsung platforms. Cc: 3.16+ <stable@vger.kernel.org> # 3.16+ Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:45 +08:00
* Some EC firmware variations refuses to respond QR_EC when SCI_EVT is not
* set, for which case, we complete the QR_EC without issuing it to the
* firmware.
* https://bugzilla.kernel.org/show_bug.cgi?id=82611
* https://bugzilla.kernel.org/show_bug.cgi?id=97381
*/
static int ec_flag_query_handshake(const struct dmi_system_id *id)
{
pr_debug("Detected the EC firmware requiring QR_EC issued when SCI_EVT set\n");
EC_FLAGS_QUERY_HANDSHAKE = 1;
return 0;
}
ACPI / EC: Fix EC_FLAGS_QUERY_HANDSHAKE platforms using new event clearing timing. It is reported that on several platforms, EC firmware will not respond non-expected QR_EC (see EC_FLAGS_QUERY_HANDSHAKE, only write QR_EC when SCI_EVT is set). Unfortunately, ACPI specification doesn't define when the SCI_EVT should be cleared by the firmware, thus the original implementation queued up second QR_EC right after writing QR_EC command and before reading the returned event value as at that time the SCI_EVT is ensured not cleared. This behavior is also based on the assumption that the firmware should be able to return 0x00 to indicate "no outstanding event". This behavior did fix issues on Samsung platforms where the spurious query value of 0x00 is supported and didn't break platforms in my test queue. But recently, specific Acer, Asus, Lenovo platforms keep on blaming this change. This patch changes the behavior to re-check the SCI_EVT a bit later and removes EC_FLAGS_QUERY_HANDSHAKE quirks, hoping this is the Windows compliant EC driver behavior. In order to be robust to the possible regressions, instead of removing the quirk directly, this patch keeps the quirk code, removes the quirk users and keeps old behavior for Samsung platforms. Cc: 3.16+ <stable@vger.kernel.org> # 3.16+ Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:45 +08:00
#endif
/*
* Some ECDTs contain wrong register addresses.
* MSI MS-171F
* https://bugzilla.kernel.org/show_bug.cgi?id=12461
*/
ACPI 2.0 / ECDT: Remove early namespace reference from EC All operation region accesses are allowed by AML interpreter when AML is executed, so actually BIOSen are responsible to avoid the operation region accesses in AML before OSPM has prepared an operation region driver. This is done via _REG control method. So AML code normally sets a global named object REGC to 1 when _REG(3, 1) is evaluated. Then what is ECDT? Quoting from ACPI spec 6.0, 5.2.15 Embedded Controller Boot Resources Table (ECDT): "The presence of this table allows OSPM to provide Embedded Controller operation region space access before the namespace has been evaluated." Spec also suggests a compatible mean to indicate the early EC access availability: Device (EC) { Name (REGC, Ones) Method (_REG, 2) { If (LEqual (Arg0, 3)) { Store (Arg1, REGC) } } Method (ECAV) { If (LEqual (REGC, Ones)) { If (LGreaterEqual (_REV, 2)) { Return (One) } Else { Return (Zero) } } Else { Return (REGC) } } } In this way, it allows EC accesses to happen before EC._REG(3, 1) is invoked. But ECAV is not the only way practical BIOSen using to indicate the early EC access availibility, the known variations include: 1. Setting REGC to One in \_SB._INI when _REV >= 2. Since \_SB._INI is the first control method evaluated by OSPM during the enumeration, this allows EC accesses to happen for the entire enumeration process before the namespace EC is enumerated. 2. Initialize REGC to One by default, this even allows EC accesses to happen during the table loading. Linux is now broken around ECDT support during the long term bug fixing work because it has merged many wrong ECDT bug fixes (see details below). Linux currently uses namespace EC's settings instead of ECDT settings when ECDT is detected. This apparently will result in namespace walk and _CRS/_GPE/_REG evaluations. Such stuffs could only happen after namespace is ready, while ECDT is purposely to be used before namespace is ready. The wrong bug fixing story is: 1. Link 1: At Linux ACPI early stages, "no _Lxx/_Exx/_Qxx evaluation can happen before the namespace is ready" are not ensured by ACPICA core and Linux. This is currently ensured by deferred enabling of GPE and defered registering of EC query methods (acpi_ec_register_query_methods). 2. Link 2: Reporters reported buggy ECDTs, expecting quirks for the platform. Originally, the quirk is simple, only doing things with ECDT. Bug 9399 and 12461 are platforms (Asus L4R, Asus M6R, MSI MS-171F) reported to have wrong ECDT IO port addresses, the port addresses are reversed. Bug 11880 is a platform (Asus X50GL) reported to have 0 valued port addresses, we can see that all EC accesses are protected by ECAV on this platform, so actually no early EC accesses is required by this platform. 3. Link 3: But when the bug fixing developer was requested to provide a handy and non-quirk bug fix, he tried to use correct EC settings from namespace and broke the spec purpose. We can even see that the developer was suffered from many regrssions. One interesting one is 14086, where the actual root cause obviously should be: _REG is evaluated too early. But unfortunately, the bug is fixed in a totally wrong way. So everything goes wrong from these commits: Commit: c6cb0e878446c79f42e7833d7bb69ed6bfbb381f Subject: ACPI: EC: Don't trust ECDT tables from ASUS Commit: a5032bfdd9c80e0231a6324661e123818eb46ecd Subject: ACPI: EC: Always parse EC device This patch reverts Linux behavior to simple ECDT quirk support in order to stop early _CRS/_GPE/_REG evaluations. For Bug 9399, 12461, since it is reported that the platforms require early EC accesses, this patch restores the simple ECDT quirks for them. For Bug 11880, since it is not reported that the platform requires early EC accesses and its ACPI tables contain correct ECAV, we choose an ECDT enumeration failure for this platform. Link 1: https://bugzilla.kernel.org/show_bug.cgi?id=9916 http://bugzilla.kernel.org/show_bug.cgi?id=10100 https://lkml.org/lkml/2008/2/25/282 Link 2: https://bugzilla.kernel.org/show_bug.cgi?id=9399 https://bugzilla.kernel.org/show_bug.cgi?id=12461 https://bugzilla.kernel.org/show_bug.cgi?id=11880 Link 3: https://bugzilla.kernel.org/show_bug.cgi?id=11884 https://bugzilla.kernel.org/show_bug.cgi?id=14081 https://bugzilla.kernel.org/show_bug.cgi?id=14086 https://bugzilla.kernel.org/show_bug.cgi?id=14446 Link 4: https://bugzilla.kernel.org/show_bug.cgi?id=112911 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Chris Bainbridge <chris.bainbridge@gmail.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-24 10:42:53 +08:00
static int ec_correct_ecdt(const struct dmi_system_id *id)
{
pr_debug("Detected system needing ECDT address correction.\n");
EC_FLAGS_CORRECT_ECDT = 1;
return 0;
}
/*
* Some DSDTs contain wrong GPE setting.
* Asus FX502VD/VE, GL702VMK, X550VXK, X580VD
* https://bugzilla.kernel.org/show_bug.cgi?id=195651
*/
static int ec_honor_ecdt_gpe(const struct dmi_system_id *id)
{
pr_debug("Detected system needing ignore DSDT GPE setting.\n");
EC_FLAGS_IGNORE_DSDT_GPE = 1;
return 0;
}
static struct dmi_system_id ec_dmi_table[] __initdata = {
{
ACPI 2.0 / ECDT: Remove early namespace reference from EC All operation region accesses are allowed by AML interpreter when AML is executed, so actually BIOSen are responsible to avoid the operation region accesses in AML before OSPM has prepared an operation region driver. This is done via _REG control method. So AML code normally sets a global named object REGC to 1 when _REG(3, 1) is evaluated. Then what is ECDT? Quoting from ACPI spec 6.0, 5.2.15 Embedded Controller Boot Resources Table (ECDT): "The presence of this table allows OSPM to provide Embedded Controller operation region space access before the namespace has been evaluated." Spec also suggests a compatible mean to indicate the early EC access availability: Device (EC) { Name (REGC, Ones) Method (_REG, 2) { If (LEqual (Arg0, 3)) { Store (Arg1, REGC) } } Method (ECAV) { If (LEqual (REGC, Ones)) { If (LGreaterEqual (_REV, 2)) { Return (One) } Else { Return (Zero) } } Else { Return (REGC) } } } In this way, it allows EC accesses to happen before EC._REG(3, 1) is invoked. But ECAV is not the only way practical BIOSen using to indicate the early EC access availibility, the known variations include: 1. Setting REGC to One in \_SB._INI when _REV >= 2. Since \_SB._INI is the first control method evaluated by OSPM during the enumeration, this allows EC accesses to happen for the entire enumeration process before the namespace EC is enumerated. 2. Initialize REGC to One by default, this even allows EC accesses to happen during the table loading. Linux is now broken around ECDT support during the long term bug fixing work because it has merged many wrong ECDT bug fixes (see details below). Linux currently uses namespace EC's settings instead of ECDT settings when ECDT is detected. This apparently will result in namespace walk and _CRS/_GPE/_REG evaluations. Such stuffs could only happen after namespace is ready, while ECDT is purposely to be used before namespace is ready. The wrong bug fixing story is: 1. Link 1: At Linux ACPI early stages, "no _Lxx/_Exx/_Qxx evaluation can happen before the namespace is ready" are not ensured by ACPICA core and Linux. This is currently ensured by deferred enabling of GPE and defered registering of EC query methods (acpi_ec_register_query_methods). 2. Link 2: Reporters reported buggy ECDTs, expecting quirks for the platform. Originally, the quirk is simple, only doing things with ECDT. Bug 9399 and 12461 are platforms (Asus L4R, Asus M6R, MSI MS-171F) reported to have wrong ECDT IO port addresses, the port addresses are reversed. Bug 11880 is a platform (Asus X50GL) reported to have 0 valued port addresses, we can see that all EC accesses are protected by ECAV on this platform, so actually no early EC accesses is required by this platform. 3. Link 3: But when the bug fixing developer was requested to provide a handy and non-quirk bug fix, he tried to use correct EC settings from namespace and broke the spec purpose. We can even see that the developer was suffered from many regrssions. One interesting one is 14086, where the actual root cause obviously should be: _REG is evaluated too early. But unfortunately, the bug is fixed in a totally wrong way. So everything goes wrong from these commits: Commit: c6cb0e878446c79f42e7833d7bb69ed6bfbb381f Subject: ACPI: EC: Don't trust ECDT tables from ASUS Commit: a5032bfdd9c80e0231a6324661e123818eb46ecd Subject: ACPI: EC: Always parse EC device This patch reverts Linux behavior to simple ECDT quirk support in order to stop early _CRS/_GPE/_REG evaluations. For Bug 9399, 12461, since it is reported that the platforms require early EC accesses, this patch restores the simple ECDT quirks for them. For Bug 11880, since it is not reported that the platform requires early EC accesses and its ACPI tables contain correct ECAV, we choose an ECDT enumeration failure for this platform. Link 1: https://bugzilla.kernel.org/show_bug.cgi?id=9916 http://bugzilla.kernel.org/show_bug.cgi?id=10100 https://lkml.org/lkml/2008/2/25/282 Link 2: https://bugzilla.kernel.org/show_bug.cgi?id=9399 https://bugzilla.kernel.org/show_bug.cgi?id=12461 https://bugzilla.kernel.org/show_bug.cgi?id=11880 Link 3: https://bugzilla.kernel.org/show_bug.cgi?id=11884 https://bugzilla.kernel.org/show_bug.cgi?id=14081 https://bugzilla.kernel.org/show_bug.cgi?id=14086 https://bugzilla.kernel.org/show_bug.cgi?id=14446 Link 4: https://bugzilla.kernel.org/show_bug.cgi?id=112911 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Chris Bainbridge <chris.bainbridge@gmail.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-24 10:42:53 +08:00
ec_correct_ecdt, "MSI MS-171F", {
ACPI / EC: Remove non-root-caused busy polling quirks. { Update to correct 1 patch subject in the description } We have fixed a lot of race issues in the EC driver recently. The following commit introduces MSI udelay()/msleep() quirk to MSI laptops to make EC firmware working for bug 12011 without root causing any EC driver race issues: Commit: 5423a0cb3f74c16e90683f8ee1cec6c240a9556e Subject: ACPI: EC: Add delay for slow MSI controller Commit: 34ff4dbccccce54c83b1234d39b7ad9e548a75dd Subject: ACPI: EC: Separate delays for MSI hardware The following commit extends ECDT validation quirk to MSI laptops to make EC driver locating EC registers properly for bug 12461: Commit: a5032bfdd9c80e0231a6324661e123818eb46ecd Subject: ACPI: EC: Always parse EC device This is a different quirk than the MSI udelay()/msleep() quirk. This patch keeps validating ECDT for only "Micro-Star MS-171F" as reported. The following commit extends MSI udelay()/msleep() quirk to Quanta laptops to make EC firmware working for bug 20242, there is no requirement to validate ECDT for Quanta laptops: Commit: 534bc4e3d27096e2f3fc00c14a20efd597837a4f Mon Sep 17 00:00:00 2001 Subject: ACPI EC: enable MSI workaround for Quanta laptops The following commit extends MSI udelay()/msleep() quirk to Clevo laptops to make EC firmware working for bug 77431, there is no requirement to validate ECDT for Clevo laptops: Commit: 777cb382958851c88763253fe00a26529be4c0e9 Subject: ACPI / EC: Add msi quirk for Clevo W350etq All udelay()/msleep() quirks for MSI/Quanta/Clevo seem to be the wrong fixes generated without fixing the EC driver race issues. And even if it is not wrong, the guarding can be covered by the following commits in wait polling mode: Commit: 9e295ac14d6a59180beed0735e6a504c2ee87761 Subject: ACPI / EC: Reduce ec_poll() by referencing the last register access timestamp. Commit: commit in the same series Subject: ACPI / EC: Fix and clean up register access guarding logics. The only case that is not covered is the inter-transaction guarding. And there is no evidence that we need the inter-transaction guarding upon reading the noted bug entries. So it is time to remove the quirks and let the users to try again. If there is a regression, the only thing we need to do is to restore the inter-transaction guarding for the reported platforms. Link: https://bugzilla.kernel.org/show_bug.cgi?id=12011 Link: https://bugzilla.kernel.org/show_bug.cgi?id=12461 Link: https://bugzilla.kernel.org/show_bug.cgi?id=20242 Link: https://bugzilla.kernel.org/show_bug.cgi?id=77431 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-05-15 14:37:11 +08:00
DMI_MATCH(DMI_SYS_VENDOR, "Micro-Star"),
DMI_MATCH(DMI_PRODUCT_NAME, "MS-171F"),}, NULL},
{
ec_honor_ecdt_gpe, "ASUS FX502VD", {
DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
DMI_MATCH(DMI_PRODUCT_NAME, "FX502VD"),}, NULL},
{
ec_honor_ecdt_gpe, "ASUS FX502VE", {
DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
DMI_MATCH(DMI_PRODUCT_NAME, "FX502VE"),}, NULL},
{
ec_honor_ecdt_gpe, "ASUS GL702VMK", {
DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
DMI_MATCH(DMI_PRODUCT_NAME, "GL702VMK"),}, NULL},
{
ec_honor_ecdt_gpe, "ASUS X550VXK", {
DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
DMI_MATCH(DMI_PRODUCT_NAME, "X550VXK"),}, NULL},
{
ec_honor_ecdt_gpe, "ASUS X580VD", {
DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
DMI_MATCH(DMI_PRODUCT_NAME, "X580VD"),}, NULL},
{},
};
int __init acpi_ec_ecdt_probe(void)
{
int ret;
acpi_status status;
struct acpi_table_ecdt *ecdt_ptr;
struct acpi_ec *ec;
ec = acpi_ec_alloc();
if (!ec)
return -ENOMEM;
/*
* Generate a boot ec context
*/
dmi_check_system(ec_dmi_table);
status = acpi_get_table(ACPI_SIG_ECDT, 1,
(struct acpi_table_header **)&ecdt_ptr);
ACPI 2.0 / ECDT: Remove early namespace reference from EC All operation region accesses are allowed by AML interpreter when AML is executed, so actually BIOSen are responsible to avoid the operation region accesses in AML before OSPM has prepared an operation region driver. This is done via _REG control method. So AML code normally sets a global named object REGC to 1 when _REG(3, 1) is evaluated. Then what is ECDT? Quoting from ACPI spec 6.0, 5.2.15 Embedded Controller Boot Resources Table (ECDT): "The presence of this table allows OSPM to provide Embedded Controller operation region space access before the namespace has been evaluated." Spec also suggests a compatible mean to indicate the early EC access availability: Device (EC) { Name (REGC, Ones) Method (_REG, 2) { If (LEqual (Arg0, 3)) { Store (Arg1, REGC) } } Method (ECAV) { If (LEqual (REGC, Ones)) { If (LGreaterEqual (_REV, 2)) { Return (One) } Else { Return (Zero) } } Else { Return (REGC) } } } In this way, it allows EC accesses to happen before EC._REG(3, 1) is invoked. But ECAV is not the only way practical BIOSen using to indicate the early EC access availibility, the known variations include: 1. Setting REGC to One in \_SB._INI when _REV >= 2. Since \_SB._INI is the first control method evaluated by OSPM during the enumeration, this allows EC accesses to happen for the entire enumeration process before the namespace EC is enumerated. 2. Initialize REGC to One by default, this even allows EC accesses to happen during the table loading. Linux is now broken around ECDT support during the long term bug fixing work because it has merged many wrong ECDT bug fixes (see details below). Linux currently uses namespace EC's settings instead of ECDT settings when ECDT is detected. This apparently will result in namespace walk and _CRS/_GPE/_REG evaluations. Such stuffs could only happen after namespace is ready, while ECDT is purposely to be used before namespace is ready. The wrong bug fixing story is: 1. Link 1: At Linux ACPI early stages, "no _Lxx/_Exx/_Qxx evaluation can happen before the namespace is ready" are not ensured by ACPICA core and Linux. This is currently ensured by deferred enabling of GPE and defered registering of EC query methods (acpi_ec_register_query_methods). 2. Link 2: Reporters reported buggy ECDTs, expecting quirks for the platform. Originally, the quirk is simple, only doing things with ECDT. Bug 9399 and 12461 are platforms (Asus L4R, Asus M6R, MSI MS-171F) reported to have wrong ECDT IO port addresses, the port addresses are reversed. Bug 11880 is a platform (Asus X50GL) reported to have 0 valued port addresses, we can see that all EC accesses are protected by ECAV on this platform, so actually no early EC accesses is required by this platform. 3. Link 3: But when the bug fixing developer was requested to provide a handy and non-quirk bug fix, he tried to use correct EC settings from namespace and broke the spec purpose. We can even see that the developer was suffered from many regrssions. One interesting one is 14086, where the actual root cause obviously should be: _REG is evaluated too early. But unfortunately, the bug is fixed in a totally wrong way. So everything goes wrong from these commits: Commit: c6cb0e878446c79f42e7833d7bb69ed6bfbb381f Subject: ACPI: EC: Don't trust ECDT tables from ASUS Commit: a5032bfdd9c80e0231a6324661e123818eb46ecd Subject: ACPI: EC: Always parse EC device This patch reverts Linux behavior to simple ECDT quirk support in order to stop early _CRS/_GPE/_REG evaluations. For Bug 9399, 12461, since it is reported that the platforms require early EC accesses, this patch restores the simple ECDT quirks for them. For Bug 11880, since it is not reported that the platform requires early EC accesses and its ACPI tables contain correct ECAV, we choose an ECDT enumeration failure for this platform. Link 1: https://bugzilla.kernel.org/show_bug.cgi?id=9916 http://bugzilla.kernel.org/show_bug.cgi?id=10100 https://lkml.org/lkml/2008/2/25/282 Link 2: https://bugzilla.kernel.org/show_bug.cgi?id=9399 https://bugzilla.kernel.org/show_bug.cgi?id=12461 https://bugzilla.kernel.org/show_bug.cgi?id=11880 Link 3: https://bugzilla.kernel.org/show_bug.cgi?id=11884 https://bugzilla.kernel.org/show_bug.cgi?id=14081 https://bugzilla.kernel.org/show_bug.cgi?id=14086 https://bugzilla.kernel.org/show_bug.cgi?id=14446 Link 4: https://bugzilla.kernel.org/show_bug.cgi?id=112911 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Chris Bainbridge <chris.bainbridge@gmail.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-24 10:42:53 +08:00
if (ACPI_FAILURE(status)) {
ret = -ENODEV;
goto error;
}
ACPI 2.0 / ECDT: Remove early namespace reference from EC All operation region accesses are allowed by AML interpreter when AML is executed, so actually BIOSen are responsible to avoid the operation region accesses in AML before OSPM has prepared an operation region driver. This is done via _REG control method. So AML code normally sets a global named object REGC to 1 when _REG(3, 1) is evaluated. Then what is ECDT? Quoting from ACPI spec 6.0, 5.2.15 Embedded Controller Boot Resources Table (ECDT): "The presence of this table allows OSPM to provide Embedded Controller operation region space access before the namespace has been evaluated." Spec also suggests a compatible mean to indicate the early EC access availability: Device (EC) { Name (REGC, Ones) Method (_REG, 2) { If (LEqual (Arg0, 3)) { Store (Arg1, REGC) } } Method (ECAV) { If (LEqual (REGC, Ones)) { If (LGreaterEqual (_REV, 2)) { Return (One) } Else { Return (Zero) } } Else { Return (REGC) } } } In this way, it allows EC accesses to happen before EC._REG(3, 1) is invoked. But ECAV is not the only way practical BIOSen using to indicate the early EC access availibility, the known variations include: 1. Setting REGC to One in \_SB._INI when _REV >= 2. Since \_SB._INI is the first control method evaluated by OSPM during the enumeration, this allows EC accesses to happen for the entire enumeration process before the namespace EC is enumerated. 2. Initialize REGC to One by default, this even allows EC accesses to happen during the table loading. Linux is now broken around ECDT support during the long term bug fixing work because it has merged many wrong ECDT bug fixes (see details below). Linux currently uses namespace EC's settings instead of ECDT settings when ECDT is detected. This apparently will result in namespace walk and _CRS/_GPE/_REG evaluations. Such stuffs could only happen after namespace is ready, while ECDT is purposely to be used before namespace is ready. The wrong bug fixing story is: 1. Link 1: At Linux ACPI early stages, "no _Lxx/_Exx/_Qxx evaluation can happen before the namespace is ready" are not ensured by ACPICA core and Linux. This is currently ensured by deferred enabling of GPE and defered registering of EC query methods (acpi_ec_register_query_methods). 2. Link 2: Reporters reported buggy ECDTs, expecting quirks for the platform. Originally, the quirk is simple, only doing things with ECDT. Bug 9399 and 12461 are platforms (Asus L4R, Asus M6R, MSI MS-171F) reported to have wrong ECDT IO port addresses, the port addresses are reversed. Bug 11880 is a platform (Asus X50GL) reported to have 0 valued port addresses, we can see that all EC accesses are protected by ECAV on this platform, so actually no early EC accesses is required by this platform. 3. Link 3: But when the bug fixing developer was requested to provide a handy and non-quirk bug fix, he tried to use correct EC settings from namespace and broke the spec purpose. We can even see that the developer was suffered from many regrssions. One interesting one is 14086, where the actual root cause obviously should be: _REG is evaluated too early. But unfortunately, the bug is fixed in a totally wrong way. So everything goes wrong from these commits: Commit: c6cb0e878446c79f42e7833d7bb69ed6bfbb381f Subject: ACPI: EC: Don't trust ECDT tables from ASUS Commit: a5032bfdd9c80e0231a6324661e123818eb46ecd Subject: ACPI: EC: Always parse EC device This patch reverts Linux behavior to simple ECDT quirk support in order to stop early _CRS/_GPE/_REG evaluations. For Bug 9399, 12461, since it is reported that the platforms require early EC accesses, this patch restores the simple ECDT quirks for them. For Bug 11880, since it is not reported that the platform requires early EC accesses and its ACPI tables contain correct ECAV, we choose an ECDT enumeration failure for this platform. Link 1: https://bugzilla.kernel.org/show_bug.cgi?id=9916 http://bugzilla.kernel.org/show_bug.cgi?id=10100 https://lkml.org/lkml/2008/2/25/282 Link 2: https://bugzilla.kernel.org/show_bug.cgi?id=9399 https://bugzilla.kernel.org/show_bug.cgi?id=12461 https://bugzilla.kernel.org/show_bug.cgi?id=11880 Link 3: https://bugzilla.kernel.org/show_bug.cgi?id=11884 https://bugzilla.kernel.org/show_bug.cgi?id=14081 https://bugzilla.kernel.org/show_bug.cgi?id=14086 https://bugzilla.kernel.org/show_bug.cgi?id=14446 Link 4: https://bugzilla.kernel.org/show_bug.cgi?id=112911 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Chris Bainbridge <chris.bainbridge@gmail.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-24 10:42:53 +08:00
if (!ecdt_ptr->control.address || !ecdt_ptr->data.address) {
/*
* Asus X50GL:
* https://bugzilla.kernel.org/show_bug.cgi?id=11880
*/
ret = -ENODEV;
goto error;
}
ACPI 2.0 / ECDT: Remove early namespace reference from EC All operation region accesses are allowed by AML interpreter when AML is executed, so actually BIOSen are responsible to avoid the operation region accesses in AML before OSPM has prepared an operation region driver. This is done via _REG control method. So AML code normally sets a global named object REGC to 1 when _REG(3, 1) is evaluated. Then what is ECDT? Quoting from ACPI spec 6.0, 5.2.15 Embedded Controller Boot Resources Table (ECDT): "The presence of this table allows OSPM to provide Embedded Controller operation region space access before the namespace has been evaluated." Spec also suggests a compatible mean to indicate the early EC access availability: Device (EC) { Name (REGC, Ones) Method (_REG, 2) { If (LEqual (Arg0, 3)) { Store (Arg1, REGC) } } Method (ECAV) { If (LEqual (REGC, Ones)) { If (LGreaterEqual (_REV, 2)) { Return (One) } Else { Return (Zero) } } Else { Return (REGC) } } } In this way, it allows EC accesses to happen before EC._REG(3, 1) is invoked. But ECAV is not the only way practical BIOSen using to indicate the early EC access availibility, the known variations include: 1. Setting REGC to One in \_SB._INI when _REV >= 2. Since \_SB._INI is the first control method evaluated by OSPM during the enumeration, this allows EC accesses to happen for the entire enumeration process before the namespace EC is enumerated. 2. Initialize REGC to One by default, this even allows EC accesses to happen during the table loading. Linux is now broken around ECDT support during the long term bug fixing work because it has merged many wrong ECDT bug fixes (see details below). Linux currently uses namespace EC's settings instead of ECDT settings when ECDT is detected. This apparently will result in namespace walk and _CRS/_GPE/_REG evaluations. Such stuffs could only happen after namespace is ready, while ECDT is purposely to be used before namespace is ready. The wrong bug fixing story is: 1. Link 1: At Linux ACPI early stages, "no _Lxx/_Exx/_Qxx evaluation can happen before the namespace is ready" are not ensured by ACPICA core and Linux. This is currently ensured by deferred enabling of GPE and defered registering of EC query methods (acpi_ec_register_query_methods). 2. Link 2: Reporters reported buggy ECDTs, expecting quirks for the platform. Originally, the quirk is simple, only doing things with ECDT. Bug 9399 and 12461 are platforms (Asus L4R, Asus M6R, MSI MS-171F) reported to have wrong ECDT IO port addresses, the port addresses are reversed. Bug 11880 is a platform (Asus X50GL) reported to have 0 valued port addresses, we can see that all EC accesses are protected by ECAV on this platform, so actually no early EC accesses is required by this platform. 3. Link 3: But when the bug fixing developer was requested to provide a handy and non-quirk bug fix, he tried to use correct EC settings from namespace and broke the spec purpose. We can even see that the developer was suffered from many regrssions. One interesting one is 14086, where the actual root cause obviously should be: _REG is evaluated too early. But unfortunately, the bug is fixed in a totally wrong way. So everything goes wrong from these commits: Commit: c6cb0e878446c79f42e7833d7bb69ed6bfbb381f Subject: ACPI: EC: Don't trust ECDT tables from ASUS Commit: a5032bfdd9c80e0231a6324661e123818eb46ecd Subject: ACPI: EC: Always parse EC device This patch reverts Linux behavior to simple ECDT quirk support in order to stop early _CRS/_GPE/_REG evaluations. For Bug 9399, 12461, since it is reported that the platforms require early EC accesses, this patch restores the simple ECDT quirks for them. For Bug 11880, since it is not reported that the platform requires early EC accesses and its ACPI tables contain correct ECAV, we choose an ECDT enumeration failure for this platform. Link 1: https://bugzilla.kernel.org/show_bug.cgi?id=9916 http://bugzilla.kernel.org/show_bug.cgi?id=10100 https://lkml.org/lkml/2008/2/25/282 Link 2: https://bugzilla.kernel.org/show_bug.cgi?id=9399 https://bugzilla.kernel.org/show_bug.cgi?id=12461 https://bugzilla.kernel.org/show_bug.cgi?id=11880 Link 3: https://bugzilla.kernel.org/show_bug.cgi?id=11884 https://bugzilla.kernel.org/show_bug.cgi?id=14081 https://bugzilla.kernel.org/show_bug.cgi?id=14086 https://bugzilla.kernel.org/show_bug.cgi?id=14446 Link 4: https://bugzilla.kernel.org/show_bug.cgi?id=112911 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Chris Bainbridge <chris.bainbridge@gmail.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-24 10:42:53 +08:00
if (EC_FLAGS_CORRECT_ECDT) {
ec->command_addr = ecdt_ptr->data.address;
ec->data_addr = ecdt_ptr->control.address;
} else {
ec->command_addr = ecdt_ptr->control.address;
ec->data_addr = ecdt_ptr->data.address;
}
ec->gpe = ecdt_ptr->gpe;
/*
* At this point, the namespace is not initialized, so do not find
* the namespace objects, or handle the events.
*/
ret = acpi_config_boot_ec(ec, ACPI_ROOT_OBJECT, false, true);
error:
if (ret)
acpi_ec_free(ec);
ACPI 2.0 / ECDT: Remove early namespace reference from EC All operation region accesses are allowed by AML interpreter when AML is executed, so actually BIOSen are responsible to avoid the operation region accesses in AML before OSPM has prepared an operation region driver. This is done via _REG control method. So AML code normally sets a global named object REGC to 1 when _REG(3, 1) is evaluated. Then what is ECDT? Quoting from ACPI spec 6.0, 5.2.15 Embedded Controller Boot Resources Table (ECDT): "The presence of this table allows OSPM to provide Embedded Controller operation region space access before the namespace has been evaluated." Spec also suggests a compatible mean to indicate the early EC access availability: Device (EC) { Name (REGC, Ones) Method (_REG, 2) { If (LEqual (Arg0, 3)) { Store (Arg1, REGC) } } Method (ECAV) { If (LEqual (REGC, Ones)) { If (LGreaterEqual (_REV, 2)) { Return (One) } Else { Return (Zero) } } Else { Return (REGC) } } } In this way, it allows EC accesses to happen before EC._REG(3, 1) is invoked. But ECAV is not the only way practical BIOSen using to indicate the early EC access availibility, the known variations include: 1. Setting REGC to One in \_SB._INI when _REV >= 2. Since \_SB._INI is the first control method evaluated by OSPM during the enumeration, this allows EC accesses to happen for the entire enumeration process before the namespace EC is enumerated. 2. Initialize REGC to One by default, this even allows EC accesses to happen during the table loading. Linux is now broken around ECDT support during the long term bug fixing work because it has merged many wrong ECDT bug fixes (see details below). Linux currently uses namespace EC's settings instead of ECDT settings when ECDT is detected. This apparently will result in namespace walk and _CRS/_GPE/_REG evaluations. Such stuffs could only happen after namespace is ready, while ECDT is purposely to be used before namespace is ready. The wrong bug fixing story is: 1. Link 1: At Linux ACPI early stages, "no _Lxx/_Exx/_Qxx evaluation can happen before the namespace is ready" are not ensured by ACPICA core and Linux. This is currently ensured by deferred enabling of GPE and defered registering of EC query methods (acpi_ec_register_query_methods). 2. Link 2: Reporters reported buggy ECDTs, expecting quirks for the platform. Originally, the quirk is simple, only doing things with ECDT. Bug 9399 and 12461 are platforms (Asus L4R, Asus M6R, MSI MS-171F) reported to have wrong ECDT IO port addresses, the port addresses are reversed. Bug 11880 is a platform (Asus X50GL) reported to have 0 valued port addresses, we can see that all EC accesses are protected by ECAV on this platform, so actually no early EC accesses is required by this platform. 3. Link 3: But when the bug fixing developer was requested to provide a handy and non-quirk bug fix, he tried to use correct EC settings from namespace and broke the spec purpose. We can even see that the developer was suffered from many regrssions. One interesting one is 14086, where the actual root cause obviously should be: _REG is evaluated too early. But unfortunately, the bug is fixed in a totally wrong way. So everything goes wrong from these commits: Commit: c6cb0e878446c79f42e7833d7bb69ed6bfbb381f Subject: ACPI: EC: Don't trust ECDT tables from ASUS Commit: a5032bfdd9c80e0231a6324661e123818eb46ecd Subject: ACPI: EC: Always parse EC device This patch reverts Linux behavior to simple ECDT quirk support in order to stop early _CRS/_GPE/_REG evaluations. For Bug 9399, 12461, since it is reported that the platforms require early EC accesses, this patch restores the simple ECDT quirks for them. For Bug 11880, since it is not reported that the platform requires early EC accesses and its ACPI tables contain correct ECAV, we choose an ECDT enumeration failure for this platform. Link 1: https://bugzilla.kernel.org/show_bug.cgi?id=9916 http://bugzilla.kernel.org/show_bug.cgi?id=10100 https://lkml.org/lkml/2008/2/25/282 Link 2: https://bugzilla.kernel.org/show_bug.cgi?id=9399 https://bugzilla.kernel.org/show_bug.cgi?id=12461 https://bugzilla.kernel.org/show_bug.cgi?id=11880 Link 3: https://bugzilla.kernel.org/show_bug.cgi?id=11884 https://bugzilla.kernel.org/show_bug.cgi?id=14081 https://bugzilla.kernel.org/show_bug.cgi?id=14086 https://bugzilla.kernel.org/show_bug.cgi?id=14446 Link 4: https://bugzilla.kernel.org/show_bug.cgi?id=112911 Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Chris Bainbridge <chris.bainbridge@gmail.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-24 10:42:53 +08:00
return ret;
}
#ifdef CONFIG_PM_SLEEP
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
static int acpi_ec_suspend(struct device *dev)
{
struct acpi_ec *ec =
acpi_driver_data(to_acpi_device(dev));
ACPI / sleep: EC-based wakeup from suspend-to-idle on recent systems Some recent Dell laptops, including the XPS13 model numbers 9360 and 9365, cannot be woken up from suspend-to-idle by pressing the power button which is unexpected and makes that feature less usable on those systems. Moreover, on the 9365 ACPI S3 (suspend-to-RAM) is not expected to be used at all (the OS these systems ship with never exercises the ACPI S3 path in the firmware) and suspend-to-idle is the only viable system suspend mechanism there. The reason why the power button wakeup from suspend-to-idle doesn't work on those systems is because their power button events are signaled by the EC (Embedded Controller), whose GPE (General Purpose Event) line is disabled during suspend-to-idle transitions in Linux. That is done on purpose, because in general the EC tends to be noisy for various reasons (battery and thermal updates and similar, for example) and all events signaled by it would kick the CPUs out of deep idle states while in suspend-to-idle, which effectively might defeat its purpose. Of course, on the Dell systems in question the EC GPE must be enabled during suspend-to-idle transitions for the button press events to be signaled while suspended at all, but fortunately there is a way out of this puzzle. First of all, those systems have the ACPI_FADT_LOW_POWER_S0 flag set in their ACPI tables, which means that the OS is expected to prefer the "low power S0 idle" system state over ACPI S3 on them. That causes the most recent versions of other OSes to simply ignore ACPI S3 on those systems, so it is reasonable to expect that it should not be necessary to block GPEs during suspend-to-idle on them. Second, in addition to that, the systems in question provide a special firmware interface that can be used to indicate to the platform that the OS is transitioning into a system-wide low-power state in which certain types of activity are not desirable or that it is leaving such a state and that (in principle) should allow the platform to adjust its operation mode accordingly. That interface is a special _DSM object under a System Power Management Controller device (PNP0D80). The expected way to use it is to invoke function 0 from it on system initialization, functions 3 and 5 during suspend transitions and functions 4 and 6 during resume transitions (to reverse the actions carried out by the former). In particular, function 5 from the "Low-Power S0" device _DSM is expected to cause the platform to put itself into a low-power operation mode which should include making the EC less verbose (so to speak). Next, on resume, function 6 switches the platform back to the "working-state" operation mode. In accordance with the above, modify the ACPI suspend-to-idle code to look for the "Low-Power S0" _DSM interface on platforms with the ACPI_FADT_LOW_POWER_S0 flag set in the ACPI tables. If it's there, use it during suspend-to-idle transitions as prescribed and avoid changing the GPE configuration in that case. [That should reflect what the most recent versions of other OSes do.] Also modify the ACPI EC driver to make it handle events during suspend-to-idle in the usual way if the "Low-Power S0" _DSM interface is going to be used to make the power button events work while suspended on the Dell machines mentioned above Link: http://www.uefi.org/sites/default/files/resources/Intel_ACPI_Low_Power_S0_Idle.pdf Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-06-23 21:24:32 +08:00
if (acpi_sleep_no_ec_events() && ec_freeze_events)
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
acpi_ec_disable_event(ec);
return 0;
}
static int acpi_ec_suspend_noirq(struct device *dev)
{
struct acpi_ec *ec = acpi_driver_data(to_acpi_device(dev));
/*
* The SCI handler doesn't run at this point, so the GPE can be
* masked at the low level without side effects.
*/
if (ec_no_wakeup && test_bit(EC_FLAGS_STARTED, &ec->flags) &&
ec->reference_count >= 1)
acpi_set_gpe(NULL, ec->gpe, ACPI_GPE_DISABLE);
return 0;
}
static int acpi_ec_resume_noirq(struct device *dev)
{
struct acpi_ec *ec = acpi_driver_data(to_acpi_device(dev));
if (ec_no_wakeup && test_bit(EC_FLAGS_STARTED, &ec->flags) &&
ec->reference_count >= 1)
acpi_set_gpe(NULL, ec->gpe, ACPI_GPE_ENABLE);
return 0;
}
ACPI / EC: Add PM operations to improve event handling for resume process This patch makes 2 changes: 1. Restore old behavior Originally, EC driver stops handling both events and transactions in acpi_ec_block_transactions(), and restarts to handle transactions in acpi_ec_unblock_transactions_early(), restarts to handle both events and transactions in acpi_ec_unblock_transactions(). While currently, EC driver still stops handling both events and transactions in acpi_ec_block_transactions(), but restarts to handle both events and transactions in acpi_ec_unblock_transactions_early(). This patch tries to restore the old behavior by dropping __acpi_ec_enable_event() from acpi_unblock_transactions_early(). 2. Improve old behavior However this still cannot fix the real issue as both of the acpi_ec_unblock_xxx() functions are invoked in the noirq stage. Since the EC driver actually doesn't implement the event handling in the polling mode, re-enabling the event handling too early in the noirq stage could result in the problem that if there is no triggering source causing advance_transaction() to be invoked, pending SCI_EVT cannot be detected by the EC driver and _Qxx cannot be triggered. It actually makes sense to restart the event handling in any point during resuming after the noirq stage. Just like the boot stage where the event handling is enabled in .add(), this patch further moves acpi_ec_enable_event() to .resume(). After doing that, the following 2 functions can be combined: acpi_ec_unblock_transactions_early()/acpi_ec_unblock_transactions(). The differences of the event handling availability between the old behavior (this patch isn't applied) and the new behavior (this patch is applied) are as follows: !Applied Applied before suspend Y Y suspend before EC Y Y suspend after EC Y Y suspend_late Y Y suspend_noirq Y (actually N) Y (actually N) resume_noirq Y (actually N) Y (actually N) resume_late Y (actually N) Y (actually N) resume before EC Y (actually N) Y (actually N) resume after EC Y (actually N) Y after resume Y (actually N) Y Where "actually N" means if there is no triggering source, the EC driver is actually not able to notice the pending SCI_EVT occurred in the noirq stage. So we can clearly see that this patch has improved the situation. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:36 +08:00
static int acpi_ec_resume(struct device *dev)
{
struct acpi_ec *ec =
acpi_driver_data(to_acpi_device(dev));
acpi_ec_enable_event(ec);
return 0;
}
#endif
static const struct dev_pm_ops acpi_ec_pm = {
SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(acpi_ec_suspend_noirq, acpi_ec_resume_noirq)
ACPI / EC: Add PM operations to improve event handling for suspend process In the original EC driver, though the event handling is not explicitly stopped, the EC driver is actually not able to handle events during the noirq stage as the EC driver is not prepared to handle the EC events in the polling mode. So if there is no advance_transaction() triggered, the EC driver couldn't notice the EC events. However, do we actually need to handle EC events during suspend/resume stage? EC events are mostly useless for the suspend/resume period (key strokes and battery/thermal updates, etc.,), and the useful ones (lid close, power/sleep button press) should have already been delivered to the OSPM to trigger the power saving operations. Thus this patch implements acpi_ec_disable_event() to be a reverse call of acpi_ec_enable_event(), with which, the EC driver is able to stop handling the EC events in a position before entering the noirq stage. Since there are actually 2 choices for us: 1. implement event handling in polling mode; 2. stop event handling before entering noirq stage. And this patch only implements the second choice using .suspend() callback. Thus this is experimental (first choice is better? or different hook position is better?). This patch finally keeps the old behavior by default and prepares a boot parameter to enable this feature. The differences of the event handling availability between the old behavior (this patch is not applied) and the new behavior (this patch is applied) are as follows: !FreezeEvents FreezeEvents before suspend Y Y suspend before EC Y Y suspend after EC Y N suspend_late Y N suspend_noirq Y (actually N) N resume_noirq Y (actually N) N resume_late Y (actually N) N resume before EC Y (actually N) N resume after EC Y Y after resume Y Y Where "actually N" means if there is no EC transactions, the EC driver is actually not able to notice the pending events. We can see that FreezeEvents is the only approach now can actually flush the EC event handling with both query commands and _Qxx evaluations flushed, other modes can only flush the EC event handling with only query commands flushed, _Qxx evaluations occurred after stopping the EC driver may end up failure due to the failure of the EC transaction carried out in the _Qxx control methods. We also can see that this feature should be able to trigger some platform notifications later than resuming other drivers. Signed-off-by: Lv Zheng <lv.zheng@intel.com> Tested-by: Todd E Brandt <todd.e.brandt@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-08-03 16:01:43 +08:00
SET_SYSTEM_SLEEP_PM_OPS(acpi_ec_suspend, acpi_ec_resume)
};
ACPI / EC: Add event clearing variation support. We've been suffering from the uncertainty of the SCI_EVT clearing timing. This patch implements 3 of 4 possible modes to handle SCI_EVT clearing variations. The old behavior is kept in this patch. Status: QR_EC is re-checked as early as possible after checking previous SCI_EVT. This always leads to 2 QR_EC transactions per SCI_EVT indication and the target may implement event queue which returns 0x00 indicating "no outstanding event". This is proven to be a conflict against Windows behavior, but is still kept in this patch to make the EC driver robust to the possible regressions that may occur on Samsung platforms. Query: QR_EC is re-checked after the target has handled the QR_EC query request command pushed by the host. Event: QR_EC is re-checked after the target has noticed the query event response data pulled by the host. This timing is not determined by any IRQs, so we may need to use a guard period in this mode, which may explain the existence of the ec_guard() code used by the old EC driver where the re-check timing is implemented in the similar way as this mode. Method: QR_EC is re-checked as late as possible after completing the _Qxx evaluation. The target may implement SCI_EVT like a level triggered interrupt. It is proven on kernel bugzilla 94411 that, Windows will have all _Qxx evaluations parallelized. Thus unless required by further evidences, we needn't implement this mode as it is a conflict of the _Qxx parallelism requirement. Note that, according to the reports, there are platforms that cannot be handled using the "Status" mode without enabling the EC_FLAGS_QUERY_HANDSHAKE quirk. But they can be handled with the other modes according to the tests (kernel bugzilla 97381). The following log entry can be used to confirm the differences of the 3 modes as it should appear at the different positions for the 3 modes: Command(QR_EC) unblocked Status: appearing after EC_SC(W) = 0x84 Query: appearing after EC_DATA(R) = 0xXX where XX is the event number used to determine _QXX Event: appearing after first EC_SC(R) = 0xX0 SCI_EVT=x BURST=0 CMD=0 IBF=0 OBF=0 that is next to the following log entry: Command(QR_EC) completed by hardware Link: https://bugzilla.kernel.org/show_bug.cgi?id=94411 Link: https://bugzilla.kernel.org/show_bug.cgi?id=97381 Link: https://bugzilla.kernel.org/show_bug.cgi?id=98111 Reported-and-tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com> Reported-and-tested-by: Tigran Gabrielyan <tigrangab@gmail.com> Reported-and-tested-by: Adrien D <ghbdtn@openmailbox.org> Signed-off-by: Lv Zheng <lv.zheng@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-11 13:21:38 +08:00
static int param_set_event_clearing(const char *val, struct kernel_param *kp)
{
int result = 0;
if (!strncmp(val, "status", sizeof("status") - 1)) {
ec_event_clearing = ACPI_EC_EVT_TIMING_STATUS;
pr_info("Assuming SCI_EVT clearing on EC_SC accesses\n");
} else if (!strncmp(val, "query", sizeof("query") - 1)) {
ec_event_clearing = ACPI_EC_EVT_TIMING_QUERY;
pr_info("Assuming SCI_EVT clearing on QR_EC writes\n");
} else if (!strncmp(val, "event", sizeof("event") - 1)) {
ec_event_clearing = ACPI_EC_EVT_TIMING_EVENT;
pr_info("Assuming SCI_EVT clearing on event reads\n");
} else
result = -EINVAL;
return result;
}
static int param_get_event_clearing(char *buffer, struct kernel_param *kp)
{
switch (ec_event_clearing) {
case ACPI_EC_EVT_TIMING_STATUS:
return sprintf(buffer, "status");
case ACPI_EC_EVT_TIMING_QUERY:
return sprintf(buffer, "query");
case ACPI_EC_EVT_TIMING_EVENT:
return sprintf(buffer, "event");
default:
return sprintf(buffer, "invalid");
}
return 0;
}
module_param_call(ec_event_clearing, param_set_event_clearing, param_get_event_clearing,
NULL, 0644);
MODULE_PARM_DESC(ec_event_clearing, "Assumed SCI_EVT clearing timing");
static struct acpi_driver acpi_ec_driver = {
.name = "ec",
.class = ACPI_EC_CLASS,
.ids = ec_device_ids,
.ops = {
.add = acpi_ec_add,
.remove = acpi_ec_remove,
},
.drv.pm = &acpi_ec_pm,
};
static inline int acpi_ec_query_init(void)
{
if (!ec_query_wq) {
ec_query_wq = alloc_workqueue("kec_query", 0,
ec_max_queries);
if (!ec_query_wq)
return -ENODEV;
}
return 0;
}
static inline void acpi_ec_query_exit(void)
{
if (ec_query_wq) {
destroy_workqueue(ec_query_wq);
ec_query_wq = NULL;
}
}
int __init acpi_ec_init(void)
{
int result;
int ecdt_fail, dsdt_fail;
/* register workqueue for _Qxx evaluations */
result = acpi_ec_query_init();
if (result)
return result;
/* Drivers must be started after acpi_ec_query_init() */
ecdt_fail = acpi_ec_ecdt_start();
dsdt_fail = acpi_bus_register_driver(&acpi_ec_driver);
return ecdt_fail && dsdt_fail ? -ENODEV : 0;
}
/* EC driver currently not unloadable */
#if 0
static void __exit acpi_ec_exit(void)
{
acpi_bus_unregister_driver(&acpi_ec_driver);
acpi_ec_query_exit();
}
#endif /* 0 */