From 780b1350d316fda28d85fcae17854c778d89cbbe Mon Sep 17 00:00:00 2001 From: Ramesh Shanmugasundaram Date: Mon, 3 Jul 2017 12:04:21 +0100 Subject: [PATCH 0001/1085] regmap: Avoid namespace collision within macro & tidy up Renamed variable "timeout" to "__timeout" & "pollret" to "__ret" to avoid namespace collision. Tidy up macro arguments with parentheses. Signed-off-by: Ramesh Shanmugasundaram Signed-off-by: Mark Brown --- include/linux/regmap.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/regmap.h b/include/linux/regmap.h index e88649225a60..6e1df5e721a9 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -120,23 +120,24 @@ struct reg_sequence { */ #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ - int pollret; \ + ktime_t __timeout = ktime_add_us(ktime_get(), timeout_us); \ + int __ret; \ might_sleep_if(sleep_us); \ for (;;) { \ - pollret = regmap_read((map), (addr), &(val)); \ - if (pollret) \ + __ret = regmap_read((map), (addr), &(val)); \ + if (__ret) \ break; \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ - pollret = regmap_read((map), (addr), &(val)); \ + if ((timeout_us) && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ + __ret = regmap_read((map), (addr), &(val)); \ break; \ } \ if (sleep_us) \ - usleep_range((sleep_us >> 2) + 1, sleep_us); \ + usleep_range(((sleep_us) >> 2) + 1, sleep_us); \ } \ - pollret ?: ((cond) ? 0 : -ETIMEDOUT); \ + __ret ?: ((cond) ? 0 : -ETIMEDOUT); \ }) #ifdef CONFIG_REGMAP From 9c2e5cb38da2202c6591efec631562116761521f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 11 Aug 2017 08:57:36 +0200 Subject: [PATCH 0002/1085] regmap: constify regmap_bus structures These regmap_bus structures are only passed as the second argument to __devm_regmap_init or __regmap_init, both of which are const, so the regmap_bus structures can be const too. Done with the help of Coccinelle. 
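[Editor's illustration, not part of the patch: the constification works because the registration path only takes the bus descriptor through a pointer to const. A sketch of the relevant prototype, assuming the include/linux/regmap.h signature of this era:

	struct regmap *__regmap_init(struct device *dev,
				     const struct regmap_bus *bus,
				     void *bus_context,
				     const struct regmap_config *config,
				     struct lock_class_key *lock_key,
				     const char *lock_name);

Since the regmap core only reads through this pointer, the static descriptors can be declared const and placed in read-only memory.]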
Signed-off-by: Julia Lawall Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-spi.c | 2 +- drivers/base/regmap/regmap-spmi.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/regmap/regmap-spi.c b/drivers/base/regmap/regmap-spi.c index edd9a839d004..c7150dd264d5 100644 --- a/drivers/base/regmap/regmap-spi.c +++ b/drivers/base/regmap/regmap-spi.c @@ -102,7 +102,7 @@ static int regmap_spi_read(void *context, return spi_write_then_read(spi, reg, reg_size, val, val_size); } -static struct regmap_bus regmap_spi = { +static const struct regmap_bus regmap_spi = { .write = regmap_spi_write, .gather_write = regmap_spi_gather_write, .async_write = regmap_spi_async_write, diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c index 4a36e415e938..0bfb8ed244d5 100644 --- a/drivers/base/regmap/regmap-spmi.c +++ b/drivers/base/regmap/regmap-spmi.c @@ -83,7 +83,7 @@ static int regmap_spmi_base_write(void *context, const void *data, count - 1); } -static struct regmap_bus regmap_spmi_base = { +static const struct regmap_bus regmap_spmi_base = { .read = regmap_spmi_base_read, .write = regmap_spmi_base_write, .gather_write = regmap_spmi_base_gather_write, @@ -203,7 +203,7 @@ static int regmap_spmi_ext_write(void *context, const void *data, count - 2); } -static struct regmap_bus regmap_spmi_ext = { +static const struct regmap_bus regmap_spmi_ext = { .read = regmap_spmi_ext_read, .write = regmap_spmi_ext_write, .gather_write = regmap_spmi_ext_gather_write, From 27eae9d4b9d4cd9c204ef81f46078d91362ed41c Mon Sep 17 00:00:00 2001 From: Ravikumar Kattekola Date: Thu, 31 Aug 2017 15:48:45 +0530 Subject: [PATCH 0003/1085] regulator: pbias: Select voltage table based on max-voltage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reference manuals of OMAP5x and DRA7x have been updated to reflect the PBIAS regulator max-voltage as 3.3V instead of 3.0V, while OMAP3x and OMAP4x are still quoting 3.0V. So, as of now, the pbias driver needs to support both 3.0V and 3.3V IO voltage based on the max-voltage supported by the PBIAS regulator. 
Document reference: SWPU249AF - OMAP543x Technical reference manual - August 2016 SPRUI30C - DRA75x, DRA74x Technical reference manual - November 2016 Tested on: DRA75x PG 2.0 REV H EVM Signed-off-by: Ravikumar Kattekola Signed-off-by: Sekhar Nori Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Mark Brown --- drivers/regulator/pbias-regulator.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/pbias-regulator.c b/drivers/regulator/pbias-regulator.c index 0cb76ba29e84..8f782d22fdbe 100644 --- a/drivers/regulator/pbias-regulator.c +++ b/drivers/regulator/pbias-regulator.c @@ -34,6 +34,8 @@ struct pbias_reg_info { u32 vmode; unsigned int enable_time; char *name; + const unsigned int *pbias_volt_table; + int n_voltages; }; struct pbias_regulator_data { @@ -49,11 +51,16 @@ struct pbias_of_data { unsigned int offset; }; -static const unsigned int pbias_volt_table[] = { +static const unsigned int pbias_volt_table_3_0V[] = { 1800000, 3000000 }; +static const unsigned int pbias_volt_table_3_3V[] = { + 1800000, + 3300000 +}; + static const struct regulator_ops pbias_regulator_voltage_ops = { .list_voltage = regulator_list_voltage_table, .get_voltage_sel = regulator_get_voltage_sel_regmap, @@ -69,6 +76,8 @@ static const struct pbias_reg_info pbias_mmc_omap2430 = { .vmode = BIT(0), .disable_val = 0, .enable_time = 100, + .pbias_volt_table = pbias_volt_table_3_0V, + .n_voltages = 2, .name = "pbias_mmc_omap2430" }; @@ -77,6 +86,8 @@ static const struct pbias_reg_info pbias_sim_omap3 = { .enable_mask = BIT(9), .vmode = BIT(8), .enable_time = 100, + .pbias_volt_table = pbias_volt_table_3_0V, + .n_voltages = 2, .name = "pbias_sim_omap3" }; @@ -86,6 +97,8 @@ static const struct pbias_reg_info pbias_mmc_omap4 = { .disable_val = BIT(25), .vmode = BIT(21), .enable_time = 100, + .pbias_volt_table = pbias_volt_table_3_0V, + .n_voltages = 2, .name = "pbias_mmc_omap4" }; @@ -95,6 +108,8 @@ static const struct pbias_reg_info pbias_mmc_omap5 = { .disable_val = BIT(25), .vmode = BIT(21), .enable_time = 100, + .pbias_volt_table = pbias_volt_table_3_3V, + .n_voltages = 2, .name = "pbias_mmc_omap5" }; @@ -199,8 +214,8 @@ static int pbias_regulator_probe(struct platform_device *pdev) drvdata[data_idx].desc.owner = THIS_MODULE; drvdata[data_idx].desc.type = REGULATOR_VOLTAGE; drvdata[data_idx].desc.ops = &pbias_regulator_voltage_ops; - drvdata[data_idx].desc.volt_table = pbias_volt_table; - drvdata[data_idx].desc.n_voltages = 2; + drvdata[data_idx].desc.volt_table = info->pbias_volt_table; + drvdata[data_idx].desc.n_voltages = info->n_voltages; drvdata[data_idx].desc.enable_time = info->enable_time; drvdata[data_idx].desc.vsel_reg = offset; drvdata[data_idx].desc.vsel_mask = info->vmode; From 5b65c4677a57a1d4414212f9995aa0e46a21ff80 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 9 Sep 2017 00:56:03 +0300 Subject: [PATCH 0004/1085] mm, x86/mm: Fix performance regression in get_user_pages_fast() The 0-day test bot found a performance regression that was tracked down to switching x86 to the generic get_user_pages_fast() implementation: http://lkml.kernel.org/r/20170710024020.GA26389@yexl-desktop The regression was caused by the fact that we now use local_irq_save() + local_irq_restore() in get_user_pages_fast() to disable interrupts. In the x86 implementation local_irq_disable() + local_irq_enable() was used.
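[Editor's illustration, not part of the commit: the cost difference is the flags round-trip. A minimal sketch of the two patterns, with the x86 instructions they roughly expand to noted as comments:

	unsigned long flags;

	/* Generic code: must preserve the caller's interrupt state. */
	local_irq_save(flags);		/* read flags (pushf; pop), then cli */
	/* ... walk the page tables ... */
	local_irq_restore(flags);	/* write flags back (push; popf) */

	/* Old x86 code: interrupts are known to be enabled here. */
	local_irq_disable();		/* plain cli */
	/* ... walk the page tables ... */
	local_irq_enable();		/* plain sti */

The save/restore pair is what __get_user_pages_fast() still needs, since it may be called with interrupts already disabled.]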
The fix is to make get_user_pages_fast() use local_irq_disable(), leaving local_irq_save() for __get_user_pages_fast() that can be called with interrupts disabled. Numbers for pinning a gigabyte of memory, one page at a time, 20 repeats: Before: Average: 14.91 ms, stddev: 0.45 ms After: Average: 10.76 ms, stddev: 0.18 ms Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Huang Ying Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thorsten Leemhuis Cc: linux-mm@kvack.org Fixes: e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation") Link: http://lkml.kernel.org/r/20170908215603.9189-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 97 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 23f01c40c88f..4a789f1c6a27 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1618,6 +1618,47 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static void gup_pgd_range(unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pgd_t *pgdp; + + pgdp = pgd_offset(current->mm, addr); + do { + pgd_t pgd = READ_ONCE(*pgdp); + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + return; + if (unlikely(pgd_huge(pgd))) { + if (!gup_huge_pgd(pgd, pgdp, addr, next, write, + pages, nr)) + return; + } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { + if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, + PGDIR_SHIFT, next, write, pages, nr)) + return; + } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr)) + return; + } while (pgdp++, addr = next, addr != end); +} + +#ifndef gup_fast_permitted +/* + * Check if it's allowed to use __get_user_pages_fast() for the range, or + * we need to fall back to the slow version: + */ +bool gup_fast_permitted(unsigned long start, int nr_pages, int write) +{ + unsigned long len, end; + + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + return end >= start; +} +#endif + /* * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. It will only return non-negative values. @@ -1625,10 +1666,8 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - struct mm_struct *mm = current->mm; unsigned long addr, len, end; - unsigned long next, flags; - pgd_t *pgdp; + unsigned long flags; int nr = 0; start &= PAGE_MASK; @@ -1652,45 +1691,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * block IPIs that come from THPs splitting.
*/ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = READ_ONCE(*pgdp); - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (unlikely(pgd_huge(pgd))) { - if (!gup_huge_pgd(pgd, pgdp, addr, next, write, - pages, &nr)) - break; - } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { - if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, - PGDIR_SHIFT, next, write, pages, &nr)) - break; - } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); + if (gup_fast_permitted(start, nr_pages, write)) { + local_irq_save(flags); + gup_pgd_range(addr, end, write, pages, &nr); + local_irq_restore(flags); + } return nr; } -#ifndef gup_fast_permitted -/* - * Check if it's allowed to use __get_user_pages_fast() for the range, or - * we need to fall back to the slow version: - */ -bool gup_fast_permitted(unsigned long start, int nr_pages, int write) -{ - unsigned long len, end; - - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - return end >= start; -} -#endif - /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -1710,12 +1719,22 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { + unsigned long addr, len, end; int nr = 0, ret = 0; start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + (void __user *)start, len))) + return 0; if (gup_fast_permitted(start, nr_pages, write)) { - nr = __get_user_pages_fast(start, nr_pages, write, pages); + local_irq_disable(); + gup_pgd_range(addr, end, write, pages, &nr); + local_irq_enable(); ret = nr; } From 0f59d7a352c11712de0f226b46cb82775b4fcece Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 1 Sep 2017 10:49:12 -0700 Subject: [PATCH 0005/1085] perf sched timehist: Add pid and tid options Add options to only show events for specific pid(s) and tid(s). Signed-off-by: David Ahern Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1504288152-19690-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 8 ++++++++ tools/perf/builtin-sched.c | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index a092a2499e8f..55b67338548e 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -106,6 +106,14 @@ OPTIONS for 'perf sched timehist' --max-stack:: Maximum number of functions to display in backtrace, default 5. +-p=:: +--pid=:: + Only show events for given process ID (comma separated list). + +-t=:: +--tid=:: + Only show events for given thread ID (comma separated list).
+ -s:: --summary:: Show only a summary of scheduling by thread with min, max, and average diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 322b4def8411..b7e8812ee80c 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3363,6 +3363,10 @@ int cmd_sched(int argc, const char **argv) OPT_STRING(0, "time", &sched.time_str, "str", "Time span for analysis (start,stop)"), OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"), + OPT_STRING('p', "pid", &symbol_conf.pid_list_str, "pid[,pid...]", + "analyze events only for given process id(s)"), + OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]", + "analyze events only for given thread id(s)"), OPT_PARENT(sched_options) }; From 5a5dfe4b8548d806bf433090995ee0ee4c139f11 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:26 -0700 Subject: [PATCH 0006/1085] perf tools: Support weak groups in 'perf stat' Setting up groups can be complicated due to the complex scheduling restrictions of different PMUs. User tools usually don't understand all these restrictions. Still, in many cases it is useful to set up groups, and they work most of the time. However, if the group is set up wrong, some members will not report any value because they never get scheduled. Add a concept of a 'weak group': try to set up a group, but if it's not schedulable, fall back to not using a group. That gives us the best of both worlds: groups if they work, but still a usable fallback if they don't. In theory it would be possible to have more complex fallback strategies (e.g. try to split the group in half), but the simple fallback of not using a group seems to work for now. So far the weak group is only implemented for perf stat, not for record. Here's an unschedulable group (on IvyBridge with SMT on): % perf stat -e '{branches,branch-misses,l1d.replacement,l2_lines_in.all,l2_rqsts.all_code_rd}' -a sleep 1 73,806,067 branches 4,848,144 branch-misses # 6.57% of all branches 14,754,458 l1d.replacement 24,905,558 l2_lines_in.all l2_rqsts.all_code_rd <------- will never report anything With the weak group: % perf stat -e '{branches,branch-misses,l1d.replacement,l2_lines_in.all,l2_rqsts.all_code_rd}:W' -a sleep 1 125,366,055 branches (80.02%) 9,208,402 branch-misses # 7.35% of all branches (80.01%) 24,560,249 l1d.replacement (80.00%) 43,174,971 l2_lines_in.all (80.05%) 31,891,457 l2_rqsts.all_code_rd (79.92%) The extra event is now scheduled, with some extra multiplexing. v2: Move fallback code to separate function. Add comment on for_each_group_member Adjust to new perf_evsel__close interface v3: Fix debug print out.
Committer testing: Before: # perf stat -e '{branches,branch-misses,l1d.replacement,l2_lines_in.all,l2_rqsts.all_code_rd}' -a sleep 1 Performance counter stats for 'system wide': branches branch-misses l1d.replacement l2_lines_in.all l2_rqsts.all_code_rd 1.002147212 seconds time elapsed # perf stat -e '{branches,l1d.replacement,l2_lines_in.all,l2_rqsts.all_code_rd}' -a sleep 1 Performance counter stats for 'system wide': 83,207,892 branches 11,065,444 l1d.replacement 28,484,024 l2_lines_in.all 12,186,179 l2_rqsts.all_code_rd 1.001739493 seconds time elapsed After: # perf stat -e '{branches,branch-misses,l1d.replacement,l2_lines_in.all,l2_rqsts.all_code_rd}':W -a sleep 1 Performance counter stats for 'system wide': 543,323,909 branches (80.01%) 27,100,512 branch-misses # 4.99% of all branches (80.02%) 50,402,905 l1d.replacement (80.03%) 67,385,892 l2_lines_in.all (80.01%) 21,352,885 l2_rqsts.all_code_rd (79.94%) 1.001086658 seconds time elapsed # Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20170831194036.30146-2-andi@firstfloor.org [ Add a "'perf stat' only, for now" comment in the man page, suggested by Jiri ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 2 ++ tools/perf/builtin-stat.c | 35 ++++++++++++++++++++++++++ tools/perf/util/evsel.h | 1 + tools/perf/util/parse-events.c | 8 +++++- tools/perf/util/parse-events.l | 2 +- 5 files changed, 46 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index f709de54707b..75fc17f47298 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -47,6 +47,8 @@ counted. The following modifiers exist: P - use maximum detected precise level S - read sample value (PERF_SAMPLE_READ) D - pin the event to the PMU + W - group is weak and will fallback to non-group if not schedulable, + only supported in 'perf stat' for now. The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier can be specified multiple times: diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 69523ed55894..7cc61eb0d83b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -582,6 +582,32 @@ static bool perf_evsel__should_store_id(struct perf_evsel *counter) return STAT_RECORD || counter->attr.read_format & PERF_FORMAT_ID; } +static struct perf_evsel *perf_evsel__reset_weak_group(struct perf_evsel *evsel) +{ + struct perf_evsel *c2, *leader; + bool is_open = true; + + leader = evsel->leader; + pr_debug("Weak group for %s/%d failed\n", + leader->name, leader->nr_members); + + /* + * for_each_group_member doesn't work here because it doesn't + * include the first entry. + */ + evlist__for_each_entry(evsel_list, c2) { + if (c2 == evsel) + is_open = false; + if (c2->leader == leader) { + if (is_open) + perf_evsel__close(c2); + c2->leader = c2; + c2->nr_members = 0; + } + } + return leader; +} + static int __run_perf_stat(int argc, const char **argv) { int interval = stat_config.interval; @@ -618,6 +644,15 @@ static int __run_perf_stat(int argc, const char **argv) evlist__for_each_entry(evsel_list, counter) { try_again: if (create_perf_stat_counter(counter) < 0) { + + /* Weak group failed. Reset the group. 
*/ + if (errno == EINVAL && + counter->leader != counter && + counter->weak_group) { + counter = perf_evsel__reset_weak_group(counter); + goto try_again; + } + /* * PPC returns ENXIO for HW counters until 2.6.37 * (behavior changed with commit b0a873e). diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index dd2c4b5112a5..db658785d828 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -137,6 +137,7 @@ struct perf_evsel { const char * metric_name; struct perf_evsel **metric_events; bool collect_stat; + bool weak_group; }; union u64_swap { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f6257fb4f08c..57d7acf890e0 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1366,6 +1366,7 @@ struct event_modifier { int exclude_GH; int sample_read; int pinned; + int weak; }; static int get_event_modifier(struct event_modifier *mod, char *str, @@ -1384,6 +1385,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, int exclude = eu | ek | eh; int exclude_GH = evsel ? evsel->exclude_GH : 0; + int weak = 0; memset(mod, 0, sizeof(*mod)); @@ -1421,6 +1423,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str, sample_read = 1; } else if (*str == 'D') { pinned = 1; + } else if (*str == 'W') { + weak = 1; } else break; @@ -1451,6 +1455,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, mod->exclude_GH = exclude_GH; mod->sample_read = sample_read; mod->pinned = pinned; + mod->weak = weak; return 0; } @@ -1464,7 +1469,7 @@ static int check_modifier(char *str) char *p = str; /* The sizeof includes 0 byte as well. */ - if (strlen(str) > (sizeof("ukhGHpppPSDI") - 1)) + if (strlen(str) > (sizeof("ukhGHpppPSDIW") - 1)) return -1; while (*p) { @@ -1504,6 +1509,7 @@ int parse_events__modifier_event(struct list_head *list, char *str, bool add) evsel->exclude_GH = mod.exclude_GH; evsel->sample_read = mod.sample_read; evsel->precise_max = mod.precise_max; + evsel->weak_group = mod.weak; if (perf_evsel__is_group_leader(evsel)) evsel->attr.pinned = mod.pinned; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index c42edeac451f..fdb5bb52f01f 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -161,7 +161,7 @@ name [a-zA-Z_*?][a-zA-Z0-9_*?.]* name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]* drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? /* If you add a modifier you need to update check_modifier() */ -modifier_event [ukhpPGHSDI]+ +modifier_event [ukhpPGHSDIW]+ modifier_bp [rwx]{1,3} %% From 3ba36d3620d08be31f5ee9ae20abb9bf3bdeb05a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:27 -0700 Subject: [PATCH 0007/1085] perf vendor events: Support metric_group and no event name in JSON parser Some enhancements to the JSON parser to prepare for metrics support - Parse the new MetricGroup field - Support JSON events with no event name, that have only MetricName. 
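[Editor's illustration: a hypothetical metric-only entry of the kind the parser must now accept; the field names follow the parser below, while the expression and event names are examples only:

{
    "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
    "MetricName": "IPC",
    "MetricGroup": "Pipeline",
    "BriefDescription": "Instructions per cycle"
},

With no EventName present, print_events_table_entry() is handed a NULL name, which is why the null checks are added below.]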
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-3-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.c | 24 ++++++++++++++++++------ tools/perf/pmu-events/jevents.h | 2 +- tools/perf/pmu-events/pmu-events.h | 1 + 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index d51dc9ca8861..9eb7047bafe4 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -292,7 +292,7 @@ static int print_events_table_entry(void *data, char *name, char *event, char *desc, char *long_desc, char *pmu, char *unit, char *perpkg, char *metric_expr, - char *metric_name) + char *metric_name, char *metric_group) { struct perf_entry_data *pd = data; FILE *outfp = pd->outfp; @@ -304,8 +304,10 @@ static int print_events_table_entry(void *data, char *name, char *event, */ fprintf(outfp, "{\n"); - fprintf(outfp, "\t.name = \"%s\",\n", name); - fprintf(outfp, "\t.event = \"%s\",\n", event); + if (name) + fprintf(outfp, "\t.name = \"%s\",\n", name); + if (event) + fprintf(outfp, "\t.event = \"%s\",\n", event); fprintf(outfp, "\t.desc = \"%s\",\n", desc); fprintf(outfp, "\t.topic = \"%s\",\n", topic); if (long_desc && long_desc[0]) @@ -320,6 +322,8 @@ static int print_events_table_entry(void *data, char *name, char *event, fprintf(outfp, "\t.metric_expr = \"%s\",\n", metric_expr); if (metric_name) fprintf(outfp, "\t.metric_name = \"%s\",\n", metric_name); + if (metric_group) + fprintf(outfp, "\t.metric_group = \"%s\",\n", metric_group); fprintf(outfp, "},\n"); return 0; @@ -357,6 +361,9 @@ static char *real_event(const char *name, char *event) { int i; + if (!name) + return NULL; + for (i = 0; fixed[i].name; i++) if (!strcasecmp(name, fixed[i].name)) return (char *)fixed[i].event; @@ -369,7 +376,7 @@ int json_events(const char *fn, char *long_desc, char *pmu, char *unit, char *perpkg, char *metric_expr, - char *metric_name), + char *metric_name, char *metric_group), void *data) { int err = -EIO; @@ -397,6 +404,7 @@ int json_events(const char *fn, char *unit = NULL; char *metric_expr = NULL; char *metric_name = NULL; + char *metric_group = NULL; unsigned long long eventcode = 0; struct msrmap *msr = NULL; jsmntok_t *msrval = NULL; @@ -476,6 +484,8 @@ int json_events(const char *fn, addfield(map, &perpkg, "", "", val); } else if (json_streq(map, field, "MetricName")) { addfield(map, &metric_name, "", "", val); + } else if (json_streq(map, field, "MetricGroup")) { + addfield(map, &metric_group, "", "", val); } else if (json_streq(map, field, "MetricExpr")) { addfield(map, &metric_expr, "", "", val); for (s = metric_expr; *s; s++) @@ -501,10 +511,11 @@ int json_events(const char *fn, addfield(map, &event, ",", filter, NULL); if (msr != NULL) addfield(map, &event, ",", msr->pname, msrval); - fixname(name); + if (name) + fixname(name); err = func(data, name, real_event(name, event), desc, long_desc, - pmu, unit, perpkg, metric_expr, metric_name); + pmu, unit, perpkg, metric_expr, metric_name, metric_group); free(event); free(desc); free(name); @@ -516,6 +527,7 @@ int json_events(const char *fn, free(unit); free(metric_expr); free(metric_name); + free(metric_group); if (err) break; tok += j; diff --git a/tools/perf/pmu-events/jevents.h b/tools/perf/pmu-events/jevents.h index 611fac01913d..557994754410 100644 --- a/tools/perf/pmu-events/jevents.h +++ b/tools/perf/pmu-events/jevents.h @@ -6,7 +6,7 @@ int json_events(const char *fn, char 
*long_desc, char *pmu, char *unit, char *perpkg, char *metric_expr, - char *metric_name), + char *metric_name, char *metric_group), void *data); char *get_cpu_str(void); diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index 569eab3688dd..94fa1720f6fd 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -15,6 +15,7 @@ struct pmu_event { const char *perpkg; const char *metric_expr; const char *metric_name; + const char *metric_group; }; /* From bba49af87393ebc8960bf8abdcbb9af53bf1aba1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:28 -0700 Subject: [PATCH 0008/1085] perf stat: Factor out generic metric printing The 'perf stat' shadow metric printing already supports generic metrics. Factor out the code doing that into a separate function that can be re-used in a later patch. No behavior changes. v2: Fix indentation Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-4-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 69 +++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index a04cf56d3517..96aa6cbf24d6 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -627,6 +627,46 @@ static void print_smi_cost(int cpu, struct perf_evsel *evsel, out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num); } +static void generic_metric(const char *metric_expr, + struct perf_evsel **metric_events, + char *name, + const char *metric_name, + double avg, + int cpu, + int ctx, + struct perf_stat_output_ctx *out) +{ + print_metric_t print_metric = out->print_metric; + struct parse_ctx pctx; + double ratio; + int i; + void *ctxp = out->ctx; + + expr__ctx_init(&pctx); + expr__add_id(&pctx, name, avg); + for (i = 0; metric_events[i]; i++) { + struct saved_value *v; + + v = saved_value_lookup(metric_events[i], cpu, ctx, false); + if (!v) + break; + expr__add_id(&pctx, metric_events[i]->name, avg_stats(&v->stats)); + } + if (!metric_events[i]) { + const char *p = metric_expr; + + if (expr__parse(&ratio, &pctx, &p) == 0) + print_metric(ctxp, NULL, "%8.1f", + metric_name ? + metric_name : + out->force_header ? name : "", + ratio); + else + print_metric(ctxp, NULL, NULL, "", 0); + } else + print_metric(ctxp, NULL, NULL, "", 0); +} + void perf_stat__print_shadow_stats(struct perf_evsel *evsel, double avg, int cpu, struct perf_stat_output_ctx *out) @@ -819,33 +859,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, else print_metric(ctxp, NULL, NULL, name, 0); } else if (evsel->metric_expr) { - struct parse_ctx pctx; - int i; - - expr__ctx_init(&pctx); - expr__add_id(&pctx, evsel->name, avg); - for (i = 0; evsel->metric_events[i]; i++) { - struct saved_value *v; - - v = saved_value_lookup(evsel->metric_events[i], cpu, ctx, false); - if (!v) - break; - expr__add_id(&pctx, evsel->metric_events[i]->name, - avg_stats(&v->stats)); - } - if (!evsel->metric_events[i]) { - const char *p = evsel->metric_expr; - - if (expr__parse(&ratio, &pctx, &p) == 0) - print_metric(ctxp, NULL, "%8.1f", - evsel->metric_name ? - evsel->metric_name : - out->force_header ? 
evsel->name : "", - ratio); - else - print_metric(ctxp, NULL, NULL, "", 0); - } else - print_metric(ctxp, NULL, NULL, "", 0); + generic_metric(evsel->metric_expr, evsel->metric_events, evsel->name, + evsel->metric_name, avg, cpu, ctx, out); } else if (runtime_nsecs_stats[cpu].n != 0) { char unit = 'M'; char unit_buf[10]; From 4ed962eb38c8a33b8b6ded911410afaefa1ca48c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:29 -0700 Subject: [PATCH 0009/1085] perf stat: Print generic metric header even for failed expressions Print the generic metric header even when the expression evaluation failed. Otherwise an expression that fails on the first collections due to division by zero may suddenly reappear later without an header. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-5-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 96aa6cbf24d6..8c7ab29169b9 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -662,7 +662,9 @@ static void generic_metric(const char *metric_expr, out->force_header ? name : "", ratio); else - print_metric(ctxp, NULL, NULL, "", 0); + print_metric(ctxp, NULL, NULL, + out->force_header ? + (metric_name ? metric_name : name) : "", 0); } else print_metric(ctxp, NULL, NULL, "", 0); } From d77ade9f4199c77c63e2ae382a8c8fbe0582ede2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:30 -0700 Subject: [PATCH 0010/1085] perf pmu: Extract function to get JSON alias map Extract the code to get the per cpu JSON alias into a separate function for reuse. No behavior changes. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-6-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 69 +++++++++++++++++++++++++++---------------- tools/perf/util/pmu.h | 2 ++ 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index ac16a9db1fb5..ed25d7f88731 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -516,6 +516,47 @@ char * __weak get_cpuid_str(void) return NULL; } +static char *perf_pmu__getcpuid(void) +{ + char *cpuid; + static bool printed; + + cpuid = getenv("PERF_CPUID"); + if (cpuid) + cpuid = strdup(cpuid); + if (!cpuid) + cpuid = get_cpuid_str(); + if (!cpuid) + return NULL; + + if (!printed) { + pr_debug("Using CPUID %s\n", cpuid); + printed = true; + } + return cpuid; +} + +struct pmu_events_map *perf_pmu__find_map(void) +{ + struct pmu_events_map *map; + char *cpuid = perf_pmu__getcpuid(); + int i; + + i = 0; + for (;;) { + map = &pmu_events_map[i++]; + if (!map->table) { + map = NULL; + break; + } + + if (!strcmp(map->cpuid, cpuid)) + break; + } + free(cpuid); + return map; +} + /* * From the pmu_events_map, find the table of PMU events that corresponds * to the current running CPU. 
Then, add all PMU events from that table @@ -526,32 +567,11 @@ static void pmu_add_cpu_aliases(struct list_head *head, const char *name) int i; struct pmu_events_map *map; struct pmu_event *pe; - char *cpuid; - static bool printed; - cpuid = getenv("PERF_CPUID"); - if (cpuid) - cpuid = strdup(cpuid); - if (!cpuid) - cpuid = get_cpuid_str(); - if (!cpuid) + map = perf_pmu__find_map(); + if (!map) return; - if (!printed) { - pr_debug("Using CPUID %s\n", cpuid); - printed = true; - } - - i = 0; - while (1) { - map = &pmu_events_map[i++]; - if (!map->table) - goto out; - - if (!strcmp(map->cpuid, cpuid)) - break; - } - /* * Found a matching PMU events table. Create aliases */ @@ -575,9 +595,6 @@ static void pmu_add_cpu_aliases(struct list_head *head, const char *name) (char *)pe->metric_expr, (char *)pe->metric_name); } - -out: - free(cpuid); } struct perf_event_attr * __weak diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 389e9729331f..060f6abba8ed 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -90,4 +90,6 @@ int perf_pmu__test(void); struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu); +struct pmu_events_map *perf_pmu__find_map(void); + #endif /* __PMU_H */ From b18f3e365019de1a5b26a851e123f0aedcce881f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:31 -0700 Subject: [PATCH 0011/1085] perf stat: Support JSON metrics in perf stat Add generic support for standalone metrics specified in JSON files to perf stat. A metric is a formula that uses multiple events to compute a higher level result (e.g. IPC). Previously metrics were always tied to an event and automatically enabled with that event. Now standalone metrics are supported as well. They are in the same JSON data structure as events, but don't have an event name. We also allow organizing the metrics into metric groups, which provide a shortcut to select several related metrics at once. Add a new -M / --metrics option to perf stat that adds the metrics or metric groups specified. Add the core code to manage and parse the metric groups. They are collected from the JSON data structures into a separate rblist. When computing shadow values, look for metrics in that list; they are then computed using the existing saved values infrastructure in stat-shadow.c. The actual JSON metrics are in a separate pull request.
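[Editor's note: internally, each selected metric expands to a weak group of its events, reusing the :W modifier introduced earlier in this series. For example, a metric over the events A, B and C is appended to the event list as if the user had written (hypothetical event names): {A,B,C}:W — see the "}:W" suffix emitted by metricgroup__add_metric() below.]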
% perf stat -M Summary --metric-only -a sleep 1 Performance counter stats for 'system wide': Instructions CLKS CPU_Utilization GFLOPs SMT_2T_Utilization Kernel_Utilization 317614222.0 1392930775.0 0.0 0.0 0.2 0.1 1.001497549 seconds time elapsed % perf stat -M GFLOPs flops Performance counter stats for 'flops': 3,999,541,471 fp_comp_ops_exe.sse_scalar_single # 1.2 GFLOPs (66.65%) 14 fp_comp_ops_exe.sse_scalar_double (66.65%) 0 fp_comp_ops_exe.sse_packed_double (66.67%) 0 fp_comp_ops_exe.sse_packed_single (66.70%) 0 simd_fp_256.packed_double (66.70%) 0 simd_fp_256.packed_single (66.67%) 0 duration_time 3.238372845 seconds time elapsed v2: Add missing header file v3: Move find_map to pmu.c Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-7-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 7 + tools/perf/builtin-stat.c | 18 +- tools/perf/util/Build | 1 + tools/perf/util/metricgroup.c | 313 +++++++++++++++++++++++++ tools/perf/util/metricgroup.h | 31 +++ tools/perf/util/pmu.c | 5 +- tools/perf/util/stat-shadow.c | 22 +- tools/perf/util/stat.h | 4 +- 8 files changed, 395 insertions(+), 6 deletions(-) create mode 100644 tools/perf/util/metricgroup.c create mode 100644 tools/perf/util/metricgroup.h diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index c37d61682dfb..823fce7674bb 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -199,6 +199,13 @@ Aggregate counts per processor socket for system-wide mode measurements. --per-core:: Aggregate counts per physical processor for system-wide mode measurements. +-M:: +--metrics:: +Print metrics or metricgroups specified in a comma separated list. +For a group all metrics from the group are added. +The events from the metrics are automatically measured. +See perf list output for the possible metrics and metricgroups. + -A:: --no-aggr:: Do not aggregate counts across all monitored CPUs.
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 7cc61eb0d83b..874bc6dd8d60 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,6 +65,7 @@ #include "util/tool.h" #include "util/group.h" #include "util/string2.h" +#include "util/metricgroup.h" #include "asm/bug.h" #include @@ -133,6 +134,8 @@ static const char *smi_cost_attrs = { static struct perf_evlist *evsel_list; +static struct rblist metric_events; + static struct target target = { .uid = UINT_MAX, }; @@ -1234,7 +1237,7 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, perf_stat__print_shadow_stats(counter, uval, first_shadow_cpu(counter, id), - &out); + &out, &metric_events); if (!csv_output && !metric_only) { print_noise(counter, noise); print_running(run, ena); @@ -1565,7 +1568,8 @@ static void print_metric_headers(const char *prefix, bool no_indent) os.evsel = counter; perf_stat__print_shadow_stats(counter, 0, 0, - &out); + &out, + &metric_events); } fputc('\n', stat_config.output); } @@ -1789,6 +1793,13 @@ static int enable_metric_only(const struct option *opt __maybe_unused, return 0; } +static int parse_metric_groups(const struct option *opt, + const char *str, + int unset __maybe_unused) +{ + return metricgroup__parse_groups(opt, str, &metric_events); +} + static const struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -1854,6 +1865,9 @@ static const struct option stat_options[] = { "measure topdown level 1 statistics"), OPT_BOOLEAN(0, "smi-cost", &smi_cost, "measure SMI cost"), + OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list", + "monitor specified metrics or metric groups (separated by ,)", + parse_metric_groups), OPT_END() }; diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 94518c1bf8b6..71ab8466714d 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -34,6 +34,7 @@ libperf-y += dso.o libperf-y += symbol.o libperf-y += symbol_fprintf.o libperf-y += color.o +libperf-y += metricgroup.o libperf-y += header.o libperf-y += callchain.o libperf-y += values.o diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c new file mode 100644 index 000000000000..7516b1746594 --- /dev/null +++ b/tools/perf/util/metricgroup.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2017, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +/* Manage metrics and groups of metrics from JSON files */ + +#include "metricgroup.h" +#include "evlist.h" +#include "strbuf.h" +#include "pmu.h" +#include "expr.h" +#include "rblist.h" +#include "pmu.h" +#include +#include +#include +#include "pmu-events/pmu-events.h" +#include "strbuf.h" +#include "strlist.h" +#include +#include + +struct metric_event *metricgroup__lookup(struct rblist *metric_events, + struct perf_evsel *evsel, + bool create) +{ + struct rb_node *nd; + struct metric_event me = { + .evsel = evsel + }; + nd = rblist__find(metric_events, &me); + if (nd) + return container_of(nd, struct metric_event, nd); + if (create) { + rblist__add_node(metric_events, &me); + nd = rblist__find(metric_events, &me); + if (nd) + return container_of(nd, struct metric_event, nd); + } + return NULL; +} + +static int metric_event_cmp(struct rb_node *rb_node, const void *entry) +{ + struct metric_event *a = container_of(rb_node, + struct metric_event, + nd); + const struct metric_event *b = entry; + + if (a->evsel == b->evsel) + return 0; + if ((char *)a->evsel < (char *)b->evsel) + return -1; + return +1; +} + +static struct rb_node *metric_event_new(struct rblist *rblist __maybe_unused, + const void *entry) +{ + struct metric_event *me = malloc(sizeof(struct metric_event)); + + if (!me) + return NULL; + memcpy(me, entry, sizeof(struct metric_event)); + me->evsel = ((struct metric_event *)entry)->evsel; + INIT_LIST_HEAD(&me->head); + return &me->nd; +} + +static void metricgroup__rblist_init(struct rblist *metric_events) +{ + rblist__init(metric_events); + metric_events->node_cmp = metric_event_cmp; + metric_events->node_new = metric_event_new; +} + +struct egroup { + struct list_head nd; + int idnum; + const char **ids; + const char *metric_name; + const char *metric_expr; +}; + +static struct perf_evsel *find_evsel(struct perf_evlist *perf_evlist, + const char **ids, + int idnum, + struct perf_evsel **metric_events) +{ + struct perf_evsel *ev, *start = NULL; + int ind = 0; + + evlist__for_each_entry (perf_evlist, ev) { + if (!strcmp(ev->name, ids[ind])) { + metric_events[ind] = ev; + if (ind == 0) + start = ev; + if (++ind == idnum) { + metric_events[ind] = NULL; + return start; + } + } else { + ind = 0; + start = NULL; + } + } + /* + * This can happen when an alias expands to multiple + * events, like for uncore events. + * We don't support this case for now. 
+ */ + return NULL; +} + +static int metricgroup__setup_events(struct list_head *groups, + struct perf_evlist *perf_evlist, + struct rblist *metric_events_list) +{ + struct metric_event *me; + struct metric_expr *expr; + int i = 0; + int ret = 0; + struct egroup *eg; + struct perf_evsel *evsel; + + list_for_each_entry (eg, groups, nd) { + struct perf_evsel **metric_events; + + metric_events = calloc(sizeof(void *), eg->idnum + 1); + if (!metric_events) { + ret = -ENOMEM; + break; + } + evsel = find_evsel(perf_evlist, eg->ids, eg->idnum, + metric_events); + if (!evsel) { + pr_debug("Cannot resolve %s: %s\n", + eg->metric_name, eg->metric_expr); + continue; + } + for (i = 0; i < eg->idnum; i++) + metric_events[i]->collect_stat = true; + me = metricgroup__lookup(metric_events_list, evsel, true); + if (!me) { + ret = -ENOMEM; + break; + } + expr = malloc(sizeof(struct metric_expr)); + if (!expr) { + ret = -ENOMEM; + break; + } + expr->metric_expr = eg->metric_expr; + expr->metric_name = eg->metric_name; + expr->metric_events = metric_events; + list_add(&expr->nd, &me->head); + } + return ret; +} + +static bool match_metric(const char *n, const char *list) +{ + int len; + char *m; + + if (!list) + return false; + if (!strcmp(list, "all")) + return true; + if (!n) + return !strcasecmp(list, "No_group"); + len = strlen(list); + m = strcasestr(n, list); + if (!m) + return false; + if ((m == n || m[-1] == ';' || m[-1] == ' ') && + (m[len] == 0 || m[len] == ';')) + return true; + return false; +} + +static int metricgroup__add_metric(const char *metric, struct strbuf *events, + struct list_head *group_list) +{ + struct pmu_events_map *map = perf_pmu__find_map(); + struct pmu_event *pe; + int ret = -EINVAL; + int i, j; + + strbuf_init(events, 100); + strbuf_addf(events, "%s", ""); + + if (!map) + return 0; + + for (i = 0; ; i++) { + pe = &map->table[i]; + + if (!pe->name && !pe->metric_group && !pe->metric_name) + break; + if (!pe->metric_expr) + continue; + if (match_metric(pe->metric_group, metric) || + match_metric(pe->metric_name, metric)) { + const char **ids; + int idnum; + struct egroup *eg; + + pr_debug("metric expr %s for %s\n", pe->metric_expr, pe->metric_name); + + if (expr__find_other(pe->metric_expr, + NULL, &ids, &idnum) < 0) + continue; + if (events->len > 0) + strbuf_addf(events, ","); + for (j = 0; j < idnum; j++) { + pr_debug("found event %s\n", ids[j]); + strbuf_addf(events, "%s%s", + j == 0 ? 
"{" : ",", + ids[j]); + } + strbuf_addf(events, "}:W"); + + eg = malloc(sizeof(struct egroup)); + if (!eg) { + ret = -ENOMEM; + break; + } + eg->ids = ids; + eg->idnum = idnum; + eg->metric_name = pe->metric_name; + eg->metric_expr = pe->metric_expr; + list_add_tail(&eg->nd, group_list); + ret = 0; + } + } + return ret; +} + +static int metricgroup__add_metric_list(const char *list, struct strbuf *events, + struct list_head *group_list) +{ + char *llist, *nlist, *p; + int ret = -EINVAL; + + nlist = strdup(list); + if (!nlist) + return -ENOMEM; + llist = nlist; + while ((p = strsep(&llist, ",")) != NULL) { + ret = metricgroup__add_metric(p, events, group_list); + if (ret == -EINVAL) { + fprintf(stderr, "Cannot find metric or group `%s'\n", + p); + break; + } + } + free(nlist); + return ret; +} + +static void metricgroup__free_egroups(struct list_head *group_list) +{ + struct egroup *eg, *egtmp; + int i; + + list_for_each_entry_safe (eg, egtmp, group_list, nd) { + for (i = 0; i < eg->idnum; i++) + free((char *)eg->ids[i]); + free(eg->ids); + free(eg); + } +} + +int metricgroup__parse_groups(const struct option *opt, + const char *str, + struct rblist *metric_events) +{ + struct parse_events_error parse_error; + struct perf_evlist *perf_evlist = *(struct perf_evlist **)opt->value; + struct strbuf extra_events; + LIST_HEAD(group_list); + int ret; + + if (metric_events->nr_entries == 0) + metricgroup__rblist_init(metric_events); + ret = metricgroup__add_metric_list(str, &extra_events, &group_list); + if (ret) + return ret; + pr_debug("adding %s\n", extra_events.buf); + memset(&parse_error, 0, sizeof(struct parse_events_error)); + ret = parse_events(perf_evlist, extra_events.buf, &parse_error); + if (ret) { + pr_err("Cannot set up events %s\n", extra_events.buf); + goto out; + } + strbuf_release(&extra_events); + ret = metricgroup__setup_events(&group_list, perf_evlist, + metric_events); +out: + metricgroup__free_egroups(&group_list); + return ret; +} diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h new file mode 100644 index 000000000000..06854e125ee7 --- /dev/null +++ b/tools/perf/util/metricgroup.h @@ -0,0 +1,31 @@ +#ifndef METRICGROUP_H +#define METRICGROUP_H 1 + +#include "linux/list.h" +#include "rblist.h" +#include +#include "evlist.h" +#include "strbuf.h" + +struct metric_event { + struct rb_node nd; + struct perf_evsel *evsel; + struct list_head head; /* list of metric_expr */ +}; + +struct metric_expr { + struct list_head nd; + const char *metric_expr; + const char *metric_name; + struct perf_evsel **metric_events; +}; + +struct metric_event *metricgroup__lookup(struct rblist *metric_events, + struct perf_evsel *evsel, + bool create); +int metricgroup__parse_groups(const struct option *opt, + const char *str, + struct rblist *metric_events); + +void metricgroup__print(bool metrics, bool groups, char *filter, bool raw); +#endif diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index ed25d7f88731..7070638ab600 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -580,8 +580,11 @@ static void pmu_add_cpu_aliases(struct list_head *head, const char *name) const char *pname; pe = &map->table[i++]; - if (!pe->name) + if (!pe->name) { + if (pe->metric_group || pe->metric_name) + continue; break; + } pname = pe->pmu ? 
pe->pmu : "cpu"; if (strncmp(pname, name, strlen(pname))) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 8c7ab29169b9..42e6c17be7ff 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -6,6 +6,7 @@ #include "rblist.h" #include "evlist.h" #include "expr.h" +#include "metricgroup.h" enum { CTX_BIT_USER = 1 << 0, @@ -671,13 +672,16 @@ static void generic_metric(const char *metric_expr, void perf_stat__print_shadow_stats(struct perf_evsel *evsel, double avg, int cpu, - struct perf_stat_output_ctx *out) + struct perf_stat_output_ctx *out, + struct rblist *metric_events) { void *ctxp = out->ctx; print_metric_t print_metric = out->print_metric; double total, ratio = 0.0, total2; const char *color = NULL; int ctx = evsel_context(evsel); + struct metric_event *me; + int num = 1; if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { total = avg_stats(&runtime_cycles_stats[ctx][cpu]); @@ -880,6 +884,20 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, } else if (perf_stat_evsel__is(evsel, SMI_NUM)) { print_smi_cost(cpu, evsel, out); } else { - print_metric(ctxp, NULL, NULL, NULL, 0); + num = 0; } + + if ((me = metricgroup__lookup(metric_events, evsel, false)) != NULL) { + struct metric_expr *mexp; + + list_for_each_entry (mexp, &me->head, nd) { + if (num++ > 0) + out->new_line(ctxp); + generic_metric(mexp->metric_expr, mexp->metric_events, + evsel->name, mexp->metric_name, + avg, cpu, ctx, out); + } + } + if (num == 0) + print_metric(ctxp, NULL, NULL, NULL, 0); } diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index eacaf958e19d..47915df346fb 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -91,9 +91,11 @@ struct perf_stat_output_ctx { bool force_header; }; +struct rblist; void perf_stat__print_shadow_stats(struct perf_evsel *evsel, double avg, int cpu, - struct perf_stat_output_ctx *out); + struct perf_stat_output_ctx *out, + struct rblist *metric_events); void perf_stat__collect_metric_expr(struct perf_evlist *); int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw); From 71b0acce78d12e99eeda6fd6642ba89cc2b2b49c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:32 -0700 Subject: [PATCH 0012/1085] perf list: Add metric groups to perf list Add code to perf list to print metric groups, and metrics that don't have an event name. The metricgroup code collects the eventgroups and events into a rblist, and then prints them according to the configured filters. The metricgroups are printed by default, but can be limited by perf list metric or perf list metricgroup % perf list metricgroup .. 
Metric Groups: DSB: DSB_Coverage [Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)] FLOPS: GFLOPs [Giga Floating Point Operations Per Second] Frontend: IFetch_Line_Utilization [Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions] Frontend_Bandwidth: DSB_Coverage [Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)] Memory_BW: MLP [Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)] v2: Check return value of asprintf to fix warning on FC26 Fix key in lookup/addition for the groups list Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-8-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 7 +- tools/perf/builtin-list.c | 7 + tools/perf/util/metricgroup.c | 176 +++++++++++++++++++++++++ tools/perf/util/parse-events.c | 3 + 4 files changed, 192 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 75fc17f47298..24679aed90b7 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -8,7 +8,8 @@ perf-list - List all symbolic event types SYNOPSIS -------- [verse] -'perf list' [--no-desc] [--long-desc] [hw|sw|cache|tracepoint|pmu|sdt|event_glob] +'perf list' [--no-desc] [--long-desc] + [hw|sw|cache|tracepoint|pmu|sdt|metric|metricgroup|event_glob] DESCRIPTION ----------- @@ -248,6 +249,10 @@ To limit the list use: . 'sdt' to list all Statically Defined Tracepoint events. +. 'metric' to list metrics + +. 'metricgroup' to list metricgroups with metrics. + . If none of the above is matched, it will apply the supplied glob to all events, printing the ones that match. 
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 4bf2cb4d25aa..b2d2ad3dd478 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -15,6 +15,7 @@ #include "util/cache.h" #include "util/pmu.h" #include "util/debug.h" +#include "util/metricgroup.h" #include static bool desc_flag = true; @@ -79,6 +80,10 @@ int cmd_list(int argc, const char **argv) long_desc_flag, details_flag); else if (strcmp(argv[i], "sdt") == 0) print_sdt_events(NULL, NULL, raw_dump); + else if (strcmp(argv[i], "metric") == 0) + metricgroup__print(true, false, NULL, raw_dump); + else if (strcmp(argv[i], "metricgroup") == 0) + metricgroup__print(false, true, NULL, raw_dump); else if ((sep = strchr(argv[i], ':')) != NULL) { int sep_idx; @@ -96,6 +101,7 @@ int cmd_list(int argc, const char **argv) s[sep_idx] = '\0'; print_tracepoint_events(s, s + sep_idx + 1, raw_dump); print_sdt_events(s, s + sep_idx + 1, raw_dump); + metricgroup__print(true, true, s, raw_dump); free(s); } else { if (asprintf(&s, "*%s*", argv[i]) < 0) { @@ -112,6 +118,7 @@ int cmd_list(int argc, const char **argv) details_flag); print_tracepoint_events(NULL, s, raw_dump); print_sdt_events(NULL, s, raw_dump); + metricgroup__print(true, true, NULL, raw_dump); free(s); } } diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 7516b1746594..2d60114f1870 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -189,6 +189,182 @@ static bool match_metric(const char *n, const char *list) return false; } +struct mep { + struct rb_node nd; + const char *name; + struct strlist *metrics; +}; + +static int mep_cmp(struct rb_node *rb_node, const void *entry) +{ + struct mep *a = container_of(rb_node, struct mep, nd); + struct mep *b = (struct mep *)entry; + + return strcmp(a->name, b->name); +} + +static struct rb_node *mep_new(struct rblist *rl __maybe_unused, + const void *entry) +{ + struct mep *me = malloc(sizeof(struct mep)); + + if (!me) + return NULL; + memcpy(me, entry, sizeof(struct mep)); + me->name = strdup(me->name); + if (!me->name) + goto out_me; + me->metrics = strlist__new(NULL, NULL); + if (!me->metrics) + goto out_name; + return &me->nd; +out_name: + free((char *)me->name); +out_me: + free(me); + return NULL; +} + +static struct mep *mep_lookup(struct rblist *groups, const char *name) +{ + struct rb_node *nd; + struct mep me = { + .name = name + }; + nd = rblist__find(groups, &me); + if (nd) + return container_of(nd, struct mep, nd); + rblist__add_node(groups, &me); + nd = rblist__find(groups, &me); + if (nd) + return container_of(nd, struct mep, nd); + return NULL; +} + +static void mep_delete(struct rblist *rl __maybe_unused, + struct rb_node *nd) +{ + struct mep *me = container_of(nd, struct mep, nd); + + strlist__delete(me->metrics); + free((void *)me->name); + free(me); +} + +static void metricgroup__print_strlist(struct strlist *metrics, bool raw) +{ + struct str_node *sn; + int n = 0; + + strlist__for_each_entry (sn, metrics) { + if (raw) + printf("%s%s", n > 0 ? 
" " : "", sn->s); + else + printf(" %s\n", sn->s); + n++; + } + if (raw) + putchar('\n'); +} + +void metricgroup__print(bool metrics, bool metricgroups, char *filter, + bool raw) +{ + struct pmu_events_map *map = perf_pmu__find_map(); + struct pmu_event *pe; + int i; + struct rblist groups; + struct rb_node *node, *next; + struct strlist *metriclist = NULL; + + if (!map) + return; + + if (!metricgroups) { + metriclist = strlist__new(NULL, NULL); + if (!metriclist) + return; + } + + rblist__init(&groups); + groups.node_new = mep_new; + groups.node_cmp = mep_cmp; + groups.node_delete = mep_delete; + for (i = 0; ; i++) { + const char *g; + pe = &map->table[i]; + + if (!pe->name && !pe->metric_group && !pe->metric_name) + break; + if (!pe->metric_expr) + continue; + g = pe->metric_group; + if (!g && pe->metric_name) { + if (pe->name) + continue; + g = "No_group"; + } + if (g) { + char *omg; + char *mg = strdup(g); + + if (!mg) + return; + omg = mg; + while ((g = strsep(&mg, ";")) != NULL) { + struct mep *me; + char *s; + + if (*g == 0) + g = "No_group"; + while (isspace(*g)) + g++; + if (filter && !strstr(g, filter)) + continue; + if (raw) + s = (char *)pe->metric_name; + else { + if (asprintf(&s, "%s\n\t[%s]", + pe->metric_name, pe->desc) < 0) + return; + } + + if (!s) + continue; + + if (!metricgroups) { + strlist__add(metriclist, s); + } else { + me = mep_lookup(&groups, g); + if (!me) + continue; + strlist__add(me->metrics, s); + } + } + free(omg); + } + } + + if (metricgroups && !raw) + printf("\nMetric Groups:\n\n"); + else if (metrics && !raw) + printf("\nMetrics:\n\n"); + + for (node = rb_first(&groups.entries); node; node = next) { + struct mep *me = container_of(node, struct mep, nd); + + if (metricgroups) + printf("%s%s%s", me->name, metrics ? ":" : "", raw ? " " : "\n"); + if (metrics) + metricgroup__print_strlist(me->metrics, raw); + next = rb_next(node); + rblist__remove_node(&groups, node); + } + if (!metricgroups) + metricgroup__print_strlist(metriclist, raw); + strlist__delete(metriclist); +} + static int metricgroup__add_metric(const char *metric, struct strbuf *events, struct list_head *group_list) { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 57d7acf890e0..75588920fccc 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -28,6 +28,7 @@ #include "probe-file.h" #include "asm/bug.h" #include "util/parse-branch-options.h" +#include "metricgroup.h" #define MAX_NAME_LEN 100 @@ -2380,6 +2381,8 @@ void print_events(const char *event_glob, bool name_only, bool quiet_flag, print_tracepoint_events(NULL, NULL, name_only); print_sdt_events(NULL, NULL, name_only); + + metricgroup__print(true, true, NULL, name_only); } int parse_events__is_hardcoded_term(struct parse_events_term *term) From 4e1a096380e3b558ef021afc08e193ce5d1be478 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:33 -0700 Subject: [PATCH 0013/1085] perf stat: Don't use ctx for saved values lookup We don't need to use ctx to look up events for saved values. The context is already part of the evsel pointer, which is the primary key. 
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-9-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 42e6c17be7ff..664f49a9b012 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -56,7 +56,6 @@ struct saved_value { struct rb_node rb_node; struct perf_evsel *evsel; int cpu; - int ctx; struct stats stats; }; @@ -67,8 +66,6 @@ static int saved_value_cmp(struct rb_node *rb_node, const void *entry) rb_node); const struct saved_value *b = entry; - if (a->ctx != b->ctx) - return a->ctx - b->ctx; if (a->cpu != b->cpu) return a->cpu - b->cpu; if (a->evsel == b->evsel) @@ -90,13 +87,12 @@ static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused, } static struct saved_value *saved_value_lookup(struct perf_evsel *evsel, - int cpu, int ctx, + int cpu, bool create) { struct rb_node *nd; struct saved_value dm = { .cpu = cpu, - .ctx = ctx, .evsel = evsel, }; nd = rblist__find(&runtime_saved_values, &dm); @@ -232,8 +228,7 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count, update_stats(&runtime_aperf_stats[ctx][cpu], count[0]); if (counter->collect_stat) { - struct saved_value *v = saved_value_lookup(counter, cpu, ctx, - true); + struct saved_value *v = saved_value_lookup(counter, cpu, true); update_stats(&v->stats, count[0]); } } @@ -634,7 +629,6 @@ static void generic_metric(const char *metric_expr, const char *metric_name, double avg, int cpu, - int ctx, struct perf_stat_output_ctx *out) { print_metric_t print_metric = out->print_metric; @@ -648,7 +642,7 @@ static void generic_metric(const char *metric_expr, for (i = 0; metric_events[i]; i++) { struct saved_value *v; - v = saved_value_lookup(metric_events[i], cpu, ctx, false); + v = saved_value_lookup(metric_events[i], cpu, false); if (!v) break; expr__add_id(&pctx, metric_events[i]->name, avg_stats(&v->stats)); @@ -866,7 +860,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, print_metric(ctxp, NULL, NULL, name, 0); } else if (evsel->metric_expr) { generic_metric(evsel->metric_expr, evsel->metric_events, evsel->name, - evsel->metric_name, avg, cpu, ctx, out); + evsel->metric_name, avg, cpu, out); } else if (runtime_nsecs_stats[cpu].n != 0) { char unit = 'M'; char unit_buf[10]; @@ -895,7 +889,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, out->new_line(ctxp); generic_metric(mexp->metric_expr, mexp->metric_events, evsel->name, mexp->metric_name, - avg, cpu, ctx, out); + avg, cpu, out); } } if (num == 0) From fd48aad9b0f3f7654433dfae3a72ceda36e2de28 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:34 -0700 Subject: [PATCH 0014/1085] perf stat: Support duration_time for metrics Some of the metrics formulas (like GFLOPs) need to know how long the measurement period is. Support an internal event called duration_time, which reports time in seconds. It maps to the dummy event, but is special cased for statistics to report the walltime duration. So far it is not printed, but only used internally for metrics. 
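As a rough sketch of the special case generic_metric() gains below, with a simplified stats type standing in for perf's struct stats:

#include <string.h>

struct stats { double mean; };

static struct stats walltime_nsecs_stats;	/* maintained elsewhere, in nanoseconds */

static double avg_stats(const struct stats *s)
{
	return s->mean;
}

/* Resolve one metric operand; duration_time yields wall time in seconds. */
static double metric_operand(const char *name, const struct stats *per_event)
{
	if (!strcmp(name, "duration_time"))
		return avg_stats(&walltime_nsecs_stats) * 1e-9;	/* ns -> s */
	return avg_stats(per_event);
}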
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-10-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.l | 1 + tools/perf/util/stat-shadow.c | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index fdb5bb52f01f..ea2426daf7e8 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -288,6 +288,7 @@ cpu-migrations|migrations { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COU alignment-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); } emulation-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); } dummy { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } +duration_time { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); } /* diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 664f49a9b012..a2c12d1ef32a 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -641,11 +641,20 @@ static void generic_metric(const char *metric_expr, expr__add_id(&pctx, name, avg); for (i = 0; metric_events[i]; i++) { struct saved_value *v; + struct stats *stats; + double scale; - v = saved_value_lookup(metric_events[i], cpu, false); - if (!v) - break; - expr__add_id(&pctx, metric_events[i]->name, avg_stats(&v->stats)); + if (!strcmp(metric_events[i]->name, "duration_time")) { + stats = &walltime_nsecs_stats; + scale = 1e-9; + } else { + v = saved_value_lookup(metric_events[i], cpu, false); + if (!v) + break; + stats = &v->stats; + scale = 1.0; + } + expr__add_id(&pctx, metric_events[i]->name, avg_stats(stats)*scale); } if (!metric_events[i]) { const char *p = metric_expr; From e864c5ca145e49bfce4847bd14b47b5f8549b2b1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:35 -0700 Subject: [PATCH 0015/1085] perf stat: Hide internal duration_time counter Some perf stat metrics use an internal "duration_time" metric. It is not correctly printed however. So hide it during output to avoid confusing users with 0 counts. 
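The skip repeated at each print site below boils down to this pattern; the sketch uses a simplified event type, not perf's struct perf_evsel:

#include <stdio.h>
#include <string.h>

struct evsel { const char *name; };

static int is_duration_time(const struct evsel *evsel)
{
	return strcmp(evsel->name, "duration_time") == 0;
}

static void print_counters_sketch(struct evsel **counters, int n)
{
	for (int i = 0; i < n; i++) {
		if (is_duration_time(counters[i]))
			continue;	/* internal helper: would only print 0 */
		printf("%s\n", counters[i]->name);
	}
}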
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-11-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 874bc6dd8d60..855890e0b70b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -195,6 +195,11 @@ static struct perf_stat_config stat_config = { .scale = true, }; +static bool is_duration_time(struct perf_evsel *evsel) +{ + return !strcmp(evsel->name, "duration_time"); +} + static inline void diff_timespec(struct timespec *r, struct timespec *a, struct timespec *b) { @@ -1363,6 +1368,9 @@ static void print_aggr(char *prefix) ad.id = id = aggr_map->map[s]; first = true; evlist__for_each_entry(evsel_list, counter) { + if (is_duration_time(counter)) + continue; + ad.val = ad.ena = ad.run = 0; ad.nr = 0; if (!collect_data(counter, aggr_cb, &ad)) @@ -1506,6 +1514,8 @@ static void print_no_aggr_metric(char *prefix) if (prefix) fputs(prefix, stat_config.output); evlist__for_each_entry(evsel_list, counter) { + if (is_duration_time(counter)) + continue; if (first) { aggr_printout(counter, cpu, 0); first = false; @@ -1560,6 +1570,8 @@ static void print_metric_headers(const char *prefix, bool no_indent) /* Print metrics headers only */ evlist__for_each_entry(evsel_list, counter) { + if (is_duration_time(counter)) + continue; os.evsel = counter; out.ctx = &os; out.print_metric = print_metric_header; @@ -1707,12 +1719,18 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) print_aggr(prefix); break; case AGGR_THREAD: - evlist__for_each_entry(evsel_list, counter) + evlist__for_each_entry(evsel_list, counter) { + if (is_duration_time(counter)) + continue; print_aggr_thread(counter, prefix); + } break; case AGGR_GLOBAL: - evlist__for_each_entry(evsel_list, counter) + evlist__for_each_entry(evsel_list, counter) { + if (is_duration_time(counter)) + continue; print_counter_aggr(counter, prefix); + } if (metric_only) fputc('\n', stat_config.output); break; @@ -1720,8 +1738,11 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) if (metric_only) print_no_aggr_metric(prefix); else { - evlist__for_each_entry(evsel_list, counter) + evlist__for_each_entry(evsel_list, counter) { + if (is_duration_time(counter)) + continue; print_counter(counter, prefix); + } } break; case AGGR_UNSET: From b90f1333ef08d2a497ae239798868b046f4e3a97 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 12:40:36 -0700 Subject: [PATCH 0016/1085] perf stat: Update walltime_nsecs_stats in interval mode Some metrics (like GFLOPs) need walltime_nsecs_stats for each interval. Compute it for each interval instead of only at the end. Pointed out by Jiri. 
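A minimal sketch of the per-interval reset added to process_interval(), assuming the interval is configured in milliseconds as in builtin-stat.c and using a simplified stats type:

struct stats { double mean; unsigned long n; };

static void init_stats(struct stats *s)
{
	s->mean = 0;
	s->n = 0;
}

static void update_stats(struct stats *s, unsigned long long val)
{
	s->n++;
	s->mean += ((double)val - s->mean) / s->n;	/* running mean */
}

/* Give time-based metrics the length of this interval, not the total runtime. */
static void refresh_interval_walltime(struct stats *walltime_nsecs,
				      unsigned int interval_ms)
{
	init_stats(walltime_nsecs);
	update_stats(walltime_nsecs, interval_ms * 1000000ULL);	/* ms -> ns */
}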
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170831194036.30146-12-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 855890e0b70b..88f1d5fbdb48 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -415,6 +415,8 @@ static void process_interval(void) pr_err("failed to write stat round event\n"); } + init_stats(&walltime_nsecs_stats); + update_stats(&walltime_nsecs_stats, stat_config.interval * 1000000); print_counters(&rs, 0, NULL); } From 84c417422798c897f637b0249f64a52807b4a61b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Sep 2017 10:00:28 -0700 Subject: [PATCH 0017/1085] perf record: Support direct --user-regs arguments USER_REGS can currently only be collected implicitly with call graph recording. Sometimes it is useful to see them separately, and filter them. Add a new --user-regs option to record that is similar to --intr-regs, but acts on user regs. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170905170029.19722-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 2 ++ tools/perf/builtin-record.c | 3 +++ tools/perf/perf.h | 1 + tools/perf/util/evsel.c | 7 ++++++- 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index e397453e5a46..68a1ffb0a8a5 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -377,6 +377,8 @@ symbolic names, e.g. on x86, ax, si. To list the available registers use --intr-regs=\?. To name registers, pass a comma separated list such as --intr-regs=ax,bx. The list of register is architecture dependent. +--user-regs:: +Capture user registers at sample time. Same arguments as -I. --running-time:: Record running and enabled time for read events (:S) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 56f8142ff97f..9b379f3a3d99 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1643,6 +1643,9 @@ static struct option __record_options[] = { OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register", "sample selected machine registers on interrupt," " use -I ? to list register names", parse_regs), + OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register", + "sample selected machine registers on interrupt," + " use -I ? 
to list register names", parse_regs), OPT_BOOLEAN(0, "running-time", &record.opts.running_time, "Record running/enabled time of read (:S) events"), OPT_CALLBACK('k', "clockid", &record.opts, diff --git a/tools/perf/perf.h b/tools/perf/perf.h index dc442ba21bf6..fbb0a9cd0ac6 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -65,6 +65,7 @@ struct record_opts { unsigned int user_freq; u64 branch_stack; u64 sample_intr_regs; + u64 sample_user_regs; u64 default_interval; u64 user_interval; size_t auxtrace_snapshot_size; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 4bb89373eb52..7389746c0dc4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -678,7 +678,7 @@ void perf_evsel__config_callchain(struct perf_evsel *evsel, if (!function) { perf_evsel__set_sample_bit(evsel, REGS_USER); perf_evsel__set_sample_bit(evsel, STACK_USER); - attr->sample_regs_user = PERF_REGS_MASK; + attr->sample_regs_user |= PERF_REGS_MASK; attr->sample_stack_user = param->dump_size; attr->exclude_callchain_user = 1; } else { @@ -931,6 +931,11 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, perf_evsel__set_sample_bit(evsel, REGS_INTR); } + if (opts->sample_user_regs) { + attr->sample_regs_user |= opts->sample_user_regs; + perf_evsel__set_sample_bit(evsel, REGS_USER); + } + if (target__has_cpu(&opts->target) || opts->sample_cpu) perf_evsel__set_sample_bit(evsel, CPU); From b1491ace8eb2e92677cd9ee966763f8f53d29d16 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Sep 2017 11:40:57 -0700 Subject: [PATCH 0018/1085] perf script: Support user regs Teach perf script to print user regs. % perf record --user-regs=ip,sp ... % perf script -F ip,sym,uregs ... ffffffff9e060c24 native_write_msr ABI:2 SP:0x7ffd0ea06c38 IP:0x7fe77f55b637 ffffffff9e060c24 native_write_msr ABI:2 SP:0x7ffd0ea06c38 IP:0x7fe77f55b637 ffffffff9e060c24 native_write_msr ABI:2 SP:0x7ffd0ea06c38 IP:0x7fe77f55b637 ffffffff9e060c24 native_write_msr ABI:2 SP:0x7ffd0ea06c38 IP:0x7fe77f55b637 ffffffff9e00cc12 intel_pmu_handle_irq ABI:2 SP:0x7ffd0ea06c38 IP:0x7fe77f55b637 v2: Rebased on top of phys-addr patches Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170905184057.26135-1-andi@firstfloor.org [ Use PRIu64 for regs->abi in print_sample_uregs() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 4 ++-- tools/perf/builtin-script.c | 30 +++++++++++++++++++++++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 18dfcfa38454..bcc1ba35a2d8 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -116,8 +116,8 @@ OPTIONS --fields:: Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, - srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackoff, - callindent, insn, insnlen, synth, phys_addr. + srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn, + brstackoff, callindent, insn, insnlen, synth, phys_addr. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. 
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 3d4c3b5e1868..725dbd3dd104 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -88,6 +88,7 @@ enum perf_output_field { PERF_OUTPUT_BRSTACKOFF = 1U << 24, PERF_OUTPUT_SYNTH = 1U << 25, PERF_OUTPUT_PHYS_ADDR = 1U << 26, + PERF_OUTPUT_UREGS = 1U << 27, }; struct output_option { @@ -109,6 +110,7 @@ struct output_option { {.str = "srcline", .field = PERF_OUTPUT_SRCLINE}, {.str = "period", .field = PERF_OUTPUT_PERIOD}, {.str = "iregs", .field = PERF_OUTPUT_IREGS}, + {.str = "uregs", .field = PERF_OUTPUT_UREGS}, {.str = "brstack", .field = PERF_OUTPUT_BRSTACK}, {.str = "brstacksym", .field = PERF_OUTPUT_BRSTACKSYM}, {.str = "data_src", .field = PERF_OUTPUT_DATA_SRC}, @@ -385,6 +387,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, PERF_OUTPUT_IREGS)) return -EINVAL; + if (PRINT_FIELD(UREGS) && + perf_evsel__check_stype(evsel, PERF_SAMPLE_REGS_USER, "UREGS", + PERF_OUTPUT_UREGS)) + return -EINVAL; + if (PRINT_FIELD(PHYS_ADDR) && perf_evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR)) @@ -509,6 +516,24 @@ static void print_sample_iregs(struct perf_sample *sample, } } +static void print_sample_uregs(struct perf_sample *sample, + struct perf_event_attr *attr) +{ + struct regs_dump *regs = &sample->user_regs; + uint64_t mask = attr->sample_regs_user; + unsigned i = 0, r; + + if (!regs || !regs->regs) + return; + + printf(" ABI:%" PRIu64 " ", regs->abi); + + for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) { + u64 val = regs->regs[i++]; + printf("%5s:0x%"PRIx64" ", perf_reg_name(r), val); + } +} + static void print_sample_start(struct perf_sample *sample, struct thread *thread, struct perf_evsel *evsel) @@ -1444,6 +1469,9 @@ static void process_event(struct perf_script *script, if (PRINT_FIELD(IREGS)) print_sample_iregs(sample, attr); + if (PRINT_FIELD(UREGS)) + print_sample_uregs(sample, attr); + if (PRINT_FIELD(BRSTACK)) print_sample_brstack(sample, thread, attr); else if (PRINT_FIELD(BRSTACKSYM)) @@ -2739,7 +2767,7 @@ int cmd_script(int argc, const char **argv) "+field to add and -field to remove." "Valid types: hw,sw,trace,raw,synth. " "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," - "addr,symoff,period,iregs,brstack,brstacksym,flags," + "addr,symoff,period,iregs,uregs,brstack,brstacksym,flags," "bpf-output,callindent,insn,insnlen,brstackinsn,synth,phys_addr", parse_output_fields), OPT_BOOLEAN('a', "all-cpus", &system_wide, From 80f873557112fc163f011cd131d4cfe4959100a6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Sep 2017 10:46:21 +0200 Subject: [PATCH 0019/1085] perf tools: Add python-clean target To be able to clean up only the Python-related binaries. 
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170908084621.31595-3-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 91ef44bfaf3e..1df93b4c4648 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -173,7 +173,7 @@ AWK = awk # non-config cases config := 1 -NON_CONFIG_TARGETS := clean TAGS tags cscope help install-doc install-man install-html install-info install-pdf doc man html info pdf +NON_CONFIG_TARGETS := clean python-clean TAGS tags cscope help install-doc install-man install-html install-info install-pdf doc man html info pdf ifdef MAKECMDGOALS ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),) @@ -802,7 +802,10 @@ config-clean: $(call QUIET_CLEAN, config) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null -clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean fixdep-clean +python-clean: + $(python-clean) + +clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean fixdep-clean python-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected @@ -819,7 +822,6 @@ clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea $(OUTPUT)$(vhost_virtio_ioctl_array) \ $(OUTPUT)$(perf_ioctl_array) $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean - $(python-clean) # # To provide FEATURE-DUMP into $(FEATURE_DUMP_COPY) From 25cc4eb44b0c840eff0e5a46a85b9ccbde77401b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Sep 2017 14:05:09 +0200 Subject: [PATCH 0020/1085] perf ui progress: Add ui specific init function Adding a UI-specific init function, allowing the progress bar width to be set up based on the current screen scale. Adding a TUI init function to get finer-grained updates of the progress bar. 
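A simplified sketch of the optional hook this introduces, with stand-in types; a backend's init callback may refine the default step once the screen width is known:

struct ui_progress {
	const char *title;
	unsigned long long curr, next, step, total;
};

struct ui_progress_ops {
	void (*init)(struct ui_progress *p);	/* optional, may be NULL */
	void (*update)(struct ui_progress *p);
	void (*finish)(void);
};

static void progress_init(const struct ui_progress_ops *ops,
			  struct ui_progress *p,
			  unsigned long long total, const char *title)
{
	p->curr = 0;
	p->next = p->step = total / 16 ? total / 16 : 1;	/* default granularity */
	p->total = total;
	p->title = title;
	if (ops->init)	/* e.g. TUI: one step per screen column */
		ops->init(p);
}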
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170908120510.22515-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/progress.c | 2 ++ tools/perf/ui/progress.h | 1 + tools/perf/ui/tui/progress.c | 9 +++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/perf/ui/progress.c b/tools/perf/ui/progress.c index ae91c8148edf..3e2b5d64c55e 100644 --- a/tools/perf/ui/progress.c +++ b/tools/perf/ui/progress.c @@ -34,6 +34,8 @@ void ui_progress__init(struct ui_progress *p, u64 total, const char *title) p->total = total; p->title = title; + if (ui_progress__ops->init) + ui_progress__ops->init(p); } void ui_progress__finish(void) diff --git a/tools/perf/ui/progress.h b/tools/perf/ui/progress.h index 717d39d3052b..e5f434a2070b 100644 --- a/tools/perf/ui/progress.h +++ b/tools/perf/ui/progress.h @@ -14,6 +14,7 @@ void ui_progress__init(struct ui_progress *p, u64 total, const char *title); void ui_progress__update(struct ui_progress *p, u64 adv); struct ui_progress_ops { + void (*init)(struct ui_progress *p); void (*update)(struct ui_progress *p); void (*finish)(void); }; diff --git a/tools/perf/ui/tui/progress.c b/tools/perf/ui/tui/progress.c index c4b99008e2c9..f6b8f52aad7e 100644 --- a/tools/perf/ui/tui/progress.c +++ b/tools/perf/ui/tui/progress.c @@ -5,6 +5,11 @@ #include "tui.h" #include "../browser.h" +static void __tui_progress__init(struct ui_progress *p) +{ + p->next = p->step = p->total / (SLtt_Screen_Cols - 2) ?: 1; +} + static void tui_progress__update(struct ui_progress *p) { int bar, y; @@ -49,8 +54,8 @@ static void tui_progress__finish(void) pthread_mutex_unlock(&ui__lock); } -static struct ui_progress_ops tui_progress__ops = -{ +static struct ui_progress_ops tui_progress__ops = { + .init = __tui_progress__init, .update = tui_progress__update, .finish = tui_progress__finish, }; From 8233822f403b67bbaa1d58e8fa6b8f821fe7626d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Sep 2017 14:05:10 +0200 Subject: [PATCH 0021/1085] perf ui progress: Add size info into progress bar Adding the size values '[current/total]' into the progress bar, to show more detailed progress of data reading. Adding a new ui_progress__init_size function to specify that we want to display the size. 
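The title composition reduces to something like this sketch, which substitutes plain snprintf() for perf's unit_number__scnprintf() helper:

#include <stdio.h>

/* Yields e.g. "Processing events... [123/4096]" when size display is on. */
static void progress_title(char *buf, size_t size, const char *title,
			   unsigned long long curr, unsigned long long total,
			   int show_size)
{
	if (show_size)
		snprintf(buf, size, "%s [%llu/%llu]", title, curr, total);
	else
		snprintf(buf, size, "%s", title);
}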
Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170908120510.22515-5-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/progress.c | 4 +++- tools/perf/ui/progress.h | 11 ++++++++++- tools/perf/ui/tui/progress.c | 23 ++++++++++++++++++++++- tools/perf/util/session.c | 2 +- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/tools/perf/ui/progress.c b/tools/perf/ui/progress.c index 3e2b5d64c55e..7ade387d511c 100644 --- a/tools/perf/ui/progress.c +++ b/tools/perf/ui/progress.c @@ -27,12 +27,14 @@ void ui_progress__update(struct ui_progress *p, u64 adv) } } -void ui_progress__init(struct ui_progress *p, u64 total, const char *title) +void __ui_progress__init(struct ui_progress *p, u64 total, + const char *title, bool size) { p->curr = 0; p->next = p->step = total / 16 ?: 1; p->total = total; p->title = title; + p->size = size; if (ui_progress__ops->init) ui_progress__ops->init(p); diff --git a/tools/perf/ui/progress.h b/tools/perf/ui/progress.h index e5f434a2070b..fbaa1507ebfe 100644 --- a/tools/perf/ui/progress.h +++ b/tools/perf/ui/progress.h @@ -8,9 +8,18 @@ void ui_progress__finish(void); struct ui_progress { const char *title; u64 curr, next, step, total; + bool size; }; -void ui_progress__init(struct ui_progress *p, u64 total, const char *title); +void __ui_progress__init(struct ui_progress *p, u64 total, + const char *title, bool size); + +#define ui_progress__init(p, total, title) \ + __ui_progress__init(p, total, title, false) + +#define ui_progress__init_size(p, total, title) \ + __ui_progress__init(p, total, title, true) + void ui_progress__update(struct ui_progress *p, u64 adv); struct ui_progress_ops { diff --git a/tools/perf/ui/tui/progress.c b/tools/perf/ui/tui/progress.c index f6b8f52aad7e..68f6144ea603 100644 --- a/tools/perf/ui/tui/progress.c +++ b/tools/perf/ui/tui/progress.c @@ -1,8 +1,10 @@ +#include #include "../cache.h" #include "../progress.h" #include "../libslang.h" #include "../ui.h" #include "tui.h" +#include "units.h" #include "../browser.h" static void __tui_progress__init(struct ui_progress *p) @@ -10,8 +12,22 @@ static void __tui_progress__init(struct ui_progress *p) p->next = p->step = p->total / (SLtt_Screen_Cols - 2) ?: 1; } +static int get_title(struct ui_progress *p, char *buf, size_t size) +{ + char buf_cur[20]; + char buf_tot[20]; + int ret; + + ret = unit_number__scnprintf(buf_cur, sizeof(buf_cur), p->curr); + ret += unit_number__scnprintf(buf_tot, sizeof(buf_tot), p->total); + + return ret + scnprintf(buf, size, "%s [%s/%s]", + p->title, buf_cur, buf_tot); +} + static void tui_progress__update(struct ui_progress *p) { + char buf[100], *title = (char *) p->title; int bar, y; /* * FIXME: We should have a per UI backend way of showing progress, @@ -23,13 +39,18 @@ static void tui_progress__update(struct ui_progress *p) if (p->total == 0) return; + if (p->size) { + get_title(p, buf, sizeof(buf)); + title = buf; + } + ui__refresh_dimensions(false); pthread_mutex_lock(&ui__lock); y = SLtt_Screen_Rows / 2 - 2; SLsmg_set_color(0); SLsmg_draw_box(y, 0, 3, SLtt_Screen_Cols); SLsmg_gotorc(y++, 1); - SLsmg_write_string((char *)p->title); + SLsmg_write_string(title); SLsmg_fill_region(y, 1, 1, SLtt_Screen_Cols - 2, ' '); SLsmg_set_color(HE_COLORSET_SELECTED); bar = ((SLtt_Screen_Cols - 2) * p->curr) / p->total; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a7ebd9fe8e40..ceac0848469d 100644 --- 
a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1847,7 +1847,7 @@ static int __perf_session__process_events(struct perf_session *session, if (data_offset + data_size < file_size) file_size = data_offset + data_size; - ui_progress__init(&prog, file_size, "Processing events..."); + ui_progress__init_size(&prog, file_size, "Processing events..."); mmap_size = MMAP_SIZE; if (mmap_size > file_size) { From ecdad24d7a4480c9af0ff6dbe00ac8bbae720d19 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 7 Sep 2017 10:55:46 -0700 Subject: [PATCH 0022/1085] perf tools: Use scandir() to replace readdir() In perf_event__synthesize_threads(), perf goes through all proc files serially with readdir(). scandir() takes a snapshot of /proc, which is multithreading friendly. It's possible that some threads are added during event synthesis and therefore missed. But the number of lost threads should be small. They should not impact the final analysis. Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Lukasz Odzioba Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1504806954-150842-3-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 45 +++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 1c905ba3641b..17c21ea68a72 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -683,12 +683,14 @@ int perf_event__synthesize_threads(struct perf_tool *tool, bool mmap_data, unsigned int proc_map_timeout) { - DIR *proc; - char proc_path[PATH_MAX]; - struct dirent *dirent; union perf_event *comm_event, *mmap_event, *fork_event; union perf_event *namespaces_event; + char proc_path[PATH_MAX]; + struct dirent **dirent; int err = -1; + char *end; + pid_t pid; + int n, i; if (machine__is_default_guest(machine)) return 0; @@ -712,29 +714,32 @@ int perf_event__synthesize_threads(struct perf_tool *tool, goto out_free_fork; snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir); - proc = opendir(proc_path); + n = scandir(proc_path, &dirent, 0, alphasort); - if (proc == NULL) + if (n < 0) goto out_free_namespaces; - while ((dirent = readdir(proc)) != NULL) { - char *end; - pid_t pid = strtol(dirent->d_name, &end, 10); - - if (*end) /* only interested in proper numerical dirents */ + for (i = 0; i < n; i++) { + if (!isdigit(dirent[i]->d_name[0])) continue; - /* - * We may race with exiting thread, so don't stop just because - * one thread couldn't be synthesized. - */ - __event__synthesize_thread(comm_event, mmap_event, fork_event, - namespaces_event, pid, 1, process, - tool, machine, mmap_data, - proc_map_timeout); - } + pid = (pid_t)strtol(dirent[i]->d_name, &end, 10); + /* only interested in proper numerical dirents */ + if (!*end) { + /* + * We may race with exiting thread, so don't stop just because + * one thread couldn't be synthesized. 
+ */ + __event__synthesize_thread(comm_event, mmap_event, fork_event, + namespaces_event, pid, 1, process, + tool, machine, mmap_data, + proc_map_timeout); + } + free(dirent[i]); + } + free(dirent); err = 0; - closedir(proc); + out_free_namespaces: free(namespaces_event); out_free_fork: From 5c2615556d4410baebc9b336f14befe0bb32cde4 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 7 Sep 2017 12:18:51 +0900 Subject: [PATCH 0023/1085] perf config: Write a config file just once Currently set_config() can be repeatedly called for each input config on the below case: $ perf config kmem.default=slab report.children=false ... But it's a waste, so only once write a config file gathering all given config key=value pairs. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1504754331-9776-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-config.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index a1d82e33282c..b89417d9305e 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -34,8 +34,7 @@ static struct option config_options[] = { OPT_END() }; -static int set_config(struct perf_config_set *set, const char *file_name, - const char *var, const char *value) +static int set_config(struct perf_config_set *set, const char *file_name) { struct perf_config_section *section = NULL; struct perf_config_item *item = NULL; @@ -49,7 +48,6 @@ static int set_config(struct perf_config_set *set, const char *file_name, if (!fp) return -1; - perf_config_set__collect(set, file_name, var, value); fprintf(fp, "%s\n", first_line); /* overwrite configvariables */ @@ -161,6 +159,7 @@ int cmd_config(int argc, const char **argv) struct perf_config_set *set; char *user_config = mkpath("%s/.perfconfig", getenv("HOME")); const char *config_filename; + bool changed = false; argc = parse_options(argc, argv, config_options, config_usage, PARSE_OPT_STOP_AT_NON_OPTION); @@ -231,15 +230,26 @@ int cmd_config(int argc, const char **argv) goto out_err; } } else { - if (set_config(set, config_filename, var, value) < 0) { - pr_err("Failed to set '%s=%s' on %s\n", - var, value, config_filename); + if (perf_config_set__collect(set, config_filename, + var, value) < 0) { + pr_err("Failed to add '%s=%s'\n", + var, value); free(arg); goto out_err; } + changed = true; } free(arg); } + + if (!changed) + break; + + if (set_config(set, config_filename) < 0) { + pr_err("Failed to set the configs on %s\n", + config_filename); + goto out_err; + } } ret = 0; From 55421b4fb7054f85274b1b6a321e204dac696133 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 7 Sep 2017 12:18:56 +0900 Subject: [PATCH 0024/1085] perf config: Allow creating empty config set for config file autogeneration When there isn't a config file (e.g. ~/.perfconfig) or it has nothing, the config set wasn't created. If the config set does not exist, a config file can't be autogenerated. So allow creating a empty config set in the above case, then we can support the config file autogeneration. Before: $ rm -f ~/.perfconfig $ perf config --user report.children=false $ cat ~/.perfconfig cat: /root/.perfconfig: No such file or directory But I think it should work even if there isn't a config file. After: $ rm -f ~/.perfconfig $ perf config --user report.children=false $ cat ~/.perfconfig # this file is auto-generated. 
[report] children = false NOTE: As a result, if perf_config_set__init() fails, it looks as if the config set isn't freed. But it isn't a problem, because the config set will be freed by perf_config_set__delete() at the end of cmd_config(). Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1504754336-9824-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/config.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index bc75596f9e79..d2b6983b1779 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -700,10 +700,7 @@ struct perf_config_set *perf_config_set__new(void) if (set) { INIT_LIST_HEAD(&set->sections); - if (perf_config_set__init(set) < 0) { - perf_config_set__delete(set); - set = NULL; - } + perf_config_set__init(set); } return set; From c23c2a0f236601c635d9a9d18d7993641e72aa8c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Sep 2017 10:50:26 -0300 Subject: [PATCH 0025/1085] perf tools: Make copyfile_offset() static There is no usage outside util.c and this is the only remaining reason for fcntl.h to be included in util.h, to get the loff_t definition in Alpine Linux, so make it static. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-2dzlsao7k6ihozs5karw6kpx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 1 + tools/perf/util/data.c | 1 + tools/perf/util/dso.c | 1 + tools/perf/util/event.c | 1 + tools/perf/util/evlist.h | 1 + tools/perf/util/namespaces.c | 1 + tools/perf/util/pmu.c | 1 + tools/perf/util/probe-file.c | 1 + tools/perf/util/util.c | 3 ++- tools/perf/util/util.h | 2 -- tools/perf/util/zlib.c | 1 + 11 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 377bea009163..d0fee35db0e7 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -3,6 +3,7 @@ * * Builtin regression testing command: ever growing number of sanity tests */ +#include #include #include #include diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 263f5a906ba5..1123b30e3033 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index b9e087fb8247..ffd723179a4b 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "compress.h" #include "path.h" #include "symbol.h" diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 17c21ea68a72..10366b87d0b5 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index bf2c4936e35f..b1c14f1fdc27 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "../perf.h" #include "event.h" diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c index a58e91197729..5be021701f34 100644 --- a/tools/perf/util/namespaces.c +++ b/tools/perf/util/namespaces.c @@ -11,6 +11,7 @@ #include "event.h" #include #include +#include #include #include #include diff --git a/tools/perf/util/pmu.c 
b/tools/perf/util/pmu.c index 7070638ab600..0b11dfc0af44 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index cdf8d83a484c..4ae1123c6794 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -15,6 +15,7 @@ * */ #include +#include #include #include #include diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 4c360daa4e24..3c9c39711343 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -174,7 +175,7 @@ out: return err; } -int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size) +static int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size) { void *ptr; loff_t pgoff; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index b136c271125f..03946d5e4741 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -5,7 +5,6 @@ /* glibc 2.20 deprecates _BSD_SOURCE in favour of _DEFAULT_SOURCE */ #define _DEFAULT_SOURCE 1 -#include #include #include #include @@ -35,7 +34,6 @@ bool lsdir_no_dot_filter(const char *name, struct dirent *d); int copyfile(const char *from, const char *to); int copyfile_mode(const char *from, const char *to, mode_t mode); int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi); -int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size); ssize_t readn(int fd, void *buf, size_t n); ssize_t writen(int fd, const void *buf, size_t n); diff --git a/tools/perf/util/zlib.c b/tools/perf/util/zlib.c index 1329d843eb7b..7c1175310a12 100644 --- a/tools/perf/util/zlib.c +++ b/tools/perf/util/zlib.c @@ -1,3 +1,4 @@ +#include #include #include #include From 35c1980eb3d1acb3cac11c38252339399dca77e3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Sep 2017 14:13:24 -0700 Subject: [PATCH 0026/1085] perf stat: Fall weak group back even for EBADF It's not possible to run a package event and a per cpu event in the same group. This is used by some of the power metrics. They work correctly when not using a group. Normally weak groups should handle that, but in this case EBADF is returned instead of the normal EINVAL. $ strace -e perf_event_open ./perf stat -v -e '{cstate_pkg/c2-residency/,msr/tsc/}:W' -a sleep 1 Using CPUID GenuineIntel-6-3E perf_event_open({type=0x17 /* PERF_TYPE_??? */, size=PERF_ATTR_SIZE_VER5, config=0, ...}, -1, 0, -1, PERF_FLAG_FD_CLOEXEC) = -1 EINVAL (Invalid argument) perf_event_open({type=0x17 /* PERF_TYPE_??? */, size=PERF_ATTR_SIZE_VER5, config=0, ...}, -1, 0, -1, 0) = -1 EINVAL (Invalid argument) perf_event_open({type=0x17 /* PERF_TYPE_??? */, size=PERF_ATTR_SIZE_VER5, config=0, ...}, -1, 0, -1, 0) = -1 EINVAL (Invalid argument) perf_event_open({type=0x17 /* PERF_TYPE_??? */, size=PERF_ATTR_SIZE_VER5, config=0, ...}, -1, 0, -1, 0) = -1 EINVAL (Invalid argument) perf_event_open({type=0x17 /* PERF_TYPE_??? */, size=PERF_ATTR_SIZE_VER5, config=0, ...}, -1, 0, -1, 0) = 3 perf_event_open({type=0x7 /* PERF_TYPE_??? */, size=PERF_ATTR_SIZE_VER5, config=0, ...}, -1, 0, 3, 0) = 4 perf_event_open({type=0x7 /* PERF_TYPE_??? */, size=PERF_ATTR_SIZE_VER5, config=0, ...}, -1, 1, 0, 0) = -1 EBADF (Bad file descriptor) and perf errors out. Make weak groups trigger a fall back for EBADF too. 
Then this case works correctly: $ perf stat -v -e '{cstate_pkg/c2-residency/,msr/tsc/}:W' -a sleep 1 Using CPUID GenuineIntel-6-3E Weak group for cstate_pkg/c2-residency//2 failed cstate_pkg/c2-residency/: 476709882 1000598460 1000598460 msr/tsc/: 39625837911 12007369110 12007369110 Performance counter stats for 'system wide': 476,709,882 cstate_pkg/c2-residency/ 39,625,837,911 msr/tsc/ 1.000697588 seconds time elapsed This fixes perf stat -M Power ... $ perf stat -M Power --metric-only -a sleep 1 Performance counter stats for 'system wide': Turbo_Utilization C3_Core_Residency C6_Core_Residency C7_Core_Residency C2_Pkg_Residency C3_Pkg_Residency C6_Pkg_Residency C7_Pkg_Residency 1.0 0.7 30.0 0.0 0.9 0.1 0.4 0.0 1.001240740 seconds time elapsed Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170905211324.32427-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 88f1d5fbdb48..dd525417880a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -656,7 +656,7 @@ try_again: if (create_perf_stat_counter(counter) < 0) { /* Weak group failed. Reset the group. */ - if (errno == EINVAL && + if ((errno == EINVAL || errno == EBADF) && counter->leader != counter && counter->weak_group) { counter = perf_evsel__reset_weak_group(counter); From cf97962308ba6b6afdeb038505032c7c0972bdfa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:53:25 -0700 Subject: [PATCH 0027/1085] perf vendor events: Add JSON metrics for Broadwell Add JSON metrics for Broadwell. Commiter testing: # uname -a Linux jouet 4.13.0-rc7+ #3 SMP Sat Sep 2 09:04:44 -03 2017 x86_64 x86_64 x86_64 GNU/Linux # grep "model name" /proc/cpuinfo | head -1 model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz # perf list metricgroup List of pre-defined events (to be used in -e): Metric Groups: DSB FLOPS Frontend Frontend_Bandwidth Memory_BW Memory_Bound Memory_Lat Pipeline Ports_Utilization Power SMT Summary TLB TopDownL1 Unknown_Branches # perf stat -M Power --metric-only -a sleep 1 Performance counter stats for 'system wide': Turbo_Utilization C3_Core_Residency C6_Core_Residency C7_Core_Residency C2_Pkg_Residency C3_Pkg_Residency C6_Pkg_Residency C7_Pkg_Residency 1.1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.003502904 seconds time elapsed # # perf stat -M Memory_BW --metric-only -a sleep 1 Performance counter stats for 'system wide': MLP 1.7 1.001364525 seconds time elapsed # # perf stat -M TLB --metric-only -a sleep 1 Performance counter stats for 'system wide': Page_Walks_Utilization 0.1 1.005962198 seconds time elapsed # # perf stat -M Summary --metric-only -a sleep 1 Performance counter stats for 'system wide': Instructions CPI CLKS CPU_Utilization GFLOPs SMT_2T_Utilization Kernel_Utilization 7281856697.0 0.0 11150898087.0 1.0 0.0 1.0 0.7 1.012134025 seconds time elapsed # Running in verbose mode shows which counters and expressions are being used: # perf stat -v -M Summary --metric-only -a sleep 1 Using CPUID GenuineIntel-6-3D metric expr 1 / inst_retired.any / cycles for CPI found event inst_retired.any found event cycles metric expr cpu_clk_unhalted.thread for CLKS found event cpu_clk_unhalted.thread metric expr inst_retired.any for Instructions found event inst_retired.any metric expr cpu_clk_unhalted.ref_tsc / msr@tsc@ for CPU_Utilization found event cpu_clk_unhalted.ref_tsc found event msr/tsc/ metric expr ( 1*( 
fp_arith_inst_retired.scalar_single + fp_arith_inst_retired.scalar_double ) + 2* fp_arith_inst_retired.128b_packed_double + 4*( fp_arith_inst_retired.128b_packed_single + fp_arith_inst_retired.256b_packed_double ) + 8* fp_arith_inst_retired.256b_packed_single ) / 1000000000 / duration_time for GFLOPs found event fp_arith_inst_retired.scalar_single found event fp_arith_inst_retired.scalar_double found event fp_arith_inst_retired.128b_packed_double found event fp_arith_inst_retired.128b_packed_single found event fp_arith_inst_retired.256b_packed_double found event fp_arith_inst_retired.256b_packed_single found event duration_time metric expr 1 - cpu_clk_thread_unhalted.one_thread_active / ( cpu_clk_thread_unhalted.ref_xclk_any / 2 ) if #smt_on else 0 for SMT_2T_Utilization found event cpu_clk_thread_unhalted.one_thread_active found event cpu_clk_thread_unhalted.ref_xclk_any metric expr cpu_clk_unhalted.ref_tsc:u / cpu_clk_unhalted.ref_tsc for Kernel_Utilization found event cpu_clk_unhalted.ref_tsc:u found event cpu_clk_unhalted.ref_tsc adding {inst_retired.any,cycles}:W,{cpu_clk_unhalted.thread}:W,{inst_retired.any}:W,{cpu_clk_unhalted.ref_tsc,msr/tsc/}:W,{fp_arith_inst_retired.scalar_single,fp_arith_inst_retired.scalar_double,fp_arith_inst_retired.128b_packed_double,fp_arith_inst_retired.128b_packed_single,fp_arith_inst_retired.256b_packed_double,fp_arith_inst_retired.256b_packed_single,duration_time}:W,{cpu_clk_thread_unhalted.one_thread_active,cpu_clk_thread_unhalted.ref_xclk_any}:W,{cpu_clk_unhalted.ref_tsc:u,cpu_clk_unhalted.ref_tsc}:W inst_retired.any -> cpu/event=0xc0/ cpu_clk_unhalted.thread -> cpu/event=0x3c/ inst_retired.any -> cpu/event=0xc0/ cpu_clk_unhalted.ref_tsc -> cpu/umask=0x3,period=2000003,event=0/ fp_arith_inst_retired.scalar_single -> cpu/umask=0x2,period=2000003,event=0xc7/ fp_arith_inst_retired.scalar_double -> cpu/umask=0x1,period=2000003,event=0xc7/ fp_arith_inst_retired.128b_packed_double -> cpu/umask=0x4,period=2000003,event=0xc7/ fp_arith_inst_retired.128b_packed_single -> cpu/umask=0x8,period=2000003,event=0xc7/ fp_arith_inst_retired.256b_packed_double -> cpu/umask=0x10,period=2000003,event=0xc7/ fp_arith_inst_retired.256b_packed_single -> cpu/umask=0x20,period=2000003,event=0xc7/ cpu_clk_thread_unhalted.one_thread_active -> cpu/umask=0x2,period=2000003,event=0x3c/ cpu_clk_thread_unhalted.ref_xclk_any -> cpu/umask=0x1,any=1,period=2000003,event=0x3c/ cpu_clk_unhalted.ref_tsc -> cpu/umask=0x3,period=2000003,event=0/ cpu_clk_unhalted.ref_tsc -> cpu/umask=0x3,period=2000003,event=0/ Weak group for fp_arith_inst_retired.scalar_single/7 failed Weak group for cpu_clk_unhalted.ref_tsc:u/2 failed inst_retired.any: 8704146437 4026374016 619883741 cycles: 11180800018 4026374016 619883741 cpu_clk_unhalted.thread: 11140030295 4026323772 931621933 inst_retired.any: 8643115117 4026260510 1243595906 cpu_clk_unhalted.ref_tsc: 10201638510 4026184297 1247351077 msr/tsc/: 10378022785 4026184297 1247351077 fp_arith_inst_retired.scalar_single: 134697 4026102728 1559210545 fp_arith_inst_retired.scalar_double: 274339 4026007348 1870014984 fp_arith_inst_retired.128b_packed_double: 1639 4025886054 1866736918 fp_arith_inst_retired.128b_packed_single: 0 4025776614 2175106569 fp_arith_inst_retired.256b_packed_double: 0 4025681734 1235551129 fp_arith_inst_retired.256b_packed_single: 0 4025582962 1232398454 duration_time: 0 4025552913 4025552913 cpu_clk_thread_unhalted.one_thread_active: 10505 4025474649 923893076 cpu_clk_thread_unhalted.ref_xclk_any: 394992110 4025474649 923893076 
cpu_clk_unhalted.ref_tsc:u: 5341421014 4025360315 1231634198 cpu_clk_unhalted.ref_tsc: 10258278508 4025252611 307909362 Performance counter stats for 'system wide': Instructions CPI CLKS CPU_Utilization GFLOPs SMT_2T_Utilization Kernel_Utilization 8704146437.0 0.0 11140030295.0 1.0 0.0 1.0 0.5 1.006783654 seconds time elapsed # Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170905195235.GW2482@two.firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/broadwell/bdw-metrics.json | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json new file mode 100644 index 000000000000..49c5f123d811 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json @@ -0,0 +1,164 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED ) ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": 
"CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 2e006a24127ad88422632f9e5d6a8039a40f01da Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:54:49 -0700 Subject: [PATCH 0028/1085] perf vendor events: Add JSON metrics for Skylake Add JSON metrics for Skylake. 
Committer testing: # grep "model name" /proc/cpuinfo | head -1 model name : Intel(R) Core(TM) i5-7500 CPU @ 3.40GHz # uname -a Linux seventh 4.12.0-rc6+ #1 SMP Fri Jun 30 16:40:55 -03 2017 x86_64 x86_64 x86_64 GNU/Linux # perf stat --metric-only -M Summary -a sleep 1 Performance counter stats for 'system wide': Instructions CPI CLKS CPU_Utilization GFLOPs SMT_2T_Utilization Kernel_Utilization 34021097.0 0.0 119424171.0 0.0 0.0 0.0 0.0 1.001001793 seconds time elapsed # perf list metricgroup List of pre-defined events (to be used in -e): Metric Groups: DSB FLOPS Frontend Frontend_Bandwidth Memory_BW Memory_Bound Memory_Lat Pipeline Ports_Utilization Power SMT Summary TLB TopDownL1 Unknown_Branches # perf stat --metric-only -M Ports_Utilization -a sleep 1 Performance counter stats for 'system wide': ILP 1475828.0 1.000688547 seconds time elapsed # perf stat -v --metric-only -M Ports_Utilization -a sleep 1 Using CPUID GenuineIntel-6-9E metric expr uops_executed.thread / ( uops_executed.core_cycles_ge_1 / 2) if #smt_on else uops_executed.core_cycles_ge_1 for ILP found event uops_executed.thread found event uops_executed.core_cycles_ge_1 adding {uops_executed.thread,uops_executed.core_cycles_ge_1}:W uops_executed.thread -> cpu/umask=0x1,period=2000003,event=0xb1/ uops_executed.core_cycles_ge_1 -> cpu/umask=0x2,period=2000003,cmask=1,event=0xb1/ uops_executed.thread: 8115271 4002547654 4002547654 uops_executed.core_cycles_ge_1: 3282969 4002547654 4002547654 Performance counter stats for 'system wide': ILP 3282969.0 1.000719870 seconds time elapsed # Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170905195235.GW2482@two.firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/skylake/skl-metrics.json | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json new file mode 100644 index 000000000000..411f9411ec9e --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -0,0 +1,164 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE_16B.IFDATA_STALL - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles )", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - 
CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 97dca6715d0a058a6af028a3019432740b4a0011 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:50:34 -0700 Subject: [PATCH 0029/1085] perf vendor events: Add JSON metrics for Sandy Bridge Add JSON metrics for Sandy Bridge. 
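The C-state residency metrics all follow one pattern: a free-running residency counter divided by the TSC over the same window, scaled to a percentage. A minimal sketch, checked against the counts in the verbose committer run below:

def residency_pct(residency_count, tsc_count):
    # "(cstate_core@c6\-residency@ / msr@tsc@) * 100"
    return residency_count / tsc_count * 100

# From the verbose run below: cstate_core/c6-residency/ = 12238830163,
# msr/tsc/ = 12452017806, over a roughly 1 s window
print(round(residency_pct(12238830163, 12452017806), 1))  # -> 98.3, matching the table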
Committer testing: # grep "model name" /proc/cpuinfo | head -1 model name : Intel(R) Core(TM) i5-2400 CPU @ 3.10GHz # perf list metricgroup List of pre-defined events (to be used in -e): Metric Groups: DSB FLOPS Frontend Frontend_Bandwidth Pipeline Ports_Utilization Power SMT Summary TopDownL1 # perf stat -M Power --metric-only -a sleep 1 Performance counter stats for 'system wide': Turbo_Utilization C3_Core_Residency C6_Core_Residency C7_Core_Residency C2_Pkg_Residency C3_Pkg_Residency C6_Pkg_Residency C7_Pkg_Residency 0.8 0.0 98.1 0.0 0.0 0.0 23.4 0.0 1.001153658 seconds time elapsed # perf stat -v -M Power --metric-only -a sleep 1 Using CPUID GenuineIntel-6-2A metric expr cpu_clk_unhalted.thread / cpu_clk_unhalted.ref_tsc for Turbo_Utilization found event cpu_clk_unhalted.thread found event cpu_clk_unhalted.ref_tsc metric expr (cstate_core@c3\-residency@ / msr@tsc@) * 100 for C3_Core_Residency found event cstate_core/c3-residency/ found event msr/tsc/ metric expr (cstate_core@c6\-residency@ / msr@tsc@) * 100 for C6_Core_Residency found event cstate_core/c6-residency/ found event msr/tsc/ metric expr (cstate_core@c7\-residency@ / msr@tsc@) * 100 for C7_Core_Residency found event cstate_core/c7-residency/ found event msr/tsc/ metric expr (cstate_pkg@c2\-residency@ / msr@tsc@) * 100 for C2_Pkg_Residency found event cstate_pkg/c2-residency/ found event msr/tsc/ metric expr (cstate_pkg@c3\-residency@ / msr@tsc@) * 100 for C3_Pkg_Residency found event cstate_pkg/c3-residency/ found event msr/tsc/ metric expr (cstate_pkg@c6\-residency@ / msr@tsc@) * 100 for C6_Pkg_Residency found event cstate_pkg/c6-residency/ found event msr/tsc/ metric expr (cstate_pkg@c7\-residency@ / msr@tsc@) * 100 for C7_Pkg_Residency found event cstate_pkg/c7-residency/ found event msr/tsc/ adding {cpu_clk_unhalted.thread,cpu_clk_unhalted.ref_tsc}:W,{cstate_core/c3-residency/,msr/tsc/}:W,{cstate_core/c6-residency/,msr/tsc/}:W,{cstate_core/c7-residency/,msr/tsc/}:W,{cstate_pkg/c2-residency/,msr/tsc/}:W,{cstate_pkg/c3-residency/,msr/tsc/}:W,{cstate_pkg/c6-residency/,msr/tsc/}:W,{cstate_pkg/c7-residency/,msr/tsc/}:W cpu_clk_unhalted.thread -> cpu/event=0x3c/ cpu_clk_unhalted.ref_tsc -> cpu/umask=0x3,period=2000003,event=0/ Weak group for cstate_pkg/c2-residency//2 failed Weak group for cstate_pkg/c3-residency//2 failed Weak group for cstate_pkg/c6-residency//2 failed Weak group for cstate_pkg/c7-residency//2 failed cpu_clk_unhalted.thread: 5564185 4002833569 4002833569 cpu_clk_unhalted.ref_tsc: 7325424 4002833569 4002833569 cstate_core/c3-residency/: 68293 4003027101 4003027101 msr/tsc/: 12451294472 4003027101 4003027101 cstate_core/c6-residency/: 12238830163 4003260984 4003260984 msr/tsc/: 12452017806 4003260984 4003260984 cstate_core/c7-residency/: 0 4003489648 4003489648 msr/tsc/: 12452725162 4003489648 4003489648 cstate_pkg/c2-residency/: 1830054 1000913138 1000913138 msr/tsc/: 12453441079 4003717513 4003717513 cstate_pkg/c3-residency/: 0 1000973570 1000973570 msr/tsc/: 12454177865 4003954758 4003954758 cstate_pkg/c6-residency/: 2940448859 1001032370 1001032370 msr/tsc/: 12454833890 4004166118 4004166118 cstate_pkg/c7-residency/: 0 1001049818 1001049818 msr/tsc/: 12454919470 4004194204 4004194204 Performance counter stats for 'system wide': Turbo_Utilization C3_Core_Residency C6_Core_Residency C7_Core_Residency C2_Pkg_Residency C3_Pkg_Residency C6_Pkg_Residency C7_Pkg_Residency 0.8 0.0 98.3 0.0 0.0 0.0 23.6 0.0 1.001126519 seconds time elapsed # Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Cc: Jiri 
Olsa Link: http://lkml.kernel.org/r/20170905195235.GW2482@two.firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/sandybridge/snb-metrics.json | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json new file mode 100644 index 000000000000..b35b1c153c8a --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json @@ -0,0 +1,140 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_DISPATCHED.THREAD / ( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + 
"MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 28bc0ddb3a89a09b6f2d4dc97b85e28b2a70db1e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:51:07 -0700 Subject: [PATCH 0030/1085] perf vendor events: Add JSON metrics for Sandy Bridge EP Add JSON metrics for Sandy Bridge EP. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170908180133.GA20128@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/jaketown/jkt-metrics.json | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json new file mode 100644 index 000000000000..b35b1c153c8a --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json @@ -0,0 +1,140 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_DISPATCHED.THREAD / ( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + 
"MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 8853d2de0efe25572d3a7033bbb87c9b1208391e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:51:25 -0700 Subject: [PATCH 0031/1085] perf vendor events: Add JSON metrics for Ivy Bridge Add JSON metrics for Ivy Bridge. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170908180133.GA20128@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/ivybridge/ivb-metrics.json | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json new file mode 100644 index 000000000000..057a832f1a6a --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json @@ -0,0 +1,164 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( 
CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 2099f51d1851c8955ed3406683be0b961c7792cb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:51:47 -0700 Subject: [PATCH 0032/1085] perf vendor events: Add JSON metrics for Haswell Add JSON metrics for Haswell. 
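One caveat on the CPI expression shared by these files: assuming perf's expression parser divides left to right, "1 / INST_RETIRED.ANY / cycles" evaluates as 1 / (INST_RETIRED.ANY * cycles), the reciprocal of a huge number, rather than the intended cycles / INST_RETIRED.ANY. The Skylake committer run earlier in the series shows the symptom: CPI prints as 0.0 even though CLKS / Instructions is about 3.5. A sketch of the difference, with the counts from that run:

# Counts from the Skylake committer run earlier in this series:
instructions = 34021097
cycles = 119424171

cpi_intended = cycles / instructions        # ~3.51 cycles per instruction
cpi_as_written = 1 / instructions / cycles  # ~2.5e-16, displayed as 0.0
print(cpi_intended, cpi_as_written)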
Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170908180133.GA20128@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/haswell/hsw-metrics.json | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json new file mode 100644 index 000000000000..03d894988f0f --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json @@ -0,0 +1,158 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / ( cpu@uops_executed.core\\,cmask\\=1@ / 2)) if #SMT_on else (UOPS_EXECUTED.CORE / cpu@uops_executed.core\\,cmask\\=1@)", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION ) ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" 
+ }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 43fd36a19d501483d5ec243dd28268646f717cde Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:55:16 -0700 Subject: [PATCH 0033/1085] perf vendor events: Add JSON metrics for Ivy Town Add JSON metrics for Ivy Town. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170908180133.GA20128@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/ivytown/ivt-metrics.json | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json new file mode 100644 index 000000000000..057a832f1a6a --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json @@ -0,0 +1,164 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( 
CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 5e49f7321b06428ac68c51096cff9a4a55c3d4d6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:55:39 -0700 Subject: [PATCH 0034/1085] perf vendor events: Add JSON metrics for Haswell EP Add JSON metrics for Haswell EP. 
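Among the memory metrics carried over here, Load_Miss_Real_Latency is worth unpacking: L1D_PEND_MISS.PENDING accumulates one count per cycle per outstanding demand-load miss, so dividing by the number of completed misses (true L1 misses plus loads that hit an in-flight fill buffer) yields the average miss latency in cycles. A small sketch with hypothetical counts, illustrative only:

def load_miss_real_latency(pending_cycles, l1_miss, fb_hit):
    # "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS
    #                            + mem_load_uops_retired.hit_lfb )"
    return pending_cycles / (l1_miss + fb_hit)

# Hypothetical: 12M pending-miss cycles across 400k misses -> 30 cycles each
print(load_miss_real_latency(12_000_000, 300_000, 100_000))  # -> 30.0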
Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170908180133.GA20128@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/haswellx/hsx-metrics.json | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json new file mode 100644 index 000000000000..0bae5eda9fb3 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -0,0 +1,158 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / ( cpu@uops_executed.core\\,cmask\\=1@ / 2)) if #SMT_on else UOPS_EXECUTED.CORE / cpu@uops_executed.core\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION ) ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + 
}, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 6d75abd3e84596933aa2ca91751744271cfec6cb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 21:55:59 -0700 Subject: [PATCH 0035/1085] perf vendor events: Add JSON metrics for Broadwell Server Add JSON metrics for Broadwell Server. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170908180133.GA20128@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/broadwellx/bdx-metrics.json | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json new file mode 100644 index 000000000000..4248b029d199 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -0,0 +1,164 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. 
This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED ) ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED) ) / (2*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / 
CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 69e932139db1d23e978256652dbb179d6ed75204 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 23 Jul 2017 22:00:42 -0700 Subject: [PATCH 0036/1085] perf vendor events: Add JSON metrics for Broadwell DE Add JSON metrics for Broadwell DE Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170908180133.GA20128@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/broadwellde/bdwde-metrics.json | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json new file mode 100644 index 000000000000..49c5f123d811 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json @@ -0,0 +1,164 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + "MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + 
"BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED ) ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + 
"BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 56de5b63ffaff859f75c19aff057ee10f20c6c07 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Sep 2017 16:26:13 -0700 Subject: [PATCH 0037/1085] perf vendor events: Add JSON metrics for Skylake server Add JSON metrics for Skylake server Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20170908180133.GA20128@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/skylakex/skx-metrics.json | 182 ++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json new file mode 100644 index 000000000000..68f12a26c816 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -0,0 +1,182 @@ +[ + { + "BriefDescription": "Instructions Per Cycle (per logical thread)", + "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "TopDownL1", + 
"MetricName": "IPC" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", + "MetricGroup": "Pipeline", + "MetricName": "UPI" + }, + { + "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )", + "MetricGroup": "Frontend", + "MetricName": "IFetch_Line_Utilization" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", + "MetricGroup": "DSB; Frontend_Bandwidth", + "MetricName": "DSB_Coverage" + }, + { + "BriefDescription": "Cycles Per Instruction (threaded)", + "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricGroup": "Pipeline;Summary", + "MetricName": "CPI" + }, + { + "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Summary", + "MetricName": "CLKS" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-core)", + "MetricExpr": "4*cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )", + "MetricGroup": "TopDownL1", + "MetricName": "SLOTS" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary", + "MetricName": "Instructions" + }, + { + "BriefDescription": "Instructions Per Cycle (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )", + "MetricGroup": "SMT", + "MetricName": "CoreIPC" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1", + "MetricGroup": "Pipeline;Ports_Utilization", + "MetricName": "ILP" + }, + { + "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", + "MetricExpr": "( RS_EVENTS.EMPTY_CYCLES - (ICACHE_16B.IFDATA_STALL +2* ICACHE_16B.IFDATA_STALL:c1:e1) - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END", + "MetricGroup": "Unknown_Branches", + "MetricName": "BAClear_Cost" + }, + { + "BriefDescription": "Core actual clocks when any thread is active on the physical core", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if 1 else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )", + "MetricGroup": "SMT", + "MetricName": "CORE_CLKS" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS_PS + MEM_LOAD_RETIRED.FB_HIT_PS )", + "MetricGroup": "Memory_Bound;Memory_Lat", + "MetricName": "Load_Miss_Real_Latency" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( 
L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Memory_Bound;Memory_BW", + "MetricName": "MLP" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) )", + "MetricGroup": "TLB", + "MetricName": "Page_Walks_Utilization" + }, + { + "BriefDescription": "L1 cache miss per kilo instruction for demand loads", + "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS_PS / INST_RETIRED.ANY", + "MetricGroup": "Cache_Misses;", + "MetricName": "L1MPKI" + }, + { + "BriefDescription": "L2 cache miss per kilo instruction for demand loads", + "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS_PS / INST_RETIRED.ANY", + "MetricGroup": "Cache_Misses;", + "MetricName": "L2MPKI" + }, + { + "BriefDescription": "L3 cache miss per kilo instruction for demand loads", + "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS_PS / INST_RETIRED.ANY", + "MetricGroup": "Cache_Misses;", + "MetricName": "L3MPKI" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", + "MetricGroup": "Summary", + "MetricName": "CPU_Utilization" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16* FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1000000000 / duration_time", + "MetricGroup": "FLOPS;Summary", + "MetricName": "GFLOPs" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "Turbo_Utilization" + }, + { + "BriefDescription": "Fraction of cycles where both hardware threads were active", + "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricGroup": "SMT;Summary", + "MetricName": "SMT_2T_Utilization" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Summary", + "MetricName": "Kernel_Utilization" + }, + { + "BriefDescription": "C3 residency percent per core", + "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Core_Residency" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency" + }, + 
{ + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency" + } +] From 91e467bc568f15da2eac688e131010601e889184 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Sun, 10 Sep 2017 19:23:14 -0700 Subject: [PATCH 0038/1085] perf machine: Use hashtable for machine threads To process any event, perf first needs to find the corresponding thread in the machine. The machine maintains an rb tree to store all threads, protected by a rw lock. This is not a problem for current perf, which processes events serially. However, it becomes a scalability bottleneck when processing events in parallel, especially on heavily loaded systems with many threads. Introduce a hashtable to divide the big rb tree into many small rb trees for threads. The index is thread id % hashtable size. This reduces the lock contention. Committer notes: Renamed some variables and function names to reduce semantic confusion: 'struct threads' pointers: thread -> threads threads hashtable index: tid -> hash_bucket struct threads *machine__thread() -> machine__threads() Cast tid to (unsigned int) to handle -1 in machine__threads() (Kan Liang) Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Lukasz Odzioba Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1505096603-215017-2-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 23 +++--- tools/perf/util/machine.c | 136 ++++++++++++++++++++++-------------- tools/perf/util/machine.h | 23 ++++-- tools/perf/util/rb_resort.h | 5 +- 4 files changed, 119 insertions(+), 68 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 771ddab94bb0..ee8c6e814437 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2730,20 +2730,23 @@ DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_event static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) { - DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host); size_t printed = trace__fprintf_threads_header(fp); struct rb_node *nd; + int i; - if (threads == NULL) { - fprintf(fp, "%s", "Error sorting output by nr_events!\n"); - return 0; + for (i = 0; i < THREADS__TABLE_SIZE; i++) { + DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i); + + if (threads == NULL) { + fprintf(fp, "%s", "Error sorting output by nr_events!\n"); + return 0; + } + + resort_rb__for_each_entry(nd, threads) + printed += trace__fprintf_thread(fp, threads_entry->thread, trace); + + resort_rb__delete(threads); } - - resort_rb__for_each_entry(nd, threads) - printed += trace__fprintf_thread(fp, threads_entry->thread, trace); - - resort_rb__delete(threads); - return printed; } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index df709363ef69..f4f926753209 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -33,6 +33,20 @@ static void dsos__init(struct dsos *dsos) pthread_rwlock_init(&dsos->lock, NULL); } +static
void machine__threads_init(struct machine *machine) +{ + int i; + + for (i = 0; i < THREADS__TABLE_SIZE; i++) { + struct threads *threads = &machine->threads[i]; + threads->entries = RB_ROOT; + pthread_rwlock_init(&threads->lock, NULL); + threads->nr = 0; + INIT_LIST_HEAD(&threads->dead); + threads->last_match = NULL; + } +} + int machine__init(struct machine *machine, const char *root_dir, pid_t pid) { memset(machine, 0, sizeof(*machine)); @@ -40,11 +54,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid) RB_CLEAR_NODE(&machine->rb_node); dsos__init(&machine->dsos); - machine->threads = RB_ROOT; - pthread_rwlock_init(&machine->threads_lock, NULL); - machine->nr_threads = 0; - INIT_LIST_HEAD(&machine->dead_threads); - machine->last_match = NULL; + machine__threads_init(machine); machine->vdso_info = NULL; machine->env = NULL; @@ -141,27 +151,37 @@ static void dsos__exit(struct dsos *dsos) void machine__delete_threads(struct machine *machine) { struct rb_node *nd; + int i; - pthread_rwlock_wrlock(&machine->threads_lock); - nd = rb_first(&machine->threads); - while (nd) { - struct thread *t = rb_entry(nd, struct thread, rb_node); + for (i = 0; i < THREADS__TABLE_SIZE; i++) { + struct threads *threads = &machine->threads[i]; + pthread_rwlock_wrlock(&threads->lock); + nd = rb_first(&threads->entries); + while (nd) { + struct thread *t = rb_entry(nd, struct thread, rb_node); - nd = rb_next(nd); - __machine__remove_thread(machine, t, false); + nd = rb_next(nd); + __machine__remove_thread(machine, t, false); + } + pthread_rwlock_unlock(&threads->lock); } - pthread_rwlock_unlock(&machine->threads_lock); } void machine__exit(struct machine *machine) { + int i; + machine__destroy_kernel_maps(machine); map_groups__exit(&machine->kmaps); dsos__exit(&machine->dsos); machine__exit_vdso(machine); zfree(&machine->root_dir); zfree(&machine->current_tid); - pthread_rwlock_destroy(&machine->threads_lock); + + for (i = 0; i < THREADS__TABLE_SIZE; i++) { + struct threads *threads = &machine->threads[i]; + pthread_rwlock_destroy(&threads->lock); + } } void machine__delete(struct machine *machine) @@ -382,7 +402,8 @@ static struct thread *____machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid, bool create) { - struct rb_node **p = &machine->threads.rb_node; + struct threads *threads = machine__threads(machine, tid); + struct rb_node **p = &threads->entries.rb_node; struct rb_node *parent = NULL; struct thread *th; @@ -391,14 +412,14 @@ static struct thread *____machine__findnew_thread(struct machine *machine, * so most of the time we dont have to look up * the full rbtree: */ - th = machine->last_match; + th = threads->last_match; if (th != NULL) { if (th->tid == tid) { machine__update_thread_pid(machine, th, pid); return thread__get(th); } - machine->last_match = NULL; + threads->last_match = NULL; } while (*p != NULL) { @@ -406,7 +427,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, th = rb_entry(parent, struct thread, rb_node); if (th->tid == tid) { - machine->last_match = th; + threads->last_match = th; machine__update_thread_pid(machine, th, pid); return thread__get(th); } @@ -423,7 +444,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, th = thread__new(pid, tid); if (th != NULL) { rb_link_node(&th->rb_node, parent, p); - rb_insert_color(&th->rb_node, &machine->threads); + rb_insert_color(&th->rb_node, &threads->entries); /* * We have to initialize map_groups separately @@ -434,7 +455,7 @@ static 
struct thread *____machine__findnew_thread(struct machine *machine, * leader and that would screwed the rb tree. */ if (thread__init_map_groups(th, machine)) { - rb_erase_init(&th->rb_node, &machine->threads); + rb_erase_init(&th->rb_node, &threads->entries); RB_CLEAR_NODE(&th->rb_node); thread__put(th); return NULL; @@ -443,8 +464,8 @@ static struct thread *____machine__findnew_thread(struct machine *machine, * It is now in the rbtree, get a ref */ thread__get(th); - machine->last_match = th; - ++machine->nr_threads; + threads->last_match = th; + ++threads->nr; } return th; @@ -458,21 +479,24 @@ struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid) { + struct threads *threads = machine__threads(machine, tid); struct thread *th; - pthread_rwlock_wrlock(&machine->threads_lock); + pthread_rwlock_wrlock(&threads->lock); th = __machine__findnew_thread(machine, pid, tid); - pthread_rwlock_unlock(&machine->threads_lock); + pthread_rwlock_unlock(&threads->lock); return th; } struct thread *machine__find_thread(struct machine *machine, pid_t pid, pid_t tid) { + struct threads *threads = machine__threads(machine, tid); struct thread *th; - pthread_rwlock_rdlock(&machine->threads_lock); + + pthread_rwlock_rdlock(&threads->lock); th = ____machine__findnew_thread(machine, pid, tid, false); - pthread_rwlock_unlock(&machine->threads_lock); + pthread_rwlock_unlock(&threads->lock); return th; } @@ -719,21 +743,24 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) size_t machine__fprintf(struct machine *machine, FILE *fp) { - size_t ret; struct rb_node *nd; + size_t ret; + int i; - pthread_rwlock_rdlock(&machine->threads_lock); + for (i = 0; i < THREADS__TABLE_SIZE; i++) { + struct threads *threads = &machine->threads[i]; + pthread_rwlock_rdlock(&threads->lock); - ret = fprintf(fp, "Threads: %u\n", machine->nr_threads); + ret = fprintf(fp, "Threads: %u\n", threads->nr); - for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { - struct thread *pos = rb_entry(nd, struct thread, rb_node); + for (nd = rb_first(&threads->entries); nd; nd = rb_next(nd)) { + struct thread *pos = rb_entry(nd, struct thread, rb_node); - ret += thread__fprintf(pos, fp); + ret += thread__fprintf(pos, fp); + } + + pthread_rwlock_unlock(&threads->lock); } - - pthread_rwlock_unlock(&machine->threads_lock); - return ret; } @@ -1479,23 +1506,25 @@ out_problem: static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock) { - if (machine->last_match == th) - machine->last_match = NULL; + struct threads *threads = machine__threads(machine, th->tid); + + if (threads->last_match == th) + threads->last_match = NULL; BUG_ON(refcount_read(&th->refcnt) == 0); if (lock) - pthread_rwlock_wrlock(&machine->threads_lock); - rb_erase_init(&th->rb_node, &machine->threads); + pthread_rwlock_wrlock(&threads->lock); + rb_erase_init(&th->rb_node, &threads->entries); RB_CLEAR_NODE(&th->rb_node); - --machine->nr_threads; + --threads->nr; /* * Move it first to the dead_threads list, then drop the reference, * if this is the last reference, then the thread__delete destructor * will be called and we will remove it from the dead_threads list. 
*/ - list_add_tail(&th->node, &machine->dead_threads); + list_add_tail(&th->node, &threads->dead); if (lock) - pthread_rwlock_unlock(&machine->threads_lock); + pthread_rwlock_unlock(&threads->lock); thread__put(th); } @@ -2140,21 +2169,26 @@ int machine__for_each_thread(struct machine *machine, int (*fn)(struct thread *thread, void *p), void *priv) { + struct threads *threads; struct rb_node *nd; struct thread *thread; int rc = 0; + int i; - for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { - thread = rb_entry(nd, struct thread, rb_node); - rc = fn(thread, priv); - if (rc != 0) - return rc; - } + for (i = 0; i < THREADS__TABLE_SIZE; i++) { + threads = &machine->threads[i]; + for (nd = rb_first(&threads->entries); nd; nd = rb_next(nd)) { + thread = rb_entry(nd, struct thread, rb_node); + rc = fn(thread, priv); + if (rc != 0) + return rc; + } - list_for_each_entry(thread, &machine->dead_threads, node) { - rc = fn(thread, priv); - if (rc != 0) - return rc; + list_for_each_entry(thread, &threads->dead, node) { + rc = fn(thread, priv); + if (rc != 0) + return rc; + } } return rc; } diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 3cdb1340f917..fe2f05848050 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -23,6 +23,17 @@ extern const char *ref_reloc_sym_names[]; struct vdso_info; +#define THREADS__TABLE_BITS 8 +#define THREADS__TABLE_SIZE (1 << THREADS__TABLE_BITS) + +struct threads { + struct rb_root entries; + pthread_rwlock_t lock; + unsigned int nr; + struct list_head dead; + struct thread *last_match; +}; + struct machine { struct rb_node rb_node; pid_t pid; @@ -30,11 +41,7 @@ struct machine { bool comm_exec; bool kptr_restrict_warned; char *root_dir; - struct rb_root threads; - pthread_rwlock_t threads_lock; - unsigned int nr_threads; - struct list_head dead_threads; - struct thread *last_match; + struct threads threads[THREADS__TABLE_SIZE]; struct vdso_info *vdso_info; struct perf_env *env; struct dsos dsos; @@ -48,6 +55,12 @@ struct machine { }; }; +static inline struct threads *machine__threads(struct machine *machine, pid_t tid) +{ + /* Cast it to handle tid == -1 */ + return &machine->threads[(unsigned int)tid % THREADS__TABLE_SIZE]; +} + static inline struct map *__machine__kernel_map(struct machine *machine, enum map_type type) { diff --git a/tools/perf/util/rb_resort.h b/tools/perf/util/rb_resort.h index 808cc45611fe..b30746f5f613 100644 --- a/tools/perf/util/rb_resort.h +++ b/tools/perf/util/rb_resort.h @@ -143,7 +143,8 @@ struct __name##_sorted *__name = __name##_sorted__new __ilist->rblist.nr_entries) /* For 'struct machine->threads' */ -#define DECLARE_RESORT_RB_MACHINE_THREADS(__name, __machine) \ - DECLARE_RESORT_RB(__name)(&__machine->threads, __machine->nr_threads) +#define DECLARE_RESORT_RB_MACHINE_THREADS(__name, __machine, hash_bucket) \ + DECLARE_RESORT_RB(__name)(&__machine->threads[hash_bucket].entries, \ + __machine->threads[hash_bucket].nr) #endif /* _PERF_RESORT_RB_H_ */ From 75e45e432052c7b1a5da866cff88192db8be1445 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Sep 2017 16:16:34 -0300 Subject: [PATCH 0039/1085] perf machine: Optimize a bit the machine__findnew_thread() methods In some cases we already have calculated the hash bucket, so reuse it. 
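The optimization is easy to see in isolation: the bucket is derived from the tid by a modulo on every lookup, so a caller that has already computed it can simply pass it down. Below is a self-contained sketch of the pattern; machine__threads() mirrors the helper the previous patch added to machine.h, while the surrounding scaffolding is invented here purely for illustration:

    #include <stdio.h>

    #define THREADS__TABLE_BITS 8
    #define THREADS__TABLE_SIZE (1 << THREADS__TABLE_BITS)

    struct threads { unsigned int nr; };
    struct machine { struct threads threads[THREADS__TABLE_SIZE]; };

    /* The cast to unsigned int keeps tid == -1 inside the table. */
    static struct threads *machine__threads(struct machine *machine, int tid)
    {
            return &machine->threads[(unsigned int)tid % THREADS__TABLE_SIZE];
    }

    /* After the optimization, the inner lookup takes the bucket the
     * caller already holds instead of recomputing it from tid. */
    static struct threads *findnew_thread(struct machine *machine,
                                          struct threads *threads, int tid)
    {
            (void)machine; (void)tid;
            threads->nr++; /* stand-in for the real rb-tree insert */
            return threads;
    }

    int main(void)
    {
            struct machine m = { 0 };
            int tid = -1;
            struct threads *t = machine__threads(&m, tid); /* computed once */

            findnew_thread(&m, t, tid);                    /* ...reused here */
            printf("bucket nr = %u\n", t->nr);
            return 0;
    }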
Cc: Adrian Hunter Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Kan Liang Cc: Lukasz Odzioba Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-800zehjsyy03er4s4jf0e99v@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f4f926753209..ddeea05eae86 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -399,10 +399,10 @@ out_err: * lookup/new thread inserted. */ static struct thread *____machine__findnew_thread(struct machine *machine, + struct threads *threads, pid_t pid, pid_t tid, bool create) { - struct threads *threads = machine__threads(machine, tid); struct rb_node **p = &threads->entries.rb_node; struct rb_node *parent = NULL; struct thread *th; @@ -473,7 +473,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid) { - return ____machine__findnew_thread(machine, pid, tid, true); + return ____machine__findnew_thread(machine, machine__threads(machine, tid), pid, tid, true); } struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, @@ -495,7 +495,7 @@ struct thread *machine__find_thread(struct machine *machine, pid_t pid, struct thread *th; pthread_rwlock_rdlock(&threads->lock); - th = ____machine__findnew_thread(machine, pid, tid, false); + th = ____machine__findnew_thread(machine, threads, pid, tid, false); pthread_rwlock_unlock(&threads->lock); return th; } From 333b566559019b146905c623bde7f455c1d5add3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 13 Sep 2017 14:50:06 -0700 Subject: [PATCH 0040/1085] perf pmu: Improve error messages for missing PMUs When a PMU is missing print a better error message mentioning the missing PMU. % mkdir empty % mount --bind empty /sys/devices/msr % perf stat -M Summary true event syntax error: '{inst_retired.any,cycles}:W,{cpu_clk_unhalted.thread}:W,{inst_retired.any}:W,{cpu_clk_unhalted.ref_tsc,msr/tsc/}:W,{fp_comp_ops_exe.sse_scalar..' \___ Cannot find PMU `msr'. Missing kernel support? It still cannot find the right column for aliases, but it's already a vast improvement. 
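The "v2: Check asprintf" note below is the subtle part of this change: on failure, asprintf() leaves the target pointer undefined, so the error string has to be reset before anyone prints or frees it. A minimal standalone sketch of that pattern, with the struct and the PMU lookup reduced to stand-ins rather than the actual perf sources:

    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct parse_events_error { char *str; };

    static int report_missing_pmu(struct parse_events_error *err,
                                  const char *name)
    {
            /* On failure asprintf() leaves err->str undefined, so NULL
             * it explicitly; callers can then test/free it safely. */
            if (asprintf(&err->str,
                         "Cannot find PMU `%s'. Missing kernel support?",
                         name) < 0)
                    err->str = NULL;
            return -EINVAL;
    }

    int main(void)
    {
            struct parse_events_error err = { NULL };

            report_missing_pmu(&err, "msr");
            if (err.str)
                    fprintf(stderr, "%s\n", err.str);
            free(err.str); /* free(NULL) is a no-op */
            return 0;
    }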
v2: Check asprintf Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170913215006.32222-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 2 +- tools/perf/util/parse-events.c | 18 ++++++++++++------ tools/perf/util/parse-events.h | 3 +++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 2d60114f1870..fa37ef79517a 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -477,7 +477,7 @@ int metricgroup__parse_groups(const struct option *opt, memset(&parse_error, 0, sizeof(struct parse_events_error)); ret = parse_events(perf_evlist, extra_events.buf, &parse_error); if (ret) { - pr_err("Cannot set up events %s\n", extra_events.buf); + parse_events_print_error(&parse_error, extra_events.buf); goto out; } strbuf_release(&extra_events); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 75588920fccc..9183913a6174 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1219,11 +1219,17 @@ static int __parse_events_add_pmu(struct parse_events_state *parse_state, struct perf_pmu_info info; struct perf_pmu *pmu; struct perf_evsel *evsel; + struct parse_events_error *err = parse_state->error; LIST_HEAD(config_terms); pmu = perf_pmu__find(name); - if (!pmu) + if (!pmu) { + if (asprintf(&err->str, + "Cannot find PMU `%s'. Missing kernel support?", + name) < 0) + err->str = NULL; return -EINVAL; + } if (pmu->default_config) { memcpy(&attr, pmu->default_config, @@ -1733,8 +1739,8 @@ static int get_term_width(void) return ws.ws_col > MAX_WIDTH ? MAX_WIDTH : ws.ws_col; } -static void parse_events_print_error(struct parse_events_error *err, - const char *event) +void parse_events_print_error(struct parse_events_error *err, + const char *event) { const char *str = "invalid or unsupported event: "; char _buf[MAX_WIDTH]; @@ -1789,8 +1795,6 @@ static void parse_events_print_error(struct parse_events_error *err, zfree(&err->str); zfree(&err->help); } - - fprintf(stderr, "Run 'perf list' for a list of valid events\n"); } #undef MAX_WIDTH @@ -1802,8 +1806,10 @@ int parse_events_option(const struct option *opt, const char *str, struct parse_events_error err = { .idx = 0, }; int ret = parse_events(evlist, str, &err); - if (ret) + if (ret) { parse_events_print_error(&err, str); + fprintf(stderr, "Run 'perf list' for a list of valid events\n"); + } return ret; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 635135125111..3909ca0639f2 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -202,6 +202,9 @@ int is_valid_tracepoint(const char *event_string); int valid_event_mount(const char *eventfs); char *parse_events_formats_error_string(char *additional_terms); +void parse_events_print_error(struct parse_events_error *err, + const char *event); + #ifdef HAVE_LIBELF_SUPPORT /* * If the probe point starts with '%', From c896f85a7c15ab9d040ffac8b8003e47996602a2 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Wed, 13 Sep 2017 21:14:19 +0200 Subject: [PATCH 0041/1085] perf tools: Fix leaking rec_argv in error cases Let's free the allocated rec_argv in case we return early, in order to avoid leaking memory. This adds free() at a few very similar places across the tree where it was missing. 
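All four hunks below fix the same shape of bug: rec_argv is heap-allocated up front, and an error path returns before ownership of the array is handed to anything that could release it. A minimal standalone sketch of the leak and the fix; the event check and argv layout are invented for illustration:

    #include <stdlib.h>
    #include <string.h>

    static int event_supported(const char *name)
    {
            return strcmp(name, "unsupported") != 0; /* stand-in check */
    }

    static int build_record_args(const char *event, const char ***out)
    {
            const char **rec_argv = calloc(4, sizeof(char *));

            if (rec_argv == NULL)
                    return -1;

            if (!event_supported(event)) {
                    free(rec_argv); /* the fix: release before bailing out */
                    return -1;
            }

            rec_argv[0] = "record";
            rec_argv[1] = event;
            *out = rec_argv; /* ownership passes to the caller */
            return 0;
    }

    int main(void)
    {
            const char **argv;

            if (build_record_args("cycles", &argv) == 0)
                    free(argv);
            return 0;
    }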
Signed-off-by: Martin Kepplinger Cc: Alexander Shishkin Cc: Martin kepplinger Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170913191419.29806-1-martink@posteo.de Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 1 + tools/perf/builtin-mem.c | 1 + tools/perf/builtin-timechart.c | 4 +++- tools/perf/builtin-trace.c | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 475999e48f66..bb1ee22bd221 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2732,6 +2732,7 @@ static int perf_c2c__record(int argc, const char **argv) if (!perf_mem_events[j].supported) { pr_err("failed: event '%s' not supported\n", perf_mem_events[j].name); + free(rec_argv); return -1; } diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 0f15634ef82c..6940490bc3f9 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -112,6 +112,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) if (!perf_mem_events[j].supported) { pr_err("failed: event '%s' not supported\n", perf_mem_events__name(j)); + free(rec_argv); return -1; } diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 4e2e61695986..01de01ca14f2 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -1732,8 +1732,10 @@ static int timechart__io_record(int argc, const char **argv) if (rec_argv == NULL) return -ENOMEM; - if (asprintf(&filter, "common_pid != %d", getpid()) < 0) + if (asprintf(&filter, "common_pid != %d", getpid()) < 0) { + free(rec_argv); return -ENOMEM; + } p = rec_argv; for (i = 0; i < common_args_nr; i++) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index ee8c6e814437..967bd351b58d 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2078,6 +2078,7 @@ static int trace__record(struct trace *trace, int argc, const char **argv) rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit"; else { pr_err("Neither raw_syscalls nor syscalls events exist.\n"); + free(rec_argv); return -1; } } From 2f329595b8db42ceb3de93595a740ac7b24eddba Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 15 Sep 2017 15:29:15 +0800 Subject: [PATCH 0042/1085] spi: Add Spreadtrum ADI controller documentation This patch adds the binding documentation for the Spreadtrum ADI controller device. Signed-off-by: Baolin Wang Signed-off-by: Mark Brown --- .../devicetree/bindings/spi/spi-sprd-adi.txt | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 Documentation/devicetree/bindings/spi/spi-sprd-adi.txt diff --git a/Documentation/devicetree/bindings/spi/spi-sprd-adi.txt b/Documentation/devicetree/bindings/spi/spi-sprd-adi.txt new file mode 100644 index 000000000000..8de589b376ce --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-sprd-adi.txt @@ -0,0 +1,58 @@ +Spreadtrum ADI controller + +ADI is the abbreviation of Analog-Digital interface, which is used to access +an analog chip (such as a PMIC) from a digital chip. The ADI controller follows +the SPI framework because its hardware implementation is similar to an SPI bus +and its timing is compatible with SPI timing. + +The ADI controller has 50 channels, including 2 software read/write channels and +48 hardware channels, to access the analog chip. For the 2 software read/write +channels, users should set ADI registers to access the analog chip.
For the hardware channels, +we can configure them so that other hardware components can use them independently, +which means we can link one analog chip address to one hardware channel; users can +then access the mapped analog chip address through that hardware channel, triggered +by hardware components instead of by the ADI software channels. + +Thus we introduce one property named "sprd,hw-channels" to configure the hardware +channels: the first value specifies the hardware channel id, which is used to +transfer data automatically when triggered by hardware, and the second value +specifies the analog chip address that users want to access through hardware +components. + +Since multiple subsystems use the single ADI to access the analog chip, any +reading/writing of data through the ADI software channels must be protected by one +hardware spinlock, to prevent other subsystems from reading/writing data through +the software channels at the same time; otherwise two parallel routines setting the +ADI registers would corrupt the controller registers and lead to incorrect results. +We therefore need one hardware spinlock to synchronize the multiple subsystems. + +Required properties: +- compatible: Should be "sprd,sc9860-adi". +- reg: Offset and length of ADI-SPI controller register space. +- hwlocks: Reference to a phandle of a hwlock provider node. +- hwlock-names: Reference to hwlock name strings defined in the same order + as the hwlocks, should be "adi". +- #address-cells: Number of cells required to define a chip select address + on the ADI-SPI bus. Should be set to 1. +- #size-cells: Size of cells required to define a chip select address size + on the ADI-SPI bus. Should be set to 0. + +Optional properties: +- sprd,hw-channels: This is an array of channel values, up to 49 channels. + The first value specifies the hardware channel id, which is used to + transfer data automatically when triggered by hardware, and the second + value specifies the analog chip address that users want to access + through hardware components. + +SPI slave nodes must be children of the SPI controller node and can contain +properties described in Documentation/devicetree/bindings/spi/spi-bus.txt. + +Example: + adi_bus: spi@40030000 { + compatible = "sprd,sc9860-adi"; + reg = <0 0x40030000 0 0x10000>; + hwlocks = <&hwlock1 0>; + hwlock-names = "adi"; + #address-cells = <1>; + #size-cells = <0>; + sprd,hw-channels = <30 0x8c20>; + }; From 7e2903cb91df1a8b0a202a7cf5c9e3d2f654bc57 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 15 Sep 2017 15:29:16 +0800 Subject: [PATCH 0043/1085] spi: Add ADI driver for Spreadtrum platform This patch adds an ADI driver based on the SPI framework for the Spreadtrum SC9860 platform. Signed-off-by: Baolin Wang Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 6 + drivers/spi/Makefile | 1 + drivers/spi/spi-sprd-adi.c | 419 +++++++++++++++++++++++++++++++++++++ 3 files changed, 426 insertions(+) create mode 100644 drivers/spi/spi-sprd-adi.c diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index a75f2a2cf780..0c38a5bfcd74 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -626,6 +626,12 @@ config SPI_SIRF help SPI driver for CSR SiRFprimaII SoCs +config SPI_SPRD_ADI + tristate "Spreadtrum ADI controller" + depends on ARCH_SPRD || COMPILE_TEST + help + ADI driver based on SPI for Spreadtrum SoCs.
+ config SPI_STM32 tristate "STMicroelectronics STM32 SPI controller" depends on ARCH_STM32 || COMPILE_TEST diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index a3ae2b70cdc3..de5ae2af75f9 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -90,6 +90,7 @@ obj-$(CONFIG_SPI_SH_HSPI) += spi-sh-hspi.o obj-$(CONFIG_SPI_SH_MSIOF) += spi-sh-msiof.o obj-$(CONFIG_SPI_SH_SCI) += spi-sh-sci.o obj-$(CONFIG_SPI_SIRF) += spi-sirf.o +obj-$(CONFIG_SPI_SPRD_ADI) += spi-sprd-adi.o obj-$(CONFIG_SPI_STM32) += spi-stm32.o obj-$(CONFIG_SPI_ST_SSC4) += spi-st-ssc4.o obj-$(CONFIG_SPI_SUN4I) += spi-sun4i.o diff --git a/drivers/spi/spi-sprd-adi.c b/drivers/spi/spi-sprd-adi.c new file mode 100644 index 000000000000..0d481f8a46c2 --- /dev/null +++ b/drivers/spi/spi-sprd-adi.c @@ -0,0 +1,419 @@ +/* + * Copyright (C) 2017 Spreadtrum Communications Inc. + * + * SPDX-License-Identifier: GPL-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Registers definitions for ADI controller */ +#define REG_ADI_CTRL0 0x4 +#define REG_ADI_CHN_PRIL 0x8 +#define REG_ADI_CHN_PRIH 0xc +#define REG_ADI_INT_EN 0x10 +#define REG_ADI_INT_RAW 0x14 +#define REG_ADI_INT_MASK 0x18 +#define REG_ADI_INT_CLR 0x1c +#define REG_ADI_GSSI_CFG0 0x20 +#define REG_ADI_GSSI_CFG1 0x24 +#define REG_ADI_RD_CMD 0x28 +#define REG_ADI_RD_DATA 0x2c +#define REG_ADI_ARM_FIFO_STS 0x30 +#define REG_ADI_STS 0x34 +#define REG_ADI_EVT_FIFO_STS 0x38 +#define REG_ADI_ARM_CMD_STS 0x3c +#define REG_ADI_CHN_EN 0x40 +#define REG_ADI_CHN_ADDR(id) (0x44 + (id - 2) * 4) +#define REG_ADI_CHN_EN1 0x20c + +/* Bits definitions for register REG_ADI_GSSI_CFG0 */ +#define BIT_CLK_ALL_ON BIT(30) + +/* Bits definitions for register REG_ADI_RD_DATA */ +#define BIT_RD_CMD_BUSY BIT(31) +#define RD_ADDR_SHIFT 16 +#define RD_VALUE_MASK GENMASK(15, 0) +#define RD_ADDR_MASK GENMASK(30, 16) + +/* Bits definitions for register REG_ADI_ARM_FIFO_STS */ +#define BIT_FIFO_FULL BIT(11) +#define BIT_FIFO_EMPTY BIT(10) + +/* + * ADI slave devices include RTC, ADC, regulator, charger, thermal and so on. + * The slave devices address offset is always 0x8000 and size is 4K. + */ +#define ADI_SLAVE_ADDR_SIZE SZ_4K +#define ADI_SLAVE_OFFSET 0x8000 + +/* Timeout (ms) for the trylock of hardware spinlocks */ +#define ADI_HWSPINLOCK_TIMEOUT 5000 +/* + * ADI controller has 50 channels including 2 software channels + * and 48 hardware channels. 
+ */ +#define ADI_HW_CHNS 50 + +#define ADI_FIFO_DRAIN_TIMEOUT 1000 +#define ADI_READ_TIMEOUT 2000 +#define REG_ADDR_LOW_MASK GENMASK(11, 0) + +struct sprd_adi { + struct spi_controller *ctlr; + struct device *dev; + void __iomem *base; + struct hwspinlock *hwlock; + unsigned long slave_vbase; + unsigned long slave_pbase; +}; + +static int sprd_adi_check_paddr(struct sprd_adi *sadi, u32 paddr) +{ + if (paddr < sadi->slave_pbase || paddr > + (sadi->slave_pbase + ADI_SLAVE_ADDR_SIZE)) { + dev_err(sadi->dev, + "slave physical address is incorrect, addr = 0x%x\n", + paddr); + return -EINVAL; + } + + return 0; +} + +static unsigned long sprd_adi_to_vaddr(struct sprd_adi *sadi, u32 paddr) +{ + return (paddr - sadi->slave_pbase + sadi->slave_vbase); +} + +static int sprd_adi_drain_fifo(struct sprd_adi *sadi) +{ + u32 timeout = ADI_FIFO_DRAIN_TIMEOUT; + u32 sts; + + do { + sts = readl_relaxed(sadi->base + REG_ADI_ARM_FIFO_STS); + if (sts & BIT_FIFO_EMPTY) + break; + + cpu_relax(); + } while (--timeout); + + if (timeout == 0) { + dev_err(sadi->dev, "drain write fifo timeout\n"); + return -EBUSY; + } + + return 0; +} + +static int sprd_adi_fifo_is_full(struct sprd_adi *sadi) +{ + return readl_relaxed(sadi->base + REG_ADI_ARM_FIFO_STS) & BIT_FIFO_FULL; +} + +static int sprd_adi_read(struct sprd_adi *sadi, u32 reg_paddr, u32 *read_val) +{ + int read_timeout = ADI_READ_TIMEOUT; + u32 val, rd_addr; + + /* + * Set the physical register address need to read into RD_CMD register, + * then ADI controller will start to transfer automatically. + */ + writel_relaxed(reg_paddr, sadi->base + REG_ADI_RD_CMD); + + /* + * Wait read operation complete, the BIT_RD_CMD_BUSY will be set + * simultaneously when writing read command to register, and the + * BIT_RD_CMD_BUSY will be cleared after the read operation is + * completed. + */ + do { + val = readl_relaxed(sadi->base + REG_ADI_RD_DATA); + if (!(val & BIT_RD_CMD_BUSY)) + break; + + cpu_relax(); + } while (--read_timeout); + + if (read_timeout == 0) { + dev_err(sadi->dev, "ADI read timeout\n"); + return -EBUSY; + } + + /* + * The return value includes data and read register address, from bit 0 + * to bit 15 are data, and from bit 16 to bit 30 are read register + * address. Then we can check the returned register address to validate + * data. + */ + rd_addr = (val & RD_ADDR_MASK ) >> RD_ADDR_SHIFT; + + if (rd_addr != (reg_paddr & REG_ADDR_LOW_MASK)) { + dev_err(sadi->dev, "read error, reg addr = 0x%x, val = 0x%x\n", + reg_paddr, val); + return -EIO; + } + + *read_val = val & RD_VALUE_MASK; + return 0; +} + +static int sprd_adi_write(struct sprd_adi *sadi, unsigned long reg, u32 val) +{ + u32 timeout = ADI_FIFO_DRAIN_TIMEOUT; + int ret; + + ret = sprd_adi_drain_fifo(sadi); + if (ret < 0) + return ret; + + /* + * we should wait for write fifo is empty before writing data to PMIC + * registers. 
+ */ + do { + if (!sprd_adi_fifo_is_full(sadi)) { + writel_relaxed(val, (void __iomem *)reg); + break; + } + + cpu_relax(); + } while (--timeout); + + if (timeout == 0) { + dev_err(sadi->dev, "write fifo is full\n"); + return -EBUSY; + } + + return 0; +} + +static int sprd_adi_transfer_one(struct spi_controller *ctlr, + struct spi_device *spi_dev, + struct spi_transfer *t) +{ + struct sprd_adi *sadi = spi_controller_get_devdata(ctlr); + unsigned long flags, virt_reg; + u32 phy_reg, val; + int ret; + + if (t->rx_buf) { + phy_reg = *(u32 *)t->rx_buf + sadi->slave_pbase; + + ret = sprd_adi_check_paddr(sadi, phy_reg); + if (ret) + return ret; + + ret = hwspin_lock_timeout_irqsave(sadi->hwlock, + ADI_HWSPINLOCK_TIMEOUT, + &flags); + if (ret) { + dev_err(sadi->dev, "get the hw lock failed\n"); + return ret; + } + + ret = sprd_adi_read(sadi, phy_reg, &val); + hwspin_unlock_irqrestore(sadi->hwlock, &flags); + if (ret) + return ret; + + *(u32 *)t->rx_buf = val; + } else if (t->tx_buf) { + u32 *p = (u32 *)t->tx_buf; + + /* + * Get the physical register address need to write and convert + * the physical address to virtual address. Since we need + * virtual register address to write. + */ + phy_reg = *p++ + sadi->slave_pbase; + ret = sprd_adi_check_paddr(sadi, phy_reg); + if (ret) + return ret; + + virt_reg = sprd_adi_to_vaddr(sadi, phy_reg); + val = *p; + + ret = hwspin_lock_timeout_irqsave(sadi->hwlock, + ADI_HWSPINLOCK_TIMEOUT, + &flags); + if (ret) { + dev_err(sadi->dev, "get the hw lock failed\n"); + return ret; + } + + ret = sprd_adi_write(sadi, virt_reg, val); + hwspin_unlock_irqrestore(sadi->hwlock, &flags); + if (ret) + return ret; + } else { + dev_err(sadi->dev, "no buffer for transfer\n"); + return -EINVAL; + } + + return 0; +} + +static void sprd_adi_hw_init(struct sprd_adi *sadi) +{ + struct device_node *np = sadi->dev->of_node; + int i, size, chn_cnt; + const __be32 *list; + u32 tmp; + + /* Address bits select default 12 bits */ + writel_relaxed(0, sadi->base + REG_ADI_CTRL0); + + /* Set all channels as default priority */ + writel_relaxed(0, sadi->base + REG_ADI_CHN_PRIL); + writel_relaxed(0, sadi->base + REG_ADI_CHN_PRIH); + + /* Set clock auto gate mode */ + tmp = readl_relaxed(sadi->base + REG_ADI_GSSI_CFG0); + tmp &= ~BIT_CLK_ALL_ON; + writel_relaxed(tmp, sadi->base + REG_ADI_GSSI_CFG0); + + /* Set hardware channels setting */ + list = of_get_property(np, "sprd,hw-channels", &size); + if (!size || !list) { + dev_info(sadi->dev, "no hw channels setting in node\n"); + return; + } + + chn_cnt = size / 8; + for (i = 0; i < chn_cnt; i++) { + u32 value; + u32 chn_id = be32_to_cpu(*list++); + u32 chn_config = be32_to_cpu(*list++); + + /* Channel 0 and 1 are software channels */ + if (chn_id < 2) + continue; + + writel_relaxed(chn_config, sadi->base + + REG_ADI_CHN_ADDR(chn_id)); + + if (chn_id < 31) { + value = readl_relaxed(sadi->base + REG_ADI_CHN_EN); + value |= BIT(chn_id); + writel_relaxed(value, sadi->base + REG_ADI_CHN_EN); + } else if (chn_id < ADI_HW_CHNS) { + value = readl_relaxed(sadi->base + REG_ADI_CHN_EN1); + value |= BIT(chn_id - 32); + writel_relaxed(value, sadi->base + REG_ADI_CHN_EN1); + } + } +} + +static int sprd_adi_probe(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct spi_controller *ctlr; + struct sprd_adi *sadi; + struct resource *res; + u32 num_chipselect; + int ret; + + if (!np) { + dev_err(&pdev->dev, "can not find the adi bus node\n"); + return -ENODEV; + } + + pdev->id = of_alias_get_id(np, "spi"); + num_chipselect = 
of_get_child_count(np); + + ctlr = spi_alloc_master(&pdev->dev, sizeof(struct sprd_adi)); + if (!ctlr) + return -ENOMEM; + + dev_set_drvdata(&pdev->dev, ctlr); + sadi = spi_controller_get_devdata(ctlr); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + sadi->base = devm_ioremap_resource(&pdev->dev, res); + if (!sadi->base) { + ret = -ENOMEM; + goto put_ctlr; + } + + sadi->slave_vbase = (unsigned long)sadi->base + ADI_SLAVE_OFFSET; + sadi->slave_pbase = res->start + ADI_SLAVE_OFFSET; + sadi->ctlr = ctlr; + sadi->dev = &pdev->dev; + ret = of_hwspin_lock_get_id(np, 0); + if (ret < 0) { + dev_err(&pdev->dev, "can not get the hardware spinlock\n"); + goto put_ctlr; + } + + sadi->hwlock = hwspin_lock_request_specific(ret); + if (!sadi->hwlock) { + ret = -ENXIO; + goto put_ctlr; + } + + sprd_adi_hw_init(sadi); + + ctlr->dev.of_node = pdev->dev.of_node; + ctlr->bus_num = pdev->id; + ctlr->num_chipselect = num_chipselect; + ctlr->flags = SPI_MASTER_HALF_DUPLEX; + ctlr->bits_per_word_mask = 0; + ctlr->transfer_one = sprd_adi_transfer_one; + + ret = devm_spi_register_controller(&pdev->dev, ctlr); + if (ret) { + dev_err(&pdev->dev, "failed to register SPI controller\n"); + goto free_hwlock; + } + + return 0; + +free_hwlock: + hwspin_lock_free(sadi->hwlock); +put_ctlr: + spi_controller_put(ctlr); + return ret; +} + +static int sprd_adi_remove(struct platform_device *pdev) +{ + struct spi_controller *ctlr = dev_get_drvdata(&pdev->dev); + struct sprd_adi *sadi = spi_controller_get_devdata(ctlr); + + hwspin_lock_free(sadi->hwlock); + return 0; +} + +static const struct of_device_id sprd_adi_of_match[] = { + { + .compatible = "sprd,sc9860-adi", + }, + { }, +}; +MODULE_DEVICE_TABLE(of, sprd_adi_of_match); + +static struct platform_driver sprd_adi_driver = { + .driver = { + .name = "sprd-adi", + .owner = THIS_MODULE, + .of_match_table = sprd_adi_of_match, + }, + .probe = sprd_adi_probe, + .remove = sprd_adi_remove, +}; +module_platform_driver(sprd_adi_driver); + +MODULE_DESCRIPTION("Spreadtrum ADI Controller Driver"); +MODULE_AUTHOR("Baolin Wang "); +MODULE_LICENSE("GPL v2"); From 32c30f73687a7ea5fe6add62b7bc3b520115d0d9 Mon Sep 17 00:00:00 2001 From: Franklin Cooper Date: Fri, 8 Sep 2017 15:46:36 -0500 Subject: [PATCH 0044/1085] spi: spi-davinci: Update binding for 66AK2Gx pwr dm property Add pm-domains property which is required for 66AK2Gx. Also document 66AK2G unique clocks property usage. Signed-off-by: Franklin S Cooper Jr Acked-by: Rob Herring Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/spi-davinci.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/devicetree/bindings/spi/spi-davinci.txt b/Documentation/devicetree/bindings/spi/spi-davinci.txt index f5916c92fe91..1925277bfc1e 100644 --- a/Documentation/devicetree/bindings/spi/spi-davinci.txt +++ b/Documentation/devicetree/bindings/spi/spi-davinci.txt @@ -24,6 +24,16 @@ Required properties: based on a specific SoC configuration. - interrupts: interrupt number mapped to CPU. - clocks: spi clk phandle + For 66AK2G this property should be set per binding, + Documentation/devicetree/bindings/clock/ti,sci-clk.txt + +SoC-specific Required Properties: + +The following are mandatory properties for Keystone 2 66AK2G SoCs only: + +- power-domains: Should contain a phandle to a PM domain provider node + and an args specifier containing the SPI device id + value. 
This property is as per the binding, Optional: - cs-gpios: gpio chip selects From 10c1705eced11d6ad710fddcdb57aaa9f85a6f98 Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Fri, 8 Sep 2017 09:07:15 +0100 Subject: [PATCH 0045/1085] spi: rspi: Add r8a7743/5 to the compatible list Signed-off-by: Fabrizio Castro Reviewed-by: Geert Uytterhoeven Acked-by: Rob Herring Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/spi-rspi.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/spi/spi-rspi.txt b/Documentation/devicetree/bindings/spi/spi-rspi.txt index 8f4169f63936..3b02b3a7cfb2 100644 --- a/Documentation/devicetree/bindings/spi/spi-rspi.txt +++ b/Documentation/devicetree/bindings/spi/spi-rspi.txt @@ -5,11 +5,14 @@ Required properties: "renesas,rspi-", "renesas,rspi" as fallback. For Renesas Serial Peripheral Interface on RZ/A1H: "renesas,rspi-", "renesas,rspi-rz" as fallback. - For Quad Serial Peripheral Interface on R-Car Gen2: + For Quad Serial Peripheral Interface on R-Car Gen2 and + RZ/G1 devices: "renesas,qspi-", "renesas,qspi" as fallback. Examples with soctypes are: - "renesas,rspi-sh7757" (SH) - "renesas,rspi-r7s72100" (RZ/A1H) + - "renesas,qspi-r8a7743" (RZ/G1M) + - "renesas,qspi-r8a7745" (RZ/G1E) - "renesas,qspi-r8a7790" (R-Car H2) - "renesas,qspi-r8a7791" (R-Car M2-W) - "renesas,qspi-r8a7792" (R-Car V2H) From 71abd29057cb17b6b9532421821dc443427399ed Mon Sep 17 00:00:00 2001 From: jiada wang Date: Tue, 5 Sep 2017 14:12:32 +0900 Subject: [PATCH 0046/1085] spi: imx: Add support for SPI Slave mode Previously the i.MX SPI controller only worked in Master mode. This patch adds support for the i.MX51, i.MX53 and i.MX6 ECSPI controllers to also work in Slave mode. Currently the SPI Slave mode support has the following limitations: 1. Stale data in the RXFIFO will be dropped when the Slave starts any new transfer. 2. One transfer can be finished only after all transfer->len data has been transferred to the master device. 3. The Slave device only accepts transfer->len data. Any data longer than this from the master device will be dropped; any data shorter than this from the master will cause the SPI to get stuck due to HW limitation 2 below. 4. Only PIO transfer is supported in Slave mode. 5. Dynamic burst size adjustment isn't supported in Slave mode. The following HW limitations apply: 1. ECSPI has a HW issue when working in Slave mode: after 64 words are written to the TXFIFO, even when the TXFIFO becomes empty, ECSPI_TXDATA keeps shifting out the last word of data, so we have to disable ECSPI in slave mode after the transfer completes. 2. Due to Freescale errata ERR003775 "eCSPI: Burst completion by Chip Select (SS) signal in Slave mode is not functional", the burst size must be set exactly to the size of the transfer. This limits SPI transactions to a maximum of 2^12 bits. This errata affects the i.MX53 and i.MX6 ECSPI controllers.
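HW limitation 2 is what produces the 512-byte cap seen in the driver below (MX53_MAX_TRANSFER_BYTES): with burst completion by SS broken, the burst-length control field must describe the whole transfer, and since that field counts bits (BURST_LENGTH + 1 bits are moved) in 12 bits, one slave transaction tops out at 2^12 bits. A rough standalone sketch of the sizing rule; the helper itself is invented here, only the names and the 512-byte limit come from the patch:

    #include <stdio.h>

    #define MX53_MAX_TRANSFER_BYTES 512 /* 2^12 bits / 8 */

    /* Returns the BURST_LENGTH value for a slave-mode transfer, or -1
     * if a single burst cannot cover it. The driver shifts this value
     * into the CTRL register at the burst-length offset. */
    static int slave_burst_length(unsigned int len_bytes)
    {
            if (len_bytes == 0 || len_bytes > MX53_MAX_TRANSFER_BYTES)
                    return -1;

            /* ERR003775 workaround: the burst must span the whole
             * transfer, and BURST_LENGTH + 1 bits are transferred. */
            return (int)(len_bytes * 8 - 1);
    }

    int main(void)
    {
            printf("  4 bytes -> BURST_LENGTH %d\n", slave_burst_length(4));
            printf("512 bytes -> BURST_LENGTH %d\n", slave_burst_length(512));
            printf("513 bytes -> %d (driver returns -EMSGSIZE)\n",
                   slave_burst_length(513));
            return 0;
    }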
Signed-off-by: Jiada Wang Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 227 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 196 insertions(+), 31 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index babb15f07995..fe35aaea323b 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -53,10 +53,13 @@ /* generic defines to abstract from the different register layouts */ #define MXC_INT_RR (1 << 0) /* Receive data ready interrupt */ #define MXC_INT_TE (1 << 1) /* Transmit FIFO empty interrupt */ +#define MXC_INT_RDR BIT(4) /* Receive date threshold interrupt */ /* The maximum bytes that a sdma BD can transfer.*/ #define MAX_SDMA_BD_BYTES (1 << 15) #define MX51_ECSPI_CTRL_MAX_BURST 512 +/* The maximum bytes that IMX53_ECSPI can transfer in slave mode.*/ +#define MX53_MAX_TRANSFER_BYTES 512 enum spi_imx_devtype { IMX1_CSPI, @@ -76,7 +79,9 @@ struct spi_imx_devtype_data { void (*trigger)(struct spi_imx_data *); int (*rx_available)(struct spi_imx_data *); void (*reset)(struct spi_imx_data *); + void (*disable)(struct spi_imx_data *); bool has_dmamode; + bool has_slavemode; unsigned int fifo_size; bool dynamic_burst; enum spi_imx_devtype devtype; @@ -108,6 +113,11 @@ struct spi_imx_data { unsigned int dynamic_burst, read_u32; unsigned int word_mask; + /* Slave mode */ + bool slave_mode; + bool slave_aborted; + unsigned int slave_burst; + /* DMA */ bool usedma; u32 wml; @@ -221,6 +231,9 @@ static bool spi_imx_can_dma(struct spi_master *master, struct spi_device *spi, if (!master->dma_rx) return false; + if (spi_imx->slave_mode) + return false; + bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word); if (bytes_per_word != 1 && bytes_per_word != 2 && bytes_per_word != 4) @@ -262,6 +275,7 @@ static bool spi_imx_can_dma(struct spi_master *master, struct spi_device *spi, #define MX51_ECSPI_INT 0x10 #define MX51_ECSPI_INT_TEEN (1 << 0) #define MX51_ECSPI_INT_RREN (1 << 3) +#define MX51_ECSPI_INT_RDREN (1 << 4) #define MX51_ECSPI_DMA 0x14 #define MX51_ECSPI_DMA_TX_WML(wml) ((wml) & 0x3f) @@ -378,6 +392,44 @@ static void spi_imx_buf_tx_swap(struct spi_imx_data *spi_imx) spi_imx_buf_tx_u16(spi_imx); } +static void mx53_ecspi_rx_slave(struct spi_imx_data *spi_imx) +{ + u32 val = be32_to_cpu(readl(spi_imx->base + MXC_CSPIRXDATA)); + + if (spi_imx->rx_buf) { + int n_bytes = spi_imx->slave_burst % sizeof(val); + + if (!n_bytes) + n_bytes = sizeof(val); + + memcpy(spi_imx->rx_buf, + ((u8 *)&val) + sizeof(val) - n_bytes, n_bytes); + + spi_imx->rx_buf += n_bytes; + spi_imx->slave_burst -= n_bytes; + } +} + +static void mx53_ecspi_tx_slave(struct spi_imx_data *spi_imx) +{ + u32 val = 0; + int n_bytes = spi_imx->count % sizeof(val); + + if (!n_bytes) + n_bytes = sizeof(val); + + if (spi_imx->tx_buf) { + memcpy(((u8 *)&val) + sizeof(val) - n_bytes, + spi_imx->tx_buf, n_bytes); + val = cpu_to_be32(val); + spi_imx->tx_buf += n_bytes; + } + + spi_imx->count -= n_bytes; + + writel(val, spi_imx->base + MXC_CSPITXDATA); +} + /* MX51 eCSPI */ static unsigned int mx51_ecspi_clkdiv(struct spi_imx_data *spi_imx, unsigned int fspi, unsigned int *fres) @@ -427,6 +479,9 @@ static void mx51_ecspi_intctrl(struct spi_imx_data *spi_imx, int enable) if (enable & MXC_INT_RR) val |= MX51_ECSPI_INT_RREN; + if (enable & MXC_INT_RDR) + val |= MX51_ECSPI_INT_RDREN; + writel(val, spi_imx->base + MX51_ECSPI_INT); } @@ -439,6 +494,15 @@ static void mx51_ecspi_trigger(struct spi_imx_data *spi_imx) writel(reg, spi_imx->base + MX51_ECSPI_CTRL); } +static void mx51_ecspi_disable(struct 
spi_imx_data *spi_imx) +{ + u32 ctrl; + + ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL); + ctrl &= ~MX51_ECSPI_CTRL_ENABLE; + writel(ctrl, spi_imx->base + MX51_ECSPI_CTRL); +} + static int mx51_ecspi_config(struct spi_device *spi) { struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); @@ -446,14 +510,11 @@ static int mx51_ecspi_config(struct spi_device *spi) u32 clk = spi_imx->speed_hz, delay, reg; u32 cfg = readl(spi_imx->base + MX51_ECSPI_CONFIG); - /* - * The hardware seems to have a race condition when changing modes. The - * current assumption is that the selection of the channel arrives - * earlier in the hardware than the mode bits when they are written at - * the same time. - * So set master mode for all channels as we do not support slave mode. - */ - ctrl |= MX51_ECSPI_CTRL_MODE_MASK; + /* set Master or Slave mode */ + if (spi_imx->slave_mode) + ctrl &= ~MX51_ECSPI_CTRL_MODE_MASK; + else + ctrl |= MX51_ECSPI_CTRL_MODE_MASK; /* * Enable SPI_RDY handling (falling edge/level triggered). @@ -468,9 +529,22 @@ static int mx51_ecspi_config(struct spi_device *spi) /* set chip select to use */ ctrl |= MX51_ECSPI_CTRL_CS(spi->chip_select); - ctrl |= (spi_imx->bits_per_word - 1) << MX51_ECSPI_CTRL_BL_OFFSET; + if (spi_imx->slave_mode && is_imx53_ecspi(spi_imx)) + ctrl |= (spi_imx->slave_burst * 8 - 1) + << MX51_ECSPI_CTRL_BL_OFFSET; + else + ctrl |= (spi_imx->bits_per_word - 1) + << MX51_ECSPI_CTRL_BL_OFFSET; - cfg |= MX51_ECSPI_CONFIG_SBBCTRL(spi->chip_select); + /* + * eCSPI burst completion by Chip Select signal in Slave mode + * is not functional on the imx53 SoC; configure the SPI burst to + * complete when BURST_LENGTH + 1 bits are received + */ + if (spi_imx->slave_mode && is_imx53_ecspi(spi_imx)) + cfg &= ~MX51_ECSPI_CONFIG_SBBCTRL(spi->chip_select); + else + cfg |= MX51_ECSPI_CONFIG_SBBCTRL(spi->chip_select); if (spi->mode & SPI_CPHA) cfg |= MX51_ECSPI_CONFIG_SCLKPHA(spi->chip_select); @@ -805,6 +879,7 @@ static struct spi_imx_devtype_data imx1_cspi_devtype_data = { .fifo_size = 8, .has_dmamode = false, .dynamic_burst = false, + .has_slavemode = false, .devtype = IMX1_CSPI, }; @@ -817,6 +892,7 @@ static struct spi_imx_devtype_data imx21_cspi_devtype_data = { .fifo_size = 8, .has_dmamode = false, .dynamic_burst = false, + .has_slavemode = false, .devtype = IMX21_CSPI, }; @@ -830,6 +906,7 @@ static struct spi_imx_devtype_data imx27_cspi_devtype_data = { .fifo_size = 8, .has_dmamode = false, .dynamic_burst = false, + .has_slavemode = false, .devtype = IMX27_CSPI, }; @@ -842,6 +919,7 @@ static struct spi_imx_devtype_data imx31_cspi_devtype_data = { .fifo_size = 8, .has_dmamode = false, .dynamic_burst = false, + .has_slavemode = false, .devtype = IMX31_CSPI, }; @@ -855,6 +933,7 @@ static struct spi_imx_devtype_data imx35_cspi_devtype_data = { .fifo_size = 8, .has_dmamode = true, .dynamic_burst = false, + .has_slavemode = false, .devtype = IMX35_CSPI, }; @@ -867,6 +946,8 @@ static struct spi_imx_devtype_data imx51_ecspi_devtype_data = { .fifo_size = 64, .has_dmamode = true, .dynamic_burst = true, + .has_slavemode = true, + .disable = mx51_ecspi_disable, .devtype = IMX51_ECSPI, }; @@ -878,6 +959,8 @@ static struct spi_imx_devtype_data imx53_ecspi_devtype_data = { .reset = mx51_ecspi_reset, .fifo_size = 64, .has_dmamode = true, + .has_slavemode = true, + .disable = mx51_ecspi_disable, .devtype = IMX53_ECSPI, }; @@ -945,14 +1028,16 @@ static void spi_imx_push(struct spi_imx_data *spi_imx) spi_imx->txfifo++; } - spi_imx->devtype_data->trigger(spi_imx); + if (!spi_imx->slave_mode) + 
spi_imx->devtype_data->trigger(spi_imx); } static irqreturn_t spi_imx_isr(int irq, void *dev_id) { struct spi_imx_data *spi_imx = dev_id; - while (spi_imx->devtype_data->rx_available(spi_imx)) { + while (spi_imx->txfifo && + spi_imx->devtype_data->rx_available(spi_imx)) { spi_imx->rx(spi_imx); spi_imx->txfifo--; } @@ -1034,7 +1119,7 @@ static int spi_imx_setupxfer(struct spi_device *spi, spi_imx->speed_hz = t->speed_hz; /* Initialize the functions for transfer */ - if (spi_imx->devtype_data->dynamic_burst) { + if (spi_imx->devtype_data->dynamic_burst && !spi_imx->slave_mode) { u32 mask; spi_imx->dynamic_burst = 0; @@ -1078,6 +1163,12 @@ static int spi_imx_setupxfer(struct spi_device *spi, return ret; } + if (is_imx53_ecspi(spi_imx) && spi_imx->slave_mode) { + spi_imx->rx = mx53_ecspi_rx_slave; + spi_imx->tx = mx53_ecspi_tx_slave; + spi_imx->slave_burst = t->len; + } + spi_imx->devtype_data->config(spi); return 0; @@ -1262,11 +1353,61 @@ static int spi_imx_pio_transfer(struct spi_device *spi, return transfer->len; } +static int spi_imx_pio_transfer_slave(struct spi_device *spi, + struct spi_transfer *transfer) +{ + struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); + int ret = transfer->len; + + if (is_imx53_ecspi(spi_imx) && + transfer->len > MX53_MAX_TRANSFER_BYTES) { + dev_err(&spi->dev, "Transaction too big, max size is %d bytes\n", + MX53_MAX_TRANSFER_BYTES); + return -EMSGSIZE; + } + + spi_imx->tx_buf = transfer->tx_buf; + spi_imx->rx_buf = transfer->rx_buf; + spi_imx->count = transfer->len; + spi_imx->txfifo = 0; + + reinit_completion(&spi_imx->xfer_done); + spi_imx->slave_aborted = false; + + spi_imx_push(spi_imx); + + spi_imx->devtype_data->intctrl(spi_imx, MXC_INT_TE | MXC_INT_RDR); + + if (wait_for_completion_interruptible(&spi_imx->xfer_done) || + spi_imx->slave_aborted) { + dev_dbg(&spi->dev, "interrupted\n"); + ret = -EINTR; + } + + /* ECSPI has a HW issue when working in Slave mode: + * after 64 words are written to the TXFIFO, even when the TXFIFO + * becomes empty, ECSPI_TXDATA keeps shifting out the last word, + * so we have to disable ECSPI when in slave mode after the + * transfer completes + */ + if (spi_imx->devtype_data->disable) + spi_imx->devtype_data->disable(spi_imx); + + return ret; +} + static int spi_imx_transfer(struct spi_device *spi, struct spi_transfer *transfer) { struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); + /* flush rxfifo before transfer */ + while (spi_imx->devtype_data->rx_available(spi_imx)) + spi_imx->rx(spi_imx); + + if (spi_imx->slave_mode) + return spi_imx_pio_transfer_slave(spi, transfer); + if (spi_imx->usedma) return spi_imx_dma_transfer(spi_imx, transfer); else @@ -1323,6 +1464,16 @@ spi_imx_unprepare_message(struct spi_master *master, struct spi_message *msg) return 0; } +static int spi_imx_slave_abort(struct spi_master *master) +{ + struct spi_imx_data *spi_imx = spi_master_get_devdata(master); + + spi_imx->slave_aborted = true; + complete(&spi_imx->xfer_done); + + return 0; +} + static int spi_imx_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; @@ -1334,13 +1485,23 @@ static int spi_imx_probe(struct platform_device *pdev) struct spi_imx_data *spi_imx; struct resource *res; int i, ret, irq, spi_drctl; + const struct spi_imx_devtype_data *devtype_data = of_id ? 
of_id->data : + (struct spi_imx_devtype_data *)pdev->id_entry->driver_data; + bool slave_mode; if (!np && !mxc_platform_info) { dev_err(&pdev->dev, "can't get the platform data\n"); return -EINVAL; } - master = spi_alloc_master(&pdev->dev, sizeof(struct spi_imx_data)); + slave_mode = devtype_data->has_slavemode && + of_property_read_bool(np, "spi-slave"); + if (slave_mode) + master = spi_alloc_slave(&pdev->dev, + sizeof(struct spi_imx_data)); + else + master = spi_alloc_master(&pdev->dev, + sizeof(struct spi_imx_data)); if (!master) return -ENOMEM; @@ -1358,9 +1519,9 @@ static int spi_imx_probe(struct platform_device *pdev) spi_imx = spi_master_get_devdata(master); spi_imx->bitbang.master = master; spi_imx->dev = &pdev->dev; + spi_imx->slave_mode = slave_mode; - spi_imx->devtype_data = of_id ? of_id->data : - (struct spi_imx_devtype_data *)pdev->id_entry->driver_data; + spi_imx->devtype_data = devtype_data; if (mxc_platform_info) { master->num_chipselect = mxc_platform_info->num_chipselect; @@ -1380,6 +1541,7 @@ static int spi_imx_probe(struct platform_device *pdev) spi_imx->bitbang.master->cleanup = spi_imx_cleanup; spi_imx->bitbang.master->prepare_message = spi_imx_prepare_message; spi_imx->bitbang.master->unprepare_message = spi_imx_unprepare_message; + spi_imx->bitbang.master->slave_abort = spi_imx_slave_abort; spi_imx->bitbang.master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH \ | SPI_NO_CS; if (is_imx35_cspi(spi_imx) || is_imx51_ecspi(spi_imx) || @@ -1457,23 +1619,26 @@ static int spi_imx_probe(struct platform_device *pdev) goto out_clk_put; } - if (!master->cs_gpios) { - dev_err(&pdev->dev, "No CS GPIOs available\n"); - ret = -EINVAL; - goto out_clk_put; - } - - for (i = 0; i < master->num_chipselect; i++) { - if (!gpio_is_valid(master->cs_gpios[i])) - continue; - - ret = devm_gpio_request(&pdev->dev, master->cs_gpios[i], - DRIVER_NAME); - if (ret) { - dev_err(&pdev->dev, "Can't get CS GPIO %i\n", - master->cs_gpios[i]); + if (!spi_imx->slave_mode) { + if (!master->cs_gpios) { + dev_err(&pdev->dev, "No CS GPIOs available\n"); + ret = -EINVAL; goto out_clk_put; } + + for (i = 0; i < master->num_chipselect; i++) { + if (!gpio_is_valid(master->cs_gpios[i])) + continue; + + ret = devm_gpio_request(&pdev->dev, + master->cs_gpios[i], + DRIVER_NAME); + if (ret) { + dev_err(&pdev->dev, "Can't get CS GPIO %i\n", + master->cs_gpios[i]); + goto out_clk_put; + } + } } dev_info(&pdev->dev, "probed\n"); From da394712324e9a0c91ae05cc24396af6d99f3ad6 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 Sep 2017 01:17:17 -0300 Subject: [PATCH 0047/1085] spi: Kconfig: Remove old comments now that SPI slave is supported Since commit 6c364062bfed ("spi: core: Add support for registering SPI slave controllers") SPI slave is also supported, so remove the old comments that say SPI slave is unsupported. Signed-off-by: Fabio Estevam Acked-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index a75f2a2cf780..948a74bb2795 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -1,10 +1,6 @@ # # SPI driver configuration # -# NOTE: the reason this doesn't show SPI slave support is mostly that -# nobody's needed a slave side API yet. The master-role API is not -# fully appropriate there, so it'd need some thought to do well. 
-# menuconfig SPI bool "SPI support" depends on HAS_IOMEM From 75f029c3a83f3e7a1d0d928efa4fe47dd6a8a9eb Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 21 Sep 2017 12:16:56 +0200 Subject: [PATCH 0048/1085] EDAC: Handle return value of kasprintf() kasprintf() can fail and we must check its return value. Signed-off-by: Arvind Yadav Cc: linux-edac@vger.kernel.org [ Merged into a single patch, small formatting fixups. ] Signed-off-by: Borislav Petkov --- drivers/edac/i7core_edac.c | 11 +++++++++-- drivers/edac/sb_edac.c | 5 +++++ drivers/edac/skx_edac.c | 9 +++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index c16c3b931b3d..8c5540160a23 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -2159,8 +2159,13 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev) mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "i7core_edac.c"; - mci->ctl_name = kasprintf(GFP_KERNEL, "i7 core #%d", - i7core_dev->socket); + + mci->ctl_name = kasprintf(GFP_KERNEL, "i7 core #%d", i7core_dev->socket); + if (!mci->ctl_name) { + rc = -ENOMEM; + goto fail1; + } + mci->dev_name = pci_name(i7core_dev->pdev[0]); mci->ctl_page_to_phys = NULL; @@ -2214,6 +2219,8 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev) fail0: kfree(mci->ctl_name); + +fail1: edac_mc_free(mci); i7core_dev->mci = NULL; return rc; diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index dc0591654011..2078ee414568 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3287,6 +3287,11 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) break; } + if (!mci->ctl_name) { + rc = -ENOMEM; + goto fail0; + } + /* Get dimm basic config and the memory layout */ rc = get_dimm_config(mci); if (rc < 0) { diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index 16dea97568a1..3fe85b0ac274 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -464,8 +464,12 @@ static int skx_register_mci(struct skx_imc *imc) pvt = mci->pvt_info; pvt->imc = imc; - mci->ctl_name = kasprintf(GFP_KERNEL, "Skylake Socket#%d IMC#%d", - imc->node_id, imc->lmc); + mci->ctl_name = kasprintf(GFP_KERNEL, "Skylake Socket#%d IMC#%d", imc->node_id, imc->lmc); + if (!mci->ctl_name) { + rc = -ENOMEM; + goto fail0; + } + mci->mtype_cap = MEM_FLAG_DDR4; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; @@ -491,6 +495,7 @@ static int skx_register_mci(struct skx_imc *imc) fail: kfree(mci->ctl_name); +fail0: edac_mc_free(mci); imc->mci = NULL; return rc; From 411bc316f3365363959c2c895af2618389534583 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:57:35 -0700 Subject: [PATCH 0049/1085] perf stat: Fix adding multiple event groups The -M metric group parser threw away the events of earlier groups when multiple groups were specified. Fix this here by not overwriting the string incorrectly. Now this works correctly: % perf stat -M Summary,SMT --metric-only -a sleep 1 Performance counter stats for 'system wide': Instructions CPI CLKS CPU_Utilization GFLOPs SMT_2T_Utilization SMT_2T_Utilization Kernel_Utilization CoreIPC CORE_CLKS 900907376.0 2.7 2398954144.0 0.1 0.0 0.2 0.2 0.1 0.4 2080822855.5 while previously it would only show the SMT metrics. 
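As an aside, here is a minimal standalone sketch of the bug class fixed here (illustrative names only, not the actual perf strbuf code): initializing the accumulating buffer inside the per-group helper truncates whatever earlier groups appended, while initializing it once in the caller keeps all of them.

#include <stdio.h>
#include <string.h>

struct evbuf { char s[256]; };

static void evbuf_init(struct evbuf *b)
{
	b->s[0] = '\0';
}

static void evbuf_append(struct evbuf *b, const char *group)
{
	if (b->s[0])
		strncat(b->s, ",", sizeof(b->s) - strlen(b->s) - 1);
	strncat(b->s, group, sizeof(b->s) - strlen(b->s) - 1);
}

/* Buggy: re-initializes per group, wiping earlier groups' events. */
static void add_group_buggy(struct evbuf *events, const char *group)
{
	evbuf_init(events);
	evbuf_append(events, group);
}

/* Fixed: the caller initializes once, the helper only appends. */
static void add_group_fixed(struct evbuf *events, const char *group)
{
	evbuf_append(events, group);
}

int main(void)
{
	struct evbuf events;

	evbuf_init(&events);
	add_group_buggy(&events, "Summary");
	add_group_buggy(&events, "SMT");
	printf("buggy: %s\n", events.s);	/* prints "SMT" only */

	evbuf_init(&events);
	add_group_fixed(&events, "Summary");
	add_group_fixed(&events, "SMT");
	printf("fixed: %s\n", events.s);	/* prints "Summary,SMT" */

	return 0;
}

This mirrors the patch below, which moves the strbuf_init()/strbuf_addf() pair out of metricgroup__add_metric() and into metricgroup__add_metric_list(), which runs once per -M option.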
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20170914205735.18431-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index fa37ef79517a..0ddd9c199227 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -373,9 +373,6 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events, int ret = -EINVAL; int i, j; - strbuf_init(events, 100); - strbuf_addf(events, "%s", ""); - if (!map) return 0; @@ -433,6 +430,10 @@ static int metricgroup__add_metric_list(const char *list, struct strbuf *events, if (!nlist) return -ENOMEM; llist = nlist; + + strbuf_init(events, 100); + strbuf_addf(events, "%s", ""); + while ((p = strsep(&llist, ",")) != NULL) { ret = metricgroup__add_metric(p, events, group_list); if (ret == -EINVAL) { From 5c9295bfe6f5f59f3f2eee78f58b0523d117897e Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Tue, 19 Sep 2017 12:57:37 +0800 Subject: [PATCH 0050/1085] perf tests: Remove Intel CQM perf test The Intel CQM perf test is obsolete, as its perf PMU code has been removed in commit c39a0e2c8850 ("x86/perf/cqm: Wipe out perf based cqm"). Signed-off-by: Xiaochen Shen Cc: Alexander Shishkin Cc: Fenghua Yu Cc: Matt Fleming Cc: Pei P Jia Cc: Peter Zijlstra Cc: Tony Luck Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1505797057-16300-1-git-send-email-xiaochen.shen@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/include/arch-tests.h | 1 - tools/perf/arch/x86/tests/Build | 1 - tools/perf/arch/x86/tests/arch-tests.c | 4 - tools/perf/arch/x86/tests/intel-cqm.c | 127 ----------------------- 4 files changed, 133 deletions(-) delete mode 100644 tools/perf/arch/x86/tests/intel-cqm.c diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h index 4e0b806a7a0f..01ad4208bcdf 100644 --- a/tools/perf/arch/x86/include/arch-tests.h +++ b/tools/perf/arch/x86/include/arch-tests.h @@ -8,7 +8,6 @@ struct test; int test__rdpmc(struct test *test __maybe_unused, int subtest); int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest); int test__insn_x86(struct test *test __maybe_unused, int subtest); -int test__intel_cqm_count_nmi_context(struct test *test __maybe_unused, int subtest); #ifdef HAVE_DWARF_UNWIND_SUPPORT struct thread; diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build index cbb7e978166b..8e2c5a38c3b9 100644 --- a/tools/perf/arch/x86/tests/Build +++ b/tools/perf/arch/x86/tests/Build @@ -5,4 +5,3 @@ libperf-y += arch-tests.o libperf-y += rdpmc.o libperf-y += perf-time-to-tsc.o libperf-$(CONFIG_AUXTRACE) += insn-x86.o -libperf-y += intel-cqm.o diff --git a/tools/perf/arch/x86/tests/arch-tests.c b/tools/perf/arch/x86/tests/arch-tests.c index 99d66191e56c..271c0a0f95d7 100644 --- a/tools/perf/arch/x86/tests/arch-tests.c +++ b/tools/perf/arch/x86/tests/arch-tests.c @@ -23,10 +23,6 @@ struct test arch_tests[] = { .func = test__insn_x86, }, #endif - { - .desc = "Intel cqm nmi context read", - .func = test__intel_cqm_count_nmi_context, - }, { .func = NULL, }, diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c deleted file mode 100644 index 57f86b6e7d6f..000000000000 --- a/tools/perf/arch/x86/tests/intel-cqm.c +++ /dev/null @@ -1,127 +0,0 @@ -#include "tests/tests.h" -#include "perf.h" -#include "cloexec.h" 
-#include "debug.h" -#include "evlist.h" -#include "evsel.h" -#include "arch-tests.h" - -#include -#include -#include -#include -#include - -static pid_t spawn(void) -{ - pid_t pid; - - pid = fork(); - if (pid) - return pid; - - while(1) - sleep(5); - return 0; -} - -/* - * Create an event group that contains both a sampled hardware - * (cpu-cycles) and software (intel_cqm/llc_occupancy/) event. We then - * wait for the hardware perf counter to overflow and generate a PMI, - * which triggers an event read for both of the events in the group. - * - * Since reading Intel CQM event counters requires sending SMP IPIs, the - * CQM pmu needs to handle the above situation gracefully, and return - * the last read counter value to avoid triggering a WARN_ON_ONCE() in - * smp_call_function_many() caused by sending IPIs from NMI context. - */ -int test__intel_cqm_count_nmi_context(struct test *test __maybe_unused, int subtest __maybe_unused) -{ - struct perf_evlist *evlist = NULL; - struct perf_evsel *evsel = NULL; - struct perf_event_attr pe; - int i, fd[2], flag, ret; - size_t mmap_len; - void *event; - pid_t pid; - int err = TEST_FAIL; - - flag = perf_event_open_cloexec_flag(); - - evlist = perf_evlist__new(); - if (!evlist) { - pr_debug("perf_evlist__new failed\n"); - return TEST_FAIL; - } - - ret = parse_events(evlist, "intel_cqm/llc_occupancy/", NULL); - if (ret) { - pr_debug("parse_events failed, is \"intel_cqm/llc_occupancy/\" available?\n"); - err = TEST_SKIP; - goto out; - } - - evsel = perf_evlist__first(evlist); - if (!evsel) { - pr_debug("perf_evlist__first failed\n"); - goto out; - } - - memset(&pe, 0, sizeof(pe)); - pe.size = sizeof(pe); - - pe.type = PERF_TYPE_HARDWARE; - pe.config = PERF_COUNT_HW_CPU_CYCLES; - pe.read_format = PERF_FORMAT_GROUP; - - pe.sample_period = 128; - pe.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_READ; - - pid = spawn(); - - fd[0] = sys_perf_event_open(&pe, pid, -1, -1, flag); - if (fd[0] < 0) { - pr_debug("failed to open event\n"); - goto out; - } - - memset(&pe, 0, sizeof(pe)); - pe.size = sizeof(pe); - - pe.type = evsel->attr.type; - pe.config = evsel->attr.config; - - fd[1] = sys_perf_event_open(&pe, pid, -1, fd[0], flag); - if (fd[1] < 0) { - pr_debug("failed to open event\n"); - goto out; - } - - /* - * Pick a power-of-two number of pages + 1 for the meta-data - * page (struct perf_event_mmap_page). See tools/perf/design.txt. - */ - mmap_len = page_size * 65; - - event = mmap(NULL, mmap_len, PROT_READ, MAP_SHARED, fd[0], 0); - if (event == (void *)(-1)) { - pr_debug("failed to mmap %d\n", errno); - goto out; - } - - sleep(1); - - err = TEST_OK; - - munmap(event, mmap_len); - - for (i = 0; i < 2; i++) - close(fd[i]); - - kill(pid, SIGKILL); - wait(NULL); -out: - perf_evlist__delete(evlist); - return err; -} From 5a54c2f5e1717264f6e24687f703937eaca5f55d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Sep 2017 12:30:36 -0300 Subject: [PATCH 0051/1085] perf trace beauty madvise: Generate 'behavior' string table from kernel headers This is one more case where the way that syscall parameter values are defined in kernel headers are easy to parse using a shell script that will then generate the string table that gets used by the madvise 'behaviour' argument beautifier. This way as soon as the header syncronization mechanism in perf's build system detects a change in a copy of a kernel ABI header and that file is syncronized, we get 'perf trace' updated automagically. 
So, when we synchronize this: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/mman-common.h' differs from latest version at 'include/uapi/asm-generic/mman-common.h' We'll get these: #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Rik van Riel Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-dolb0ghds4ui7wc1npgkchvc@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 9 +++++ tools/perf/trace/beauty/madvise_behavior.sh | 10 ++++++ tools/perf/trace/beauty/mmap.c | 38 +++++++-------------- 3 files changed, 31 insertions(+), 26 deletions(-) create mode 100755 tools/perf/trace/beauty/madvise_behavior.sh diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 1df93b4c4648..5f7408118a2d 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -441,6 +441,13 @@ perf_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/perf_ioctl.sh $(perf_ioctl_array): $(perf_hdr_dir)/perf_event.h $(perf_ioctl_tbl) $(Q)$(SHELL) '$(perf_ioctl_tbl)' $(perf_hdr_dir) > $@ +madvise_behavior_array := $(beauty_outdir)/madvise_behavior_array.c +madvise_hdr_dir := $(srctree)/tools/include/uapi/asm-generic/ +madvise_behavior_tbl := $(srctree)/tools/perf/trace/beauty/madvise_behavior.sh + +$(madvise_behavior_array): $(madvise_hdr_dir)/mman-common.h $(madvise_behavior_tbl) + $(Q)$(SHELL) '$(madvise_behavior_tbl)' $(madvise_hdr_dir) > $@ + all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) $(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST) @@ -541,6 +548,7 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioc $(sndrv_ctl_ioctl_array) \ $(kvm_ioctl_array) \ $(vhost_virtio_ioctl_array) \ + $(madvise_behavior_array) \ $(perf_ioctl_array) $(OUTPUT)%.o: %.c prepare FORCE @@ -814,6 +822,7 @@ clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea $(OUTPUT)util/intel-pt-decoder/inat-tables.c \ $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \ $(OUTPUT)pmu-events/pmu-events.c \ + $(OUTPUT)$(madvise_behavior_array) \ $(OUTPUT)$(drm_ioctl_array) \ $(OUTPUT)$(pkey_alloc_access_rights_array) \ $(OUTPUT)$(sndrv_ctl_ioctl_array) \ diff --git a/tools/perf/trace/beauty/madvise_behavior.sh b/tools/perf/trace/beauty/madvise_behavior.sh new file mode 100755 index 000000000000..60ef8640ee70 --- /dev/null +++ b/tools/perf/trace/beauty/madvise_behavior.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +header_dir=$1 + +printf "static const char *madvise_advices[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MADV_([[:alnum:]_]+)[[:space:]]+([[:digit:]]+)[[:space:]]*.*' +egrep $regex ${header_dir}/mman-common.h | \ + sed -r "s/$regex/\2 \1/g" | \ + sort -n | xargs printf "\t[%s] = \"%s\",\n" +printf "};\n" diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c index 754558f9009d..45b1e0c0018f 100644 --- a/tools/perf/trace/beauty/mmap.c +++ b/tools/perf/trace/beauty/mmap.c @@ -94,35 +94,21 @@ static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size, #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags +static size_t madvise__scnprintf_behavior(int behavior, char *bf, size_t size) +{ +#include "trace/beauty/generated/madvise_behavior_array.c" + static DEFINE_STRARRAY(madvise_advices); + + if (behavior < strarray__madvise_advices.nr_entries && 
strarray__madvise_advices.entries[behavior] != NULL) + return scnprintf(bf, size, "MADV_%s", strarray__madvise_advices.entries[behavior]); + + return scnprintf(bf, size, "%#x", behavior); +} + static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size, struct syscall_arg *arg) { - int behavior = arg->val; - - switch (behavior) { -#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n) - P_MADV_BHV(NORMAL); - P_MADV_BHV(RANDOM); - P_MADV_BHV(SEQUENTIAL); - P_MADV_BHV(WILLNEED); - P_MADV_BHV(DONTNEED); - P_MADV_BHV(FREE); - P_MADV_BHV(REMOVE); - P_MADV_BHV(DONTFORK); - P_MADV_BHV(DOFORK); - P_MADV_BHV(HWPOISON); - P_MADV_BHV(SOFT_OFFLINE); - P_MADV_BHV(MERGEABLE); - P_MADV_BHV(UNMERGEABLE); - P_MADV_BHV(HUGEPAGE); - P_MADV_BHV(NOHUGEPAGE); - P_MADV_BHV(DONTDUMP); - P_MADV_BHV(DODUMP); -#undef P_MADV_BHV - default: break; - } - - return scnprintf(bf, size, "%#x", behavior); + return madvise__scnprintf_behavior(arg->val, bf, size); } #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior From 492e05b0654126bd6a04473028ac4c8cfc22ccec Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Sep 2017 12:41:57 -0300 Subject: [PATCH 0052/1085] tools: Update asm-generic/mman-common.h copy from the kernel To get the defines introduced in the commit aafd4562dfee ("mm: arch: consolidate mmap hugetlb size encodings"), which doesn't bring anything interesting for tools/, but also the ones from d2cd9ede6e19 ("mm,fork: introduce MADV_WIPEONFORK"), which does, and ends up triggering an auto-update to the tools/perf/trace/beauty/generated/madvise_behavior_array.c file, supporting the newly introduced 'behavior' values. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/mman-common.h' differs from latest version at 'include/uapi/asm-generic/mman-common.h' Testing it: # cat madvise.c #include #include #ifndef MADV_WIPEONFORK #define MADV_WIPEONFORK 18 #endif #ifndef MADV_KEEPONFORK #define MADV_KEEPONFORK 19 #endif int main(void) { void *ptr = mmap(NULL, 4096, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); madvise(ptr, 4096, MADV_WIPEONFORK); madvise(ptr, 4096, MADV_KEEPONFORK); return 0; } [root@jouet c]# perf trace -e mmap,madvise ./madvise 0.000 ( 0.013 ms): madvise/11732 mmap(len: 8192, prot: READ|WRITE, flags: PRIVATE|ANONYMOUS ) = 0x7fba6e015000 0.047 ( 0.004 ms): madvise/11732 mmap(len: 160164, prot: READ, flags: PRIVATE, fd: 3 ) = 0x7fba6dfed000 0.084 ( 0.009 ms): madvise/11732 mmap(len: 4000096, prot: EXEC|READ, flags: PRIVATE|DENYWRITE, fd: 3 ) = 0x7fba6da20000 0.109 ( 0.006 ms): madvise/11732 mmap(addr: 0x7fba6dde7000, len: 24576, prot: READ|WRITE, flags: PRIVATE|DENYWRITE|FIXED, fd: 3, off: 1863680) = 0x7fba6dde7000 0.125 ( 0.004 ms): madvise/11732 mmap(addr: 0x7fba6dded000, len: 14688, prot: READ|WRITE, flags: PRIVATE|ANONYMOUS|FIXED) = 0x7fba6dded000 0.150 ( 0.006 ms): madvise/11732 mmap(len: 12288, prot: READ|WRITE, flags: PRIVATE|ANONYMOUS ) = 0x7fba6dfea000 0.288 ( 0.003 ms): madvise/11732 mmap(len: 4096, flags: PRIVATE|ANONYMOUS ) = 0x7fba6e014000 0.292 ( 0.002 ms): madvise/11732 madvise(start: 0x7fba6e014000, len_in: 4096, behavior: MADV_WIPEONFORK) = 0 0.295 ( 0.001 ms): madvise/11732 madvise(start: 0x7fba6e014000, len_in: 4096, behavior: MADV_KEEPONFORK) = 0 # uname -a Linux jouet 4.13.0+ #2 SMP Mon Sep 18 17:22:46 -03 2017 x86_64 x86_64 x86_64 GNU/Linux # Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Mike Kravetz Cc: Namhyung Kim Cc: Rik van Riel Cc: Wang Nan Link: 
http://lkml.kernel.org/n/tip-yev9rexu02cl7cjeozzmrl9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/mman-common.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 8c27db0c5c08..203268f9231e 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -58,20 +58,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ From 0e1eed80885fdb09993bffb16f5662f4b53c1a08 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Sep 2017 16:41:34 -0300 Subject: [PATCH 0053/1085] perf tools: Get all of tools/{arch,include}/ in the MANIFEST Now that I'm switching the container builds from using a local volume pointing to the kernel repository with the perf sources to instead using a detached tarball, so as to be able to use a container cluster, some places broke because I forgot to put some of the required files in tools/perf/MANIFEST, namely some bitsperlong.h files. So, to fix it, do the same as for tools/build/ and pack the whole tools/arch/ directory. 
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wmenpjfjsobwdnfde30qqncj@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/MANIFEST | 87 ++------------------------------------------- 1 file changed, 2 insertions(+), 85 deletions(-) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 62072822dc85..627b7cada144 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,34 +1,8 @@ tools/perf -tools/arch/alpha/include/asm/barrier.h -tools/arch/arm/include/asm/barrier.h -tools/arch/arm64/include/asm/barrier.h -tools/arch/ia64/include/asm/barrier.h -tools/arch/mips/include/asm/barrier.h -tools/arch/powerpc/include/asm/barrier.h -tools/arch/s390/include/asm/barrier.h -tools/arch/sh/include/asm/barrier.h -tools/arch/sparc/include/asm/barrier.h -tools/arch/sparc/include/asm/barrier_32.h -tools/arch/sparc/include/asm/barrier_64.h -tools/arch/tile/include/asm/barrier.h -tools/arch/x86/include/asm/barrier.h -tools/arch/x86/include/asm/cmpxchg.h -tools/arch/x86/include/asm/cpufeatures.h -tools/arch/x86/include/asm/disabled-features.h -tools/arch/x86/include/asm/required-features.h -tools/arch/x86/include/uapi/asm/svm.h -tools/arch/x86/include/uapi/asm/vmx.h -tools/arch/x86/include/uapi/asm/kvm.h -tools/arch/x86/include/uapi/asm/kvm_perf.h -tools/arch/x86/lib/memcpy_64.S -tools/arch/x86/lib/memset_64.S -tools/arch/s390/include/uapi/asm/kvm_perf.h -tools/arch/s390/include/uapi/asm/sie.h -tools/arch/xtensa/include/asm/barrier.h +tools/arch tools/scripts tools/build -tools/arch/x86/include/asm/atomic.h -tools/arch/x86/include/asm/rmwcc.h +tools/include tools/lib/traceevent tools/lib/api tools/lib/bpf @@ -42,60 +16,3 @@ tools/lib/find_bit.c tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c -tools/include/asm/alternative-asm.h -tools/include/asm/atomic.h -tools/include/asm/barrier.h -tools/include/asm/bug.h -tools/include/asm-generic/atomic-gcc.h -tools/include/asm-generic/barrier.h -tools/include/asm-generic/bitops/arch_hweight.h -tools/include/asm-generic/bitops/atomic.h -tools/include/asm-generic/bitops/const_hweight.h -tools/include/asm-generic/bitops/__ffs.h -tools/include/asm-generic/bitops/__ffz.h -tools/include/asm-generic/bitops/__fls.h -tools/include/asm-generic/bitops/find.h -tools/include/asm-generic/bitops/fls64.h -tools/include/asm-generic/bitops/fls.h -tools/include/asm-generic/bitops/hweight.h -tools/include/asm-generic/bitops.h -tools/include/linux/atomic.h -tools/include/linux/bitops.h -tools/include/linux/compiler.h -tools/include/linux/compiler-gcc.h -tools/include/linux/coresight-pmu.h -tools/include/linux/bug.h -tools/include/linux/filter.h -tools/include/linux/hash.h -tools/include/linux/kernel.h -tools/include/linux/list.h -tools/include/linux/log2.h -tools/include/uapi/asm-generic/fcntl.h -tools/include/uapi/asm-generic/ioctls.h -tools/include/uapi/asm-generic/mman-common.h -tools/include/uapi/asm-generic/mman.h -tools/include/uapi/drm/drm.h -tools/include/uapi/drm/i915_drm.h -tools/include/uapi/linux/bpf.h -tools/include/uapi/linux/bpf_common.h -tools/include/uapi/linux/fcntl.h -tools/include/uapi/linux/hw_breakpoint.h -tools/include/uapi/linux/kvm.h -tools/include/uapi/linux/mman.h -tools/include/uapi/linux/perf_event.h -tools/include/uapi/linux/sched.h -tools/include/uapi/linux/stat.h -tools/include/uapi/linux/vhost.h -tools/include/uapi/sound/asound.h -tools/include/linux/poison.h -tools/include/linux/rbtree.h -tools/include/linux/rbtree_augmented.h 
-tools/include/linux/refcount.h -tools/include/linux/string.h -tools/include/linux/stringify.h -tools/include/linux/types.h -tools/include/linux/err.h -tools/include/linux/bitmap.h -tools/include/linux/time64.h -tools/arch/*/include/uapi/asm/mman.h -tools/arch/*/include/uapi/asm/perf_regs.h From 6ae8eefc6c8fe050f057781b70a83262eb0a61ee Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 21 Sep 2017 12:12:17 -0300 Subject: [PATCH 0054/1085] tools include: Do not use poison with C++ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LIST_POISON[12] are used to initialize list_head and hlist_node pointers, and do void pointer arithmetic, which C++ doesn't like, so, to avoid drifting from the kernel by introducing some HLIST_POISON to do away with void pointer math, just make those poisoned pointers be NULL when building it with a C++ compiler. Noticed with: $ make LLVM_CONFIG=/usr/bin/llvm-config-3.9 LIBCLANGLLVM=1 CXX util/c++/clang.o CXX util/c++/clang-test.o In file included from /home/lizj/linux/tools/include/linux/list.h:5:0, from /home/lizj/linux/tools/perf/util/namespaces.h:13, from /home/lizj/linux/tools/perf/util/util.h:15, from /home/lizj/linux/tools/perf/util/util-cxx.h:20, from util/c++/clang-c.h:5, from util/c++/clang-test.cpp:2: /home/lizj/linux/tools/include/linux/list.h: In function ‘void list_del(list_head*)’: /home/lizj/linux/tools/include/linux/poison.h:14:31: error: pointer of type ‘void *’ used in arithmetic [-Werror=pointer-arith] # define POISON_POINTER_DELTA 0 ^ /home/lizj/linux/tools/include/linux/poison.h:22:41: note: in expansion of macro ‘POISON_POINTER_DELTA’ #define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) ^ /home/lizj/linux/tools/include/linux/list.h:107:16: note: in expansion of macro ‘LIST_POISON1’ entry->next = LIST_POISON1; ^ In file included from /home/lizj/linux/tools/perf/util/namespaces.h:13:0, from /home/lizj/linux/tools/perf/util/util.h:15, from /home/lizj/linux/tools/perf/util/util-cxx.h:20, from util/c++/clang-c.h:5, from util/c++/clang-test.cpp:2: /home/lizj/linux/tools/include/linux/list.h:107:14: error: invalid conversion from ‘void*’ to ‘list_head*’ [-fpermissive] Reported-by: Li Zhijian Cc: Adrian Hunter Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Philip Li Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-m5ei2o0mjshucbr28baf5lqz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/poison.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/include/linux/poison.h b/tools/include/linux/poison.h index 51334edec506..f306a7642509 100644 --- a/tools/include/linux/poison.h +++ b/tools/include/linux/poison.h @@ -14,6 +14,10 @@ # define POISON_POINTER_DELTA 0 #endif +#ifdef __cplusplus +#define LIST_POISON1 NULL +#define LIST_POISON2 NULL +#else /* * These are non-NULL pointers that will result in page faults * under normal circumstances, used to verify that nobody uses @@ -21,6 +25,7 @@ */ #define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) #define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA) +#endif /********** include/linux/timer.h **********/ /* From 0a7c74eae307894c6c95316c382f118aef8481e8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 4 Apr 2017 13:15:04 -0300 Subject: [PATCH 0055/1085] perf tools: Provide mutex wrappers for pthreads rwlocks Andi reported a performance drop in single threaded perf tools such as 'perf script' due to the growing number of locks being put in place to 
allow for multithreaded tools, so wrap the POSIX threads rwlock routines with the names used for such kinds of locks in the Linux kernel and then allow for tools to ask for those locks to be used or not. I.e. a tool may have a multithreaded phase and then switch to single threaded, like the upcoming patches for the synthesizing of PERF_RECORD_{FORK,MMAP,etc} for pre-existing processes to then switch to single threaded mode in 'perf top'. The init routines will not be conditional, this way starting as single threaded to then move to multi threaded mode should be possible. Reported-by: Andi Kleen Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/20170404161739.GH12903@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 1 - tools/perf/builtin-script.c | 2 ++ tools/perf/util/Build | 1 + tools/perf/util/dso.c | 12 ++++----- tools/perf/util/dso.h | 4 +-- tools/perf/util/machine.c | 41 +++++++++++++++--------------- tools/perf/util/machine.h | 3 ++- tools/perf/util/map.c | 34 ++++++++++++------------- tools/perf/util/map.h | 3 ++- tools/perf/util/rwsem.c | 32 +++++++++++++++++++++++ tools/perf/util/rwsem.h | 19 ++++++++++++++ tools/perf/util/symbol.c | 8 +++--- tools/perf/util/thread.c | 4 +-- tools/perf/util/trace-event-info.c | 1 - tools/perf/util/trace-event-read.c | 1 - tools/perf/util/util.c | 13 ++++++++++ tools/perf/util/util.h | 5 ++++ tools/perf/util/vdso.c | 4 +-- 18 files changed, 130 insertions(+), 58 deletions(-) create mode 100644 tools/perf/util/rwsem.c create mode 100644 tools/perf/util/rwsem.h diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index f309c3773522..c747a1af49fe 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -34,7 +34,6 @@ #include #include #include -#include #include static const char *get_filename_for_perf_kvm(void) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 725dbd3dd104..9092de0f7238 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2829,6 +2829,8 @@ int cmd_script(int argc, const char **argv) NULL }; + perf_set_singlethreaded(); + setup_scripting(); argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage, diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 71ab8466714d..369c3163e68c 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -79,6 +79,7 @@ libperf-y += data.o libperf-y += tsc.o libperf-y += cloexec.o libperf-y += call-path.o +libperf-y += rwsem.o libperf-y += thread-stack.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index ffd723179a4b..339e52971380 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1366,9 +1366,9 @@ void __dsos__add(struct dsos *dsos, struct dso *dso) void dsos__add(struct dsos *dsos, struct dso *dso) { - pthread_rwlock_wrlock(&dsos->lock); + down_write(&dsos->lock); __dsos__add(dsos, dso); - pthread_rwlock_unlock(&dsos->lock); + up_write(&dsos->lock); } struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short) @@ -1387,9 +1387,9 @@ struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short) struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short) { struct dso *dso; - pthread_rwlock_rdlock(&dsos->lock); + down_read(&dsos->lock); dso = __dsos__find(dsos, name, cmp_short); - 
pthread_rwlock_unlock(&dsos->lock); + up_read(&dsos->lock); return dso; } @@ -1416,9 +1416,9 @@ struct dso *__dsos__findnew(struct dsos *dsos, const char *name) struct dso *dsos__findnew(struct dsos *dsos, const char *name) { struct dso *dso; - pthread_rwlock_wrlock(&dsos->lock); + down_write(&dsos->lock); dso = dso__get(__dsos__findnew(dsos, name)); - pthread_rwlock_unlock(&dsos->lock); + up_write(&dsos->lock); return dso; } diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index f886141678eb..a2bbb21f301c 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include "rwsem.h" #include #include #include "map.h" @@ -129,7 +129,7 @@ struct dso_cache { struct dsos { struct list_head head; struct rb_root root; /* rbtree root sorted by long name */ - pthread_rwlock_t lock; + struct rw_semaphore lock; }; struct auxtrace_cache; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index ddeea05eae86..585b4a3d64a4 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -30,7 +30,7 @@ static void dsos__init(struct dsos *dsos) { INIT_LIST_HEAD(&dsos->head); dsos->root = RB_ROOT; - pthread_rwlock_init(&dsos->lock, NULL); + init_rwsem(&dsos->lock); } static void machine__threads_init(struct machine *machine) @@ -40,7 +40,7 @@ static void machine__threads_init(struct machine *machine) for (i = 0; i < THREADS__TABLE_SIZE; i++) { struct threads *threads = &machine->threads[i]; threads->entries = RB_ROOT; - pthread_rwlock_init(&threads->lock, NULL); + init_rwsem(&threads->lock); threads->nr = 0; INIT_LIST_HEAD(&threads->dead); threads->last_match = NULL; @@ -130,7 +130,7 @@ static void dsos__purge(struct dsos *dsos) { struct dso *pos, *n; - pthread_rwlock_wrlock(&dsos->lock); + down_write(&dsos->lock); list_for_each_entry_safe(pos, n, &dsos->head, node) { RB_CLEAR_NODE(&pos->rb_node); @@ -139,13 +139,13 @@ static void dsos__purge(struct dsos *dsos) dso__put(pos); } - pthread_rwlock_unlock(&dsos->lock); + up_write(&dsos->lock); } static void dsos__exit(struct dsos *dsos) { dsos__purge(dsos); - pthread_rwlock_destroy(&dsos->lock); + exit_rwsem(&dsos->lock); } void machine__delete_threads(struct machine *machine) @@ -155,7 +155,7 @@ void machine__delete_threads(struct machine *machine) for (i = 0; i < THREADS__TABLE_SIZE; i++) { struct threads *threads = &machine->threads[i]; - pthread_rwlock_wrlock(&threads->lock); + down_write(&threads->lock); nd = rb_first(&threads->entries); while (nd) { struct thread *t = rb_entry(nd, struct thread, rb_node); @@ -163,7 +163,7 @@ void machine__delete_threads(struct machine *machine) nd = rb_next(nd); __machine__remove_thread(machine, t, false); } - pthread_rwlock_unlock(&threads->lock); + up_write(&threads->lock); } } @@ -180,7 +180,7 @@ void machine__exit(struct machine *machine) for (i = 0; i < THREADS__TABLE_SIZE; i++) { struct threads *threads = &machine->threads[i]; - pthread_rwlock_destroy(&threads->lock); + exit_rwsem(&threads->lock); } } @@ -482,9 +482,9 @@ struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, struct threads *threads = machine__threads(machine, tid); struct thread *th; - pthread_rwlock_wrlock(&threads->lock); + down_write(&threads->lock); th = __machine__findnew_thread(machine, pid, tid); - pthread_rwlock_unlock(&threads->lock); + up_write(&threads->lock); return th; } @@ -494,9 +494,9 @@ struct thread *machine__find_thread(struct machine *machine, pid_t pid, struct threads *threads = machine__threads(machine, tid); struct 
thread *th; - pthread_rwlock_rdlock(&threads->lock); + down_read(&threads->lock); th = ____machine__findnew_thread(machine, threads, pid, tid, false); - pthread_rwlock_unlock(&threads->lock); + up_read(&threads->lock); return th; } @@ -588,7 +588,7 @@ static struct dso *machine__findnew_module_dso(struct machine *machine, { struct dso *dso; - pthread_rwlock_wrlock(&machine->dsos.lock); + down_write(&machine->dsos.lock); dso = __dsos__find(&machine->dsos, m->name, true); if (!dso) { @@ -602,7 +602,7 @@ static struct dso *machine__findnew_module_dso(struct machine *machine, dso__get(dso); out_unlock: - pthread_rwlock_unlock(&machine->dsos.lock); + up_write(&machine->dsos.lock); return dso; } @@ -749,7 +749,8 @@ size_t machine__fprintf(struct machine *machine, FILE *fp) for (i = 0; i < THREADS__TABLE_SIZE; i++) { struct threads *threads = &machine->threads[i]; - pthread_rwlock_rdlock(&threads->lock); + + down_read(&threads->lock); ret = fprintf(fp, "Threads: %u\n", threads->nr); @@ -759,7 +760,7 @@ size_t machine__fprintf(struct machine *machine, FILE *fp) ret += thread__fprintf(pos, fp); } - pthread_rwlock_unlock(&threads->lock); + up_read(&threads->lock); } return ret; } @@ -1319,7 +1320,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine, struct dso *kernel = NULL; struct dso *dso; - pthread_rwlock_rdlock(&machine->dsos.lock); + down_read(&machine->dsos.lock); list_for_each_entry(dso, &machine->dsos.head, node) { @@ -1349,7 +1350,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine, break; } - pthread_rwlock_unlock(&machine->dsos.lock); + up_read(&machine->dsos.lock); if (kernel == NULL) kernel = machine__findnew_dso(machine, kmmap_prefix); @@ -1513,7 +1514,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th, BUG_ON(refcount_read(&th->refcnt) == 0); if (lock) - pthread_rwlock_wrlock(&threads->lock); + down_write(&threads->lock); rb_erase_init(&th->rb_node, &threads->entries); RB_CLEAR_NODE(&th->rb_node); --threads->nr; @@ -1524,7 +1525,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th, */ list_add_tail(&th->node, &threads->dead); if (lock) - pthread_rwlock_unlock(&threads->lock); + up_write(&threads->lock); thread__put(th); } diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index fe2f05848050..b1cd516f2025 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -6,6 +6,7 @@ #include "map.h" #include "dso.h" #include "event.h" +#include "rwsem.h" struct addr_location; struct branch_stack; @@ -28,7 +29,7 @@ struct vdso_info; struct threads { struct rb_root entries; - pthread_rwlock_t lock; + struct rw_semaphore lock; unsigned int nr; struct list_head dead; struct thread *last_match; diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index bdaa0a4edc17..5792d7a78152 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -488,7 +488,7 @@ u64 map__objdump_2mem(struct map *map, u64 ip) static void maps__init(struct maps *maps) { maps->entries = RB_ROOT; - pthread_rwlock_init(&maps->lock, NULL); + init_rwsem(&maps->lock); } void map_groups__init(struct map_groups *mg, struct machine *machine) @@ -517,9 +517,9 @@ static void __maps__purge(struct maps *maps) static void maps__exit(struct maps *maps) { - pthread_rwlock_wrlock(&maps->lock); + down_write(&maps->lock); __maps__purge(maps); - pthread_rwlock_unlock(&maps->lock); + up_write(&maps->lock); } void map_groups__exit(struct map_groups *mg) @@ -586,7 +586,7 @@ struct symbol 
*maps__find_symbol_by_name(struct maps *maps, const char *name, struct symbol *sym; struct rb_node *nd; - pthread_rwlock_rdlock(&maps->lock); + down_read(&maps->lock); for (nd = rb_first(&maps->entries); nd; nd = rb_next(nd)) { struct map *pos = rb_entry(nd, struct map, rb_node); @@ -602,7 +602,7 @@ struct symbol *maps__find_symbol_by_name(struct maps *maps, const char *name, sym = NULL; out: - pthread_rwlock_unlock(&maps->lock); + up_read(&maps->lock); return sym; } @@ -638,7 +638,7 @@ static size_t maps__fprintf(struct maps *maps, FILE *fp) size_t printed = 0; struct rb_node *nd; - pthread_rwlock_rdlock(&maps->lock); + down_read(&maps->lock); for (nd = rb_first(&maps->entries); nd; nd = rb_next(nd)) { struct map *pos = rb_entry(nd, struct map, rb_node); @@ -650,7 +650,7 @@ static size_t maps__fprintf(struct maps *maps, FILE *fp) } } - pthread_rwlock_unlock(&maps->lock); + up_read(&maps->lock); return printed; } @@ -682,7 +682,7 @@ static int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp struct rb_node *next; int err = 0; - pthread_rwlock_wrlock(&maps->lock); + down_write(&maps->lock); root = &maps->entries; next = rb_first(root); @@ -750,7 +750,7 @@ put_map: err = 0; out: - pthread_rwlock_unlock(&maps->lock); + up_write(&maps->lock); return err; } @@ -771,7 +771,7 @@ int map_groups__clone(struct thread *thread, struct map *map; struct maps *maps = &parent->maps[type]; - pthread_rwlock_rdlock(&maps->lock); + down_read(&maps->lock); for (map = maps__first(maps); map; map = map__next(map)) { struct map *new = map__clone(map); @@ -788,7 +788,7 @@ int map_groups__clone(struct thread *thread, err = 0; out_unlock: - pthread_rwlock_unlock(&maps->lock); + up_read(&maps->lock); return err; } @@ -815,9 +815,9 @@ static void __maps__insert(struct maps *maps, struct map *map) void maps__insert(struct maps *maps, struct map *map) { - pthread_rwlock_wrlock(&maps->lock); + down_write(&maps->lock); __maps__insert(maps, map); - pthread_rwlock_unlock(&maps->lock); + up_write(&maps->lock); } static void __maps__remove(struct maps *maps, struct map *map) @@ -828,9 +828,9 @@ static void __maps__remove(struct maps *maps, struct map *map) void maps__remove(struct maps *maps, struct map *map) { - pthread_rwlock_wrlock(&maps->lock); + down_write(&maps->lock); __maps__remove(maps, map); - pthread_rwlock_unlock(&maps->lock); + up_write(&maps->lock); } struct map *maps__find(struct maps *maps, u64 ip) @@ -838,7 +838,7 @@ struct map *maps__find(struct maps *maps, u64 ip) struct rb_node **p, *parent = NULL; struct map *m; - pthread_rwlock_rdlock(&maps->lock); + down_read(&maps->lock); p = &maps->entries.rb_node; while (*p != NULL) { @@ -854,7 +854,7 @@ struct map *maps__find(struct maps *maps, u64 ip) m = NULL; out: - pthread_rwlock_unlock(&maps->lock); + up_read(&maps->lock); return m; } diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 73aacf7a7dc4..d5d7442dac7a 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -9,6 +9,7 @@ #include #include #include +#include "rwsem.h" enum map_type { MAP__FUNCTION = 0, @@ -61,7 +62,7 @@ struct kmap { struct maps { struct rb_root entries; - pthread_rwlock_t lock; + struct rw_semaphore lock; }; struct map_groups { diff --git a/tools/perf/util/rwsem.c b/tools/perf/util/rwsem.c new file mode 100644 index 000000000000..5e52e7baa7b6 --- /dev/null +++ b/tools/perf/util/rwsem.c @@ -0,0 +1,32 @@ +#include "util.h" +#include "rwsem.h" + +int init_rwsem(struct rw_semaphore *sem) +{ + return pthread_rwlock_init(&sem->lock, NULL); +} + 
+int exit_rwsem(struct rw_semaphore *sem) +{ + return pthread_rwlock_destroy(&sem->lock); +} + +int down_read(struct rw_semaphore *sem) +{ + return perf_singlethreaded ? 0 : pthread_rwlock_rdlock(&sem->lock); +} + +int up_read(struct rw_semaphore *sem) +{ + return perf_singlethreaded ? 0 : pthread_rwlock_unlock(&sem->lock); +} + +int down_write(struct rw_semaphore *sem) +{ + return perf_singlethreaded ? 0 : pthread_rwlock_wrlock(&sem->lock); +} + +int up_write(struct rw_semaphore *sem) +{ + return perf_singlethreaded ? 0 : pthread_rwlock_unlock(&sem->lock); +} diff --git a/tools/perf/util/rwsem.h b/tools/perf/util/rwsem.h new file mode 100644 index 000000000000..94565ad4d494 --- /dev/null +++ b/tools/perf/util/rwsem.h @@ -0,0 +1,19 @@ +#ifndef _PERF_RWSEM_H +#define _PERF_RWSEM_H + +#include + +struct rw_semaphore { + pthread_rwlock_t lock; +}; + +int init_rwsem(struct rw_semaphore *sem); +int exit_rwsem(struct rw_semaphore *sem); + +int down_read(struct rw_semaphore *sem); +int up_read(struct rw_semaphore *sem); + +int down_write(struct rw_semaphore *sem); +int up_write(struct rw_semaphore *sem); + +#endif /* _PERF_RWSEM_H */ diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 5909ee4c7ade..066e38aa4063 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -226,7 +226,7 @@ void __map_groups__fixup_end(struct map_groups *mg, enum map_type type) struct maps *maps = &mg->maps[type]; struct map *next, *curr; - pthread_rwlock_wrlock(&maps->lock); + down_write(&maps->lock); curr = maps__first(maps); if (curr == NULL) @@ -246,7 +246,7 @@ void __map_groups__fixup_end(struct map_groups *mg, enum map_type type) curr->end = ~0ULL; out_unlock: - pthread_rwlock_unlock(&maps->lock); + up_write(&maps->lock); } struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name) @@ -1671,7 +1671,7 @@ struct map *map_groups__find_by_name(struct map_groups *mg, struct maps *maps = &mg->maps[type]; struct map *map; - pthread_rwlock_rdlock(&maps->lock); + down_read(&maps->lock); for (map = maps__first(maps); map; map = map__next(map)) { if (map->dso && strcmp(map->dso->short_name, name) == 0) @@ -1681,7 +1681,7 @@ struct map *map_groups__find_by_name(struct map_groups *mg, map = NULL; out_unlock: - pthread_rwlock_unlock(&maps->lock); + up_read(&maps->lock); return map; } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index aee9a42102ba..c09bdb509d82 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -264,7 +264,7 @@ static int __thread__prepare_access(struct thread *thread) struct maps *maps = &thread->mg->maps[i]; struct map *map; - pthread_rwlock_rdlock(&maps->lock); + down_read(&maps->lock); for (map = maps__first(maps); map; map = map__next(map)) { err = unwind__prepare_access(thread, map, &initialized); @@ -272,7 +272,7 @@ static int __thread__prepare_access(struct thread *thread) break; } - pthread_rwlock_unlock(&maps->lock); + up_read(&maps->lock); } return err; diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index e7d60d05596d..d7f2113462fb 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index 8a9a677f7576..40b425949aa3 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -27,7 +27,6 @@ #include #include #include -#include #include 
#include #include diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 3c9c39711343..97e0c8e26477 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -23,6 +23,19 @@ /* * XXX We need to find a better place for these things... */ + +bool perf_singlethreaded = true; + +void perf_set_singlethreaded(void) +{ + perf_singlethreaded = true; +} + +void perf_set_multithreaded(void) +{ + perf_singlethreaded = false; +} + unsigned int page_size; int cacheline_size; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 03946d5e4741..6c7e6cc902bb 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -62,4 +62,9 @@ int sched_getcpu(void); int setns(int fd, int nstype); #endif +extern bool perf_singlethreaded; + +void perf_set_singlethreaded(void); +void perf_set_multithreaded(void); + #endif /* GIT_COMPAT_UTIL_H */ diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index d3c39eec89a8..f5f843d3c22f 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -319,7 +319,7 @@ struct dso *machine__findnew_vdso(struct machine *machine, struct vdso_info *vdso_info; struct dso *dso = NULL; - pthread_rwlock_wrlock(&machine->dsos.lock); + down_write(&machine->dsos.lock); if (!machine->vdso_info) machine->vdso_info = vdso_info__new(); @@ -347,7 +347,7 @@ struct dso *machine__findnew_vdso(struct machine *machine, out_unlock: dso__get(dso); - pthread_rwlock_unlock(&machine->dsos.lock); + up_write(&machine->dsos.lock); return dso; } From 66162becb79424ea837cf0436bf1d41a212f30a4 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 31 Aug 2017 15:48:44 +0530 Subject: [PATCH 0056/1085] mmc: host: omap_hsmmc: Remove setting PBIAS voltage PBIAS voltage should be set along with setting vqmmc voltage and these voltages should be set as part of start_signal_voltage_switch callback. However, since omap_hsmmc is about to be deprecated, remove the setting of the PBIAS voltage, leaving it at the reset value of 3.3V (we'll never have to change this to 1.8V since UHS mode support will not be added to omap_hsmmc). This will let the pbias regulator driver be fixed to support a maximum voltage of 3.3V. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap_hsmmc.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 3b5e6d11069b..f2a1137be8bc 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -147,10 +147,6 @@ #define OMAP_MMC_MAX_CLOCK 52000000 #define DRIVER_NAME "omap_hsmmc" -#define VDD_1V8 1800000 /* 180000 uV */ -#define VDD_3V0 3000000 /* 300000 uV */ -#define VDD_165_195 (ffs(MMC_VDD_165_195) - 1) - /* * One controller can have multiple slots, like on some omap boards using * omap.c controller driver. 
Luckily this is not currently done on any known @@ -308,8 +304,7 @@ err_set_ocr: return ret; } -static int omap_hsmmc_set_pbias(struct omap_hsmmc_host *host, bool power_on, - int vdd) +static int omap_hsmmc_set_pbias(struct omap_hsmmc_host *host, bool power_on) { int ret; @@ -317,17 +312,6 @@ static int omap_hsmmc_set_pbias(struct omap_hsmmc_host *host, bool power_on, return 0; if (power_on) { - if (vdd <= VDD_165_195) - ret = regulator_set_voltage(host->pbias, VDD_1V8, - VDD_1V8); - else - ret = regulator_set_voltage(host->pbias, VDD_3V0, - VDD_3V0); - if (ret < 0) { - dev_err(host->dev, "pbias set voltage fail\n"); - return ret; - } - if (host->pbias_enabled == 0) { ret = regulator_enable(host->pbias); if (ret) { @@ -350,8 +334,7 @@ static int omap_hsmmc_set_pbias(struct omap_hsmmc_host *host, bool power_on, return 0; } -static int omap_hsmmc_set_power(struct omap_hsmmc_host *host, int power_on, - int vdd) +static int omap_hsmmc_set_power(struct omap_hsmmc_host *host, int power_on) { struct mmc_host *mmc = host->mmc; int ret = 0; @@ -363,7 +346,7 @@ static int omap_hsmmc_set_power(struct omap_hsmmc_host *host, int power_on, if (IS_ERR(mmc->supply.vmmc)) return 0; - ret = omap_hsmmc_set_pbias(host, false, 0); + ret = omap_hsmmc_set_pbias(host, false); if (ret) return ret; @@ -385,7 +368,7 @@ static int omap_hsmmc_set_power(struct omap_hsmmc_host *host, int power_on, if (ret) return ret; - ret = omap_hsmmc_set_pbias(host, true, vdd); + ret = omap_hsmmc_set_pbias(host, true); if (ret) goto err_set_voltage; } else { @@ -1220,11 +1203,11 @@ static int omap_hsmmc_switch_opcond(struct omap_hsmmc_host *host, int vdd) clk_disable_unprepare(host->dbclk); /* Turn the power off */ - ret = omap_hsmmc_set_power(host, 0, 0); + ret = omap_hsmmc_set_power(host, 0); /* Turn the power ON with given VDD 1.8 or 3.0v */ if (!ret) - ret = omap_hsmmc_set_power(host, 1, vdd); + ret = omap_hsmmc_set_power(host, 1); if (host->dbclk) clk_prepare_enable(host->dbclk); @@ -1621,10 +1604,10 @@ static void omap_hsmmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) if (ios->power_mode != host->power_mode) { switch (ios->power_mode) { case MMC_POWER_OFF: - omap_hsmmc_set_power(host, 0, 0); + omap_hsmmc_set_power(host, 0); break; case MMC_POWER_UP: - omap_hsmmc_set_power(host, 1, ios->vdd); + omap_hsmmc_set_power(host, 1); break; case MMC_POWER_ON: do_send_init_stream = 1; From 9ebd65cbdfbf59e7ef4b9c0c8fd767ff63f60411 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Wed, 6 Sep 2017 17:15:54 +0530 Subject: [PATCH 0057/1085] dt-bindings: sdhci-omap: Add bindings for the sdhci-omap controller Add binding for the TI's sdhci-omap controller. Signed-off-by: Kishon Vijay Abraham I Acked-by: Tony Lindgren Acked-by: Rob Herring Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/sdhci-omap.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 Documentation/devicetree/bindings/mmc/sdhci-omap.txt diff --git a/Documentation/devicetree/bindings/mmc/sdhci-omap.txt b/Documentation/devicetree/bindings/mmc/sdhci-omap.txt new file mode 100644 index 000000000000..51775a372c06 --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/sdhci-omap.txt @@ -0,0 +1,16 @@ +* TI OMAP SDHCI Controller + +Refer to mmc.txt for standard MMC bindings. 
+ +Required properties: +- compatible: Should be "ti,dra7-sdhci" for DRA7 and DRA72 controllers +- ti,hwmods: Must be "mmc", is controller instance starting 1 + +Example: + mmc1: mmc@4809c000 { + compatible = "ti,dra7-sdhci"; + reg = <0x4809c000 0x400>; + ti,hwmods = "mmc1"; + bus-width = <4>; + vmmc-supply = <&vmmc>; /* phandle to regulator node */ + }; From 7d326930d3522a1183b8d54126c524fcbccd3343 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Wed, 6 Sep 2017 17:15:55 +0530 Subject: [PATCH 0058/1085] mmc: sdhci-omap: Add OMAP SDHCI driver Create a new sdhci-omap driver to configure the eMMC/SD/SDIO controller in TI's OMAP SoCs making use of the SDHCI core library. For OMAP specific configurations, populate sdhci_ops with OMAP specific callbacks and use SDHCI quirks. Enable only high speed mode for both SD and eMMC here and add other UHS mode support later. Signed-off-by: Kishon Vijay Abraham I Acked-by: Tony Lindgren Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 12 + drivers/mmc/host/Makefile | 1 + drivers/mmc/host/sdhci-omap.c | 607 ++++++++++++++++++++++++++++++++++ 3 files changed, 620 insertions(+) create mode 100644 drivers/mmc/host/sdhci-omap.c diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 02179ed2a40d..03178c42efbb 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -899,3 +899,15 @@ config MMC_SDHCI_XENON This selects Marvell Xenon eMMC/SD/SDIO SDHCI. If you have a controller with this interface, say Y or M here. If unsure, say N. + +config MMC_SDHCI_OMAP + tristate "TI SDHCI Controller Support" + depends on MMC_SDHCI_PLTFM && OF + help + This selects the Secure Digital Host Controller Interface (SDHCI) + support present in TI's DRA7 SOCs. The controller supports + SD/MMC/SDIO devices. + + If you have a controller with this interface, say Y or M here. + + If unsure, say N. diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 303f5cd46cd9..2b5a8133948d 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_MMC_SDHCI_MSM) += sdhci-msm.o obj-$(CONFIG_MMC_SDHCI_ST) += sdhci-st.o obj-$(CONFIG_MMC_SDHCI_MICROCHIP_PIC32) += sdhci-pic32.o obj-$(CONFIG_MMC_SDHCI_BRCMSTB) += sdhci-brcmstb.o +obj-$(CONFIG_MMC_SDHCI_OMAP) += sdhci-omap.o ifeq ($(CONFIG_CB710_DEBUG),y) CFLAGS-cb710-mmc += -DDEBUG diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c new file mode 100644 index 000000000000..5ddae39816b7 --- /dev/null +++ b/drivers/mmc/host/sdhci-omap.c @@ -0,0 +1,607 @@ +/** + * SDHCI Controller driver for TI's OMAP SoCs + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sdhci-pltfm.h" + +#define SDHCI_OMAP_CON 0x12c +#define CON_DW8 BIT(5) +#define CON_DMA_MASTER BIT(20) +#define CON_INIT BIT(1) +#define CON_OD BIT(0) + +#define SDHCI_OMAP_CMD 0x20c + +#define SDHCI_OMAP_HCTL 0x228 +#define HCTL_SDBP BIT(8) +#define HCTL_SDVS_SHIFT 9 +#define HCTL_SDVS_MASK (0x7 << HCTL_SDVS_SHIFT) +#define HCTL_SDVS_33 (0x7 << HCTL_SDVS_SHIFT) +#define HCTL_SDVS_30 (0x6 << HCTL_SDVS_SHIFT) +#define HCTL_SDVS_18 (0x5 << HCTL_SDVS_SHIFT) + +#define SDHCI_OMAP_SYSCTL 0x22c +#define SYSCTL_CEN BIT(2) +#define SYSCTL_CLKD_SHIFT 6 +#define SYSCTL_CLKD_MASK 0x3ff + +#define SDHCI_OMAP_STAT 0x230 + +#define SDHCI_OMAP_IE 0x234 +#define INT_CC_EN BIT(0) + +#define SDHCI_OMAP_AC12 0x23c +#define AC12_V1V8_SIGEN BIT(19) + +#define SDHCI_OMAP_CAPA 0x240 +#define CAPA_VS33 BIT(24) +#define CAPA_VS30 BIT(25) +#define CAPA_VS18 BIT(26) + +#define SDHCI_OMAP_TIMEOUT 1 /* 1 msec */ + +#define SYSCTL_CLKD_MAX 0x3FF + +#define IOV_1V8 1800000 /* 180000 uV */ +#define IOV_3V0 3000000 /* 300000 uV */ +#define IOV_3V3 3300000 /* 330000 uV */ + +struct sdhci_omap_data { + u32 offset; +}; + +struct sdhci_omap_host { + void __iomem *base; + struct device *dev; + struct regulator *pbias; + bool pbias_enabled; + struct sdhci_host *host; + u8 bus_mode; + u8 power_mode; +}; + +static inline u32 sdhci_omap_readl(struct sdhci_omap_host *host, + unsigned int offset) +{ + return readl(host->base + offset); +} + +static inline void sdhci_omap_writel(struct sdhci_omap_host *host, + unsigned int offset, u32 data) +{ + writel(data, host->base + offset); +} + +static int sdhci_omap_set_pbias(struct sdhci_omap_host *omap_host, + bool power_on, unsigned int iov) +{ + int ret; + struct device *dev = omap_host->dev; + + if (IS_ERR(omap_host->pbias)) + return 0; + + if (power_on) { + ret = regulator_set_voltage(omap_host->pbias, iov, iov); + if (ret) { + dev_err(dev, "pbias set voltage failed\n"); + return ret; + } + + if (omap_host->pbias_enabled) + return 0; + + ret = regulator_enable(omap_host->pbias); + if (ret) { + dev_err(dev, "pbias reg enable fail\n"); + return ret; + } + + omap_host->pbias_enabled = true; + } else { + if (!omap_host->pbias_enabled) + return 0; + + ret = regulator_disable(omap_host->pbias); + if (ret) { + dev_err(dev, "pbias reg disable fail\n"); + return ret; + } + omap_host->pbias_enabled = false; + } + + return 0; +} + +static int sdhci_omap_enable_iov(struct sdhci_omap_host *omap_host, + unsigned int iov) +{ + int ret; + struct sdhci_host *host = omap_host->host; + struct mmc_host *mmc = host->mmc; + + ret = sdhci_omap_set_pbias(omap_host, false, 0); + if (ret) + return ret; + + if (!IS_ERR(mmc->supply.vqmmc)) { + ret = regulator_set_voltage(mmc->supply.vqmmc, iov, iov); + if (ret) { + dev_err(mmc_dev(mmc), "vqmmc set voltage failed\n"); + return ret; + } + } + + ret = sdhci_omap_set_pbias(omap_host, true, iov); + if (ret) + return ret; + + return 0; +} + +static void sdhci_omap_conf_bus_power(struct sdhci_omap_host *omap_host, + unsigned char signal_voltage) +{ + u32 reg; + ktime_t timeout; + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_HCTL); + reg &= ~HCTL_SDVS_MASK; + + if (signal_voltage == MMC_SIGNAL_VOLTAGE_330) + reg |= HCTL_SDVS_33; + else + reg |= HCTL_SDVS_18; + + sdhci_omap_writel(omap_host, SDHCI_OMAP_HCTL, reg); + + reg |= HCTL_SDBP; + sdhci_omap_writel(omap_host, SDHCI_OMAP_HCTL, reg); + + /* wait 1ms */ + timeout = ktime_add_ms(ktime_get(), SDHCI_OMAP_TIMEOUT); + while 
(!(sdhci_omap_readl(omap_host, SDHCI_OMAP_HCTL) & HCTL_SDBP)) { + if (WARN_ON(ktime_after(ktime_get(), timeout))) + return; + usleep_range(5, 10); + } +} + +static int sdhci_omap_start_signal_voltage_switch(struct mmc_host *mmc, + struct mmc_ios *ios) +{ + u32 reg; + int ret; + unsigned int iov; + struct sdhci_host *host = mmc_priv(mmc); + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_omap_host *omap_host; + struct device *dev; + + pltfm_host = sdhci_priv(host); + omap_host = sdhci_pltfm_priv(pltfm_host); + dev = omap_host->dev; + + if (ios->signal_voltage == MMC_SIGNAL_VOLTAGE_330) { + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_CAPA); + if (!(reg & CAPA_VS33)) + return -EOPNOTSUPP; + + sdhci_omap_conf_bus_power(omap_host, ios->signal_voltage); + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_AC12); + reg &= ~AC12_V1V8_SIGEN; + sdhci_omap_writel(omap_host, SDHCI_OMAP_AC12, reg); + + iov = IOV_3V3; + } else if (ios->signal_voltage == MMC_SIGNAL_VOLTAGE_180) { + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_CAPA); + if (!(reg & CAPA_VS18)) + return -EOPNOTSUPP; + + sdhci_omap_conf_bus_power(omap_host, ios->signal_voltage); + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_AC12); + reg |= AC12_V1V8_SIGEN; + sdhci_omap_writel(omap_host, SDHCI_OMAP_AC12, reg); + + iov = IOV_1V8; + } else { + return -EOPNOTSUPP; + } + + ret = sdhci_omap_enable_iov(omap_host, iov); + if (ret) { + dev_err(dev, "failed to switch IO voltage to %dmV\n", iov); + return ret; + } + + dev_dbg(dev, "IO voltage switched to %dmV\n", iov); + return 0; +} + +static void sdhci_omap_set_bus_mode(struct sdhci_omap_host *omap_host, + unsigned int mode) +{ + u32 reg; + + if (omap_host->bus_mode == mode) + return; + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_CON); + if (mode == MMC_BUSMODE_OPENDRAIN) + reg |= CON_OD; + else + reg &= ~CON_OD; + sdhci_omap_writel(omap_host, SDHCI_OMAP_CON, reg); + + omap_host->bus_mode = mode; +} + +void sdhci_omap_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) +{ + struct sdhci_host *host = mmc_priv(mmc); + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_omap_host *omap_host; + + pltfm_host = sdhci_priv(host); + omap_host = sdhci_pltfm_priv(pltfm_host); + + sdhci_omap_set_bus_mode(omap_host, ios->bus_mode); + sdhci_set_ios(mmc, ios); +} + +static u16 sdhci_omap_calc_divisor(struct sdhci_pltfm_host *host, + unsigned int clock) +{ + u16 dsor; + + dsor = DIV_ROUND_UP(clk_get_rate(host->clk), clock); + if (dsor > SYSCTL_CLKD_MAX) + dsor = SYSCTL_CLKD_MAX; + + return dsor; +} + +static void sdhci_omap_start_clock(struct sdhci_omap_host *omap_host) +{ + u32 reg; + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_SYSCTL); + reg |= SYSCTL_CEN; + sdhci_omap_writel(omap_host, SDHCI_OMAP_SYSCTL, reg); +} + +static void sdhci_omap_stop_clock(struct sdhci_omap_host *omap_host) +{ + u32 reg; + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_SYSCTL); + reg &= ~SYSCTL_CEN; + sdhci_omap_writel(omap_host, SDHCI_OMAP_SYSCTL, reg); +} + +static void sdhci_omap_set_clock(struct sdhci_host *host, unsigned int clock) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_omap_host *omap_host = sdhci_pltfm_priv(pltfm_host); + unsigned long clkdiv; + + sdhci_omap_stop_clock(omap_host); + + if (!clock) + return; + + clkdiv = sdhci_omap_calc_divisor(pltfm_host, clock); + clkdiv = (clkdiv & SYSCTL_CLKD_MASK) << SYSCTL_CLKD_SHIFT; + sdhci_enable_clk(host, clkdiv); + + sdhci_omap_start_clock(omap_host); +} + +void sdhci_omap_set_power(struct sdhci_host *host, unsigned char mode, + unsigned 
short vdd) +{ + struct mmc_host *mmc = host->mmc; + + mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd); +} + +static int sdhci_omap_enable_dma(struct sdhci_host *host) +{ + u32 reg; + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_omap_host *omap_host = sdhci_pltfm_priv(pltfm_host); + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_CON); + reg |= CON_DMA_MASTER; + sdhci_omap_writel(omap_host, SDHCI_OMAP_CON, reg); + + return 0; +} + +unsigned int sdhci_omap_get_min_clock(struct sdhci_host *host) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + + return clk_get_rate(pltfm_host->clk) / SYSCTL_CLKD_MAX; +} + +static void sdhci_omap_set_bus_width(struct sdhci_host *host, int width) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_omap_host *omap_host = sdhci_pltfm_priv(pltfm_host); + u32 reg; + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_CON); + if (width == MMC_BUS_WIDTH_8) + reg |= CON_DW8; + else + reg &= ~CON_DW8; + sdhci_omap_writel(omap_host, SDHCI_OMAP_CON, reg); + + sdhci_set_bus_width(host, width); +} + +static void sdhci_omap_init_74_clocks(struct sdhci_host *host, u8 power_mode) +{ + u32 reg; + ktime_t timeout; + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_omap_host *omap_host = sdhci_pltfm_priv(pltfm_host); + + if (omap_host->power_mode == power_mode) + return; + + if (power_mode != MMC_POWER_ON) + return; + + disable_irq(host->irq); + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_CON); + reg |= CON_INIT; + sdhci_omap_writel(omap_host, SDHCI_OMAP_CON, reg); + sdhci_omap_writel(omap_host, SDHCI_OMAP_CMD, 0x0); + + /* wait 1ms */ + timeout = ktime_add_ms(ktime_get(), SDHCI_OMAP_TIMEOUT); + while (!(sdhci_omap_readl(omap_host, SDHCI_OMAP_STAT) & INT_CC_EN)) { + if (WARN_ON(ktime_after(ktime_get(), timeout))) + return; + usleep_range(5, 10); + } + + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_CON); + reg &= ~CON_INIT; + sdhci_omap_writel(omap_host, SDHCI_OMAP_CON, reg); + sdhci_omap_writel(omap_host, SDHCI_OMAP_STAT, INT_CC_EN); + + enable_irq(host->irq); + + omap_host->power_mode = power_mode; +} + +static struct sdhci_ops sdhci_omap_ops = { + .set_clock = sdhci_omap_set_clock, + .set_power = sdhci_omap_set_power, + .enable_dma = sdhci_omap_enable_dma, + .get_max_clock = sdhci_pltfm_clk_get_max_clock, + .get_min_clock = sdhci_omap_get_min_clock, + .set_bus_width = sdhci_omap_set_bus_width, + .platform_send_init_74_clocks = sdhci_omap_init_74_clocks, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, +}; + +static int sdhci_omap_set_capabilities(struct sdhci_omap_host *omap_host) +{ + u32 reg; + int ret = 0; + struct device *dev = omap_host->dev; + struct regulator *vqmmc; + + vqmmc = regulator_get(dev, "vqmmc"); + if (IS_ERR(vqmmc)) { + ret = PTR_ERR(vqmmc); + goto reg_put; + } + + /* voltage capabilities might be set by boot loader, clear it */ + reg = sdhci_omap_readl(omap_host, SDHCI_OMAP_CAPA); + reg &= ~(CAPA_VS18 | CAPA_VS30 | CAPA_VS33); + + if (regulator_is_supported_voltage(vqmmc, IOV_3V3, IOV_3V3)) + reg |= CAPA_VS33; + if (regulator_is_supported_voltage(vqmmc, IOV_1V8, IOV_1V8)) + reg |= CAPA_VS18; + + sdhci_omap_writel(omap_host, SDHCI_OMAP_CAPA, reg); + +reg_put: + regulator_put(vqmmc); + + return ret; +} + +static const struct sdhci_pltfm_data sdhci_omap_pdata = { + .quirks = SDHCI_QUIRK_BROKEN_CARD_DETECTION | + SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | + SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN | + SDHCI_QUIRK_NO_HISPD_BIT | + SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC, 
+ .quirks2 = SDHCI_QUIRK2_NO_1_8_V | + SDHCI_QUIRK2_ACMD23_BROKEN | + SDHCI_QUIRK2_RSP_136_HAS_CRC, + .ops = &sdhci_omap_ops, +}; + +static const struct sdhci_omap_data dra7_data = { + .offset = 0x200, +}; + +static const struct of_device_id omap_sdhci_match[] = { + { .compatible = "ti,dra7-sdhci", .data = &dra7_data }, + {}, +}; +MODULE_DEVICE_TABLE(of, omap_sdhci_match); + +static int sdhci_omap_probe(struct platform_device *pdev) +{ + int ret; + u32 offset; + struct device *dev = &pdev->dev; + struct sdhci_host *host; + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_omap_host *omap_host; + struct mmc_host *mmc; + const struct of_device_id *match; + struct sdhci_omap_data *data; + + match = of_match_device(omap_sdhci_match, dev); + if (!match) + return -EINVAL; + + data = (struct sdhci_omap_data *)match->data; + if (!data) { + dev_err(dev, "no sdhci omap data\n"); + return -EINVAL; + } + offset = data->offset; + + host = sdhci_pltfm_init(pdev, &sdhci_omap_pdata, + sizeof(*omap_host)); + if (IS_ERR(host)) { + dev_err(dev, "Failed sdhci_pltfm_init\n"); + return PTR_ERR(host); + } + + pltfm_host = sdhci_priv(host); + omap_host = sdhci_pltfm_priv(pltfm_host); + omap_host->host = host; + omap_host->base = host->ioaddr; + omap_host->dev = dev; + host->ioaddr += offset; + + mmc = host->mmc; + ret = mmc_of_parse(mmc); + if (ret) + goto err_pltfm_free; + + pltfm_host->clk = devm_clk_get(dev, "fck"); + if (IS_ERR(pltfm_host->clk)) { + ret = PTR_ERR(pltfm_host->clk); + goto err_pltfm_free; + } + + ret = clk_set_rate(pltfm_host->clk, mmc->f_max); + if (ret) { + dev_err(dev, "failed to set clock to %d\n", mmc->f_max); + goto err_pltfm_free; + } + + omap_host->pbias = devm_regulator_get_optional(dev, "pbias"); + if (IS_ERR(omap_host->pbias)) { + ret = PTR_ERR(omap_host->pbias); + if (ret != -ENODEV) + goto err_pltfm_free; + dev_dbg(dev, "unable to get pbias regulator %d\n", ret); + } + omap_host->pbias_enabled = false; + + /* + * omap_device_pm_domain has callbacks to enable the main + * functional clock, interface clock and also configure the + * SYSCONFIG register of omap devices. The callback will be invoked + * as part of pm_runtime_get_sync. 
+ */ + pm_runtime_enable(dev); + ret = pm_runtime_get_sync(dev); + if (ret < 0) { + dev_err(dev, "pm_runtime_get_sync failed\n"); + pm_runtime_put_noidle(dev); + goto err_rpm_disable; + } + + ret = sdhci_omap_set_capabilities(omap_host); + if (ret) { + dev_err(dev, "failed to set system capabilities\n"); + goto err_put_sync; + } + + host->mmc_host_ops.get_ro = mmc_gpio_get_ro; + host->mmc_host_ops.start_signal_voltage_switch = + sdhci_omap_start_signal_voltage_switch; + host->mmc_host_ops.set_ios = sdhci_omap_set_ios; + + sdhci_read_caps(host); + host->caps |= SDHCI_CAN_DO_ADMA2; + + ret = sdhci_add_host(host); + if (ret) + goto err_put_sync; + + return 0; + +err_put_sync: + pm_runtime_put_sync(dev); + +err_rpm_disable: + pm_runtime_disable(dev); + +err_pltfm_free: + sdhci_pltfm_free(pdev); + return ret; +} + +static int sdhci_omap_remove(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sdhci_host *host = platform_get_drvdata(pdev); + + sdhci_remove_host(host, true); + pm_runtime_put_sync(dev); + pm_runtime_disable(dev); + sdhci_pltfm_free(pdev); + + return 0; +} + +static struct platform_driver sdhci_omap_driver = { + .probe = sdhci_omap_probe, + .remove = sdhci_omap_remove, + .driver = { + .name = "sdhci-omap", + .of_match_table = omap_sdhci_match, + }, +}; + +module_platform_driver(sdhci_omap_driver); + +MODULE_DESCRIPTION("SDHCI driver for OMAP SoCs"); +MODULE_AUTHOR("Texas Instruments Inc."); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:sdhci_omap"); From 8438964f35b1afa427767147ade58d44758c692d Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Wed, 6 Sep 2017 17:15:56 +0530 Subject: [PATCH 0059/1085] MAINTAINERS: Add TI OMAP SDHCI Maintainer Add Maintainer for the TI OMAP SDHCI driver. Signed-off-by: Kishon Vijay Abraham I Acked-by: Adrian Hunter Acked-by: Tony Lindgren Signed-off-by: Ulf Hansson --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..5662ce0ef239 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12026,6 +12026,12 @@ L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/sdhci-spear.c +SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) TI OMAP DRIVER +M: Kishon Vijay Abraham I +L: linux-mmc@vger.kernel.org +S: Maintained +F: drivers/mmc/host/sdhci-omap.c + SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER M: Scott Bauer M: Jonathan Derrick From 3c52b5c64326d9dcfee4e10611c53ec1b1b20675 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 6 Sep 2017 17:18:08 +0200 Subject: [PATCH 0060/1085] x86/asm: Remove unnecessary \n\t in front of CC_SET() from asm templates There is no need for \n\t in front of CC_SET(), as the macro already includes these two. 
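For illustration, a minimal sketch of the helpers involved (simplified from arch/x86/include/asm/asm.h, assuming a compiler with flag-output support, i.e. __GCC_ASM_FLAG_OUTPUTS__ defined):

#define CC_SET(c) "\n\t/* output condition code " #c "*/\n"
#define CC_OUT(c) "=@cc" #c

static inline bool sketch_test_bit(long nr, volatile const unsigned long *addr)
{
	bool oldbit;

	/* No explicit "\n\t" before CC_SET(): the macro already emits one */
	asm volatile("bt %2,%1"
		     CC_SET(c)
		     : CC_OUT(c) (oldbit)
		     : "m" (*(unsigned long *)addr), "Ir" (nr));
	return oldbit;
}

With the explicit "\n\t" dropped, the assembler template still contains exactly one newline/tab sequence between the instruction and the condition-code output, instead of two.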
Signed-off-by: Uros Bizjak Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170906151808.5634-1-ubizjak@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/archrandom.h | 8 ++++---- arch/x86/include/asm/bitops.h | 10 +++++----- arch/x86/include/asm/percpu.h | 2 +- arch/x86/include/asm/rmwcc.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 5b0579abb398..3ac991d81e74 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_LONG "\n\t" + asm volatile(RDRAND_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_INT "\n\t" + asm volatile(RDRAND_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v) static inline bool rdseed_long(unsigned long *v) { bool ok; - asm volatile(RDSEED_LONG "\n\t" + asm volatile(RDSEED_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; @@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v) static inline bool rdseed_int(unsigned int *v) { bool ok; - asm volatile(RDSEED_INT "\n\t" + asm volatile(RDSEED_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 854022772c5b..8cee8db6dffb 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -142,7 +142,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1\n\t" + asm volatile(LOCK_PREFIX "andb %2,%1" CC_SET(s) : CC_OUT(s) (negative), ADDR : "ir" ((char) ~(1 << nr)) : "memory"); @@ -245,7 +245,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * { bool oldbit; - asm("bts %2,%1\n\t" + asm("bts %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -285,7 +285,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long { bool oldbit; - asm volatile("btr %2,%1\n\t" + asm volatile("btr %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -297,7 +297,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon { bool oldbit; - asm volatile("btc %2,%1\n\t" + asm volatile("btc %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr) : "memory"); @@ -328,7 +328,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l { bool oldbit; - asm volatile("bt %2,%1\n\t" + asm volatile("bt %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 9fa03604b2b3..b21a475fd7ed 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -525,7 +525,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, { bool oldbit; - asm volatile("bt "__percpu_arg(2)",%1\n\t" + asm volatile("bt "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 
045f99211a99..0c411c8bbdbd 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -28,7 +28,7 @@ cc_label: \ #define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ bool c; \ - asm volatile (fullop ";" CC_SET(cc) \ + asm volatile (fullop CC_SET(cc) \ : [counter] "+m" (var), CC_OUT(cc) (c) \ : __VA_ARGS__ : clobbers); \ return c; \ From c9c8b4d6d0259ac914309f8fbcef022792c99a89 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 16 Aug 2017 19:41:52 +0200 Subject: [PATCH 0061/1085] EDAC, ghes: Remove symbol exports They're called from builtin code so no need for the exports. Signed-off-by: Borislav Petkov --- drivers/edac/ghes_edac.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 6f80eb65c26c..8acdf29cf3a4 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -401,7 +401,6 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev, /* Report the error via EDAC API */ edac_raw_mc_handle_error(type, mci, e); } -EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error); int ghes_edac_register(struct ghes *ghes, struct device *dev) { @@ -505,7 +504,6 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) mutex_unlock(&ghes_edac_lock); return 0; } -EXPORT_SYMBOL_GPL(ghes_edac_register); void ghes_edac_unregister(struct ghes *ghes) { @@ -521,4 +519,3 @@ void ghes_edac_unregister(struct ghes *ghes) } } } -EXPORT_SYMBOL_GPL(ghes_edac_unregister); From 0fe5f281f749f1d4e462a282ef8ba76407a11fd1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 16 Aug 2017 10:33:44 +0200 Subject: [PATCH 0062/1085] EDAC, ghes: Model a single, logical memory controller We're enumerating the DIMMs through a DMI walk and since we can't get any more detailed topological information about which DIMMs belong to which memory controller, convert it to a single, logical controller which contains all the DIMMs. The error reporting path from GHES ghes_edac_report_mem_error() doesn't get called in NMI context but add a warning about it to catch any changes in the future as if so, our locking scheme will be insufficient then. Signed-off-by: Borislav Petkov --- drivers/edac/ghes_edac.c | 114 +++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 64 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 8acdf29cf3a4..8d904df75ad8 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -28,10 +28,15 @@ struct ghes_edac_pvt { char msg[80]; }; -static LIST_HEAD(ghes_reglist); -static DEFINE_MUTEX(ghes_edac_lock); -static int ghes_edac_mc_num; +static atomic_t ghes_init = ATOMIC_INIT(0); +static struct ghes_edac_pvt *ghes_pvt; +/* + * Sync with other, potentially concurrent callers of + * ghes_edac_report_mem_error(). We don't know what the + * "inventive" firmware would do. + */ +static DEFINE_SPINLOCK(ghes_lock); /* Memory Device - Type 17 of SMBIOS spec */ struct memdev_dmi_entry { @@ -169,18 +174,26 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev, enum hw_event_mc_err_type type; struct edac_raw_error_desc *e; struct mem_ctl_info *mci; - struct ghes_edac_pvt *pvt = NULL; + struct ghes_edac_pvt *pvt = ghes_pvt; + unsigned long flags; char *p; u8 grain_bits; - list_for_each_entry(pvt, &ghes_reglist, list) { - if (ghes == pvt->ghes) - break; - } if (!pvt) { pr_err("Internal error: Can't find EDAC structure\n"); return; } + + /* + * We can do the locking below because GHES defers error processing + * from NMI to IRQ context. 
Whenever that changes, we'd at least + * know. + */ + if (WARN_ON_ONCE(in_nmi())) + return; + + spin_lock_irqsave(&ghes_lock, flags); + mci = pvt->mci; e = &mci->error_desc; @@ -398,8 +411,8 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev, (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, grain_bits, e->syndrome, pvt->detail_location); - /* Report the error via EDAC API */ edac_raw_mc_handle_error(type, mci, e); + spin_unlock_irqrestore(&ghes_lock, flags); } int ghes_edac_register(struct ghes *ghes, struct device *dev) @@ -408,9 +421,14 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) int rc, num_dimm = 0; struct mem_ctl_info *mci; struct edac_mc_layer layers[1]; - struct ghes_edac_pvt *pvt; struct ghes_edac_dimm_fill dimm_fill; + /* + * We have only one logical memory controller to which all DIMMs belong. + */ + if (atomic_inc_return(&ghes_init) > 1) + return 0; + /* Get the number of DIMMs */ dmi_walk(ghes_edac_count_dimms, &num_dimm); @@ -424,26 +442,17 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) layers[0].size = num_dimm; layers[0].is_virt_csrow = true; - /* - * We need to serialize edac_mc_alloc() and edac_mc_add_mc(), - * to avoid duplicated memory controller numbers - */ - mutex_lock(&ghes_edac_lock); - mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers, - sizeof(*pvt)); + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt)); if (!mci) { pr_info("Can't allocate memory for EDAC data\n"); - mutex_unlock(&ghes_edac_lock); return -ENOMEM; } - pvt = mci->pvt_info; - memset(pvt, 0, sizeof(*pvt)); - list_add_tail(&pvt->list, &ghes_reglist); - pvt->ghes = ghes; - pvt->mci = mci; - mci->pdev = dev; + ghes_pvt = mci->pvt_info; + ghes_pvt->ghes = ghes; + ghes_pvt->mci = mci; + mci->pdev = dev; mci->mtype_cap = MEM_FLAG_EMPTY; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; @@ -451,36 +460,23 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) mci->ctl_name = "ghes_edac"; mci->dev_name = "ghes"; - if (!ghes_edac_mc_num) { - if (!fake) { - pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); - pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); - pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); - pr_info("If you find incorrect reports, please contact your hardware vendor\n"); - pr_info("to correct its BIOS.\n"); - pr_info("This system has %d DIMM sockets.\n", - num_dimm); - } else { - pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); - pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); - pr_info("work on such system. Use this driver with caution\n"); - } + if (!fake) { + pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); + pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); + pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); + pr_info("If you find incorrect reports, please contact your hardware vendor\n"); + pr_info("to correct its BIOS.\n"); + pr_info("This system has %d DIMM sockets.\n", num_dimm); + } else { + pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); + pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); + pr_info("work on such system. 
Use this driver with caution\n"); } if (!fake) { - /* - * Fill DIMM info from DMI for the memory controller #0 - * - * Keep it in blank for the other memory controllers, as - * there's no reliable way to properly credit each DIMM to - * the memory controller, as different BIOSes fill the - * DMI bank location fields on different ways - */ - if (!ghes_edac_mc_num) { - dimm_fill.count = 0; - dimm_fill.mci = mci; - dmi_walk(ghes_edac_dmidecode, &dimm_fill); - } + dimm_fill.count = 0; + dimm_fill.mci = mci; + dmi_walk(ghes_edac_dmidecode, &dimm_fill); } else { struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, 0, 0, 0); @@ -496,26 +492,16 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) if (rc < 0) { pr_info("Can't register at EDAC core\n"); edac_mc_free(mci); - mutex_unlock(&ghes_edac_lock); return -ENODEV; } - - ghes_edac_mc_num++; - mutex_unlock(&ghes_edac_lock); return 0; } void ghes_edac_unregister(struct ghes *ghes) { struct mem_ctl_info *mci; - struct ghes_edac_pvt *pvt, *tmp; - list_for_each_entry_safe(pvt, tmp, &ghes_reglist, list) { - if (ghes == pvt->ghes) { - mci = pvt->mci; - edac_mc_del_mc(mci->pdev); - edac_mc_free(mci); - list_del(&pvt->list); - } - } + mci = ghes_pvt->mci; + edac_mc_del_mc(mci->pdev); + edac_mc_free(mci); } From 5deed6b6a479ad5851d7ead6412dc6faa84a694e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Aug 2017 16:54:45 -0600 Subject: [PATCH 0063/1085] EDAC, ghes: Add platform check The ghes_edac driver was introduced in 2013 [1], but it has not been enabled by any distro yet. This driver obtains error info from firmware interfaces (APEI), which are not properly implemented on many platforms, as the driver says on load: This EDAC driver relies on BIOS to enumerate memory and get error reports. Unfortunately, not all BIOSes reflect the memory layout correctly. So, the end result of using this driver varies from vendor to vendor. If you find incorrect reports, please contact your hardware vendor to correct its BIOS. To get out from this situation, add a platform check to selectively enable the driver on platforms that are known to have proper APEI firmware implementation. "ghes_edac.force_load=1" skips this platform check. [1]: https://lkml.kernel.org/r/cover.1360931635.git.mchehab@redhat.com Signed-off-by: Toshi Kani Cc: Mauro Carvalho Chehab Cc: Tony Luck Cc: linux-acpi@vger.kernel.org Cc: linux-edac Link: http://lkml.kernel.org/r/20170823225447.15608-4-toshi.kani@hpe.com Signed-off-by: Borislav Petkov --- drivers/edac/ghes_edac.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 8d904df75ad8..68b6ee18bea6 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -38,6 +38,10 @@ static struct ghes_edac_pvt *ghes_pvt; */ static DEFINE_SPINLOCK(ghes_lock); +/* "ghes_edac.force_load=1" skips the platform check */ +static bool __read_mostly force_load; +module_param(force_load, bool, 0); + /* Memory Device - Type 17 of SMBIOS spec */ struct memdev_dmi_entry { u8 type; @@ -415,6 +419,14 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev, spin_unlock_irqrestore(&ghes_lock, flags); } +/* + * Known systems that are safe to enable this module. 
+ */ +static struct acpi_platform_list plat_list[] = { + {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions}, + { } /* End */ +}; + int ghes_edac_register(struct ghes *ghes, struct device *dev) { bool fake = false; @@ -422,6 +434,12 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) struct mem_ctl_info *mci; struct edac_mc_layer layers[1]; struct ghes_edac_dimm_fill dimm_fill; + int idx; + + /* Check if safe to enable on this system */ + idx = acpi_match_platform_list(plat_list); + if (!force_load && idx < 0) + return 0; /* * We have only one logical memory controller to which all DIMMs belong. @@ -460,17 +478,17 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) mci->ctl_name = "ghes_edac"; mci->dev_name = "ghes"; - if (!fake) { + if (fake) { + pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); + pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); + pr_info("work on such system. Use this driver with caution\n"); + } else if (idx < 0) { pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); pr_info("If you find incorrect reports, please contact your hardware vendor\n"); pr_info("to correct its BIOS.\n"); pr_info("This system has %d DIMM sockets.\n", num_dimm); - } else { - pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); - pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); - pr_info("work on such system. Use this driver with caution\n"); } if (!fake) { From 3877c7d1e24c05eeb1c57ade04d2527d511f25a1 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Aug 2017 16:54:46 -0600 Subject: [PATCH 0064/1085] EDAC: Add helper which returns the loaded platform driver Only a single EDAC platform driver can be loaded. When ghes_edac is enabled, an EDAC platform driver still attempts to register itself and fails in edac_mc_add_mc(). Add edac_get_owner() so that EDAC platform drivers can check the owner first. Signed-off-by: Toshi Kani Suggested-by: Borislav Petkov Cc: Mauro Carvalho Chehab Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20170823225447.15608-5-toshi.kani@hpe.com [ Massage commit message. ] Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 7 ++++++- drivers/edac/edac_mc.h | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 480072139b7a..48193f5f3b56 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -53,7 +53,7 @@ static LIST_HEAD(mc_devices); * Used to lock EDAC MC to just one module, avoiding two drivers e. g. * apei/ghes and i7core_edac to be used at the same time. */ -static void const *edac_mc_owner; +static const char *edac_mc_owner; static struct bus_type mc_bus[EDAC_MAX_MCS]; @@ -701,6 +701,11 @@ unlock: } EXPORT_SYMBOL(edac_mc_find); +const char *edac_get_owner(void) +{ + return edac_mc_owner; +} +EXPORT_SYMBOL_GPL(edac_get_owner); /* FIXME - should a warning be printed if no error detection? correction? 
*/ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci, diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h index 5357800e418d..4165e15995ad 100644 --- a/drivers/edac/edac_mc.h +++ b/drivers/edac/edac_mc.h @@ -128,6 +128,14 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, unsigned sz_pvt); /** + * edac_get_owner - Return the owner's mod_name of EDAC MC + * + * Returns: + * Pointer to mod_name string when EDAC MC is owned. NULL otherwise. + */ +extern const char *edac_get_owner(void); + +/* * edac_mc_add_mc_with_groups() - Insert the @mci structure into the mci * global list and create sysfs entries associated with @mci structure. * From 301375e764324b8048704eaf2c46fe1ee290830e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Aug 2017 16:54:47 -0600 Subject: [PATCH 0065/1085] EDAC: Add owner check to the x86 platform drivers Change x86 EDAC platform drivers to verify the module owner at the beginning of their module init functions. This allows them to fail their init immediately when ghes_edac is enabled. Similar change can be made to other edac drivers if necessary. Also, remove ".c" from module names of pnp2_edac, sb_edac, and skx_edac. Signed-off-by: Toshi Kani Suggested-by: Borislav Petkov Cc: Mauro Carvalho Chehab Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20170823225447.15608-6-toshi.kani@hpe.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 5 +++++ drivers/edac/pnd2_edac.c | 9 ++++++++- drivers/edac/sb_edac.c | 9 +++++++-- drivers/edac/skx_edac.c | 9 ++++++++- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index ac2f30295efe..8b16ec595fa7 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3434,9 +3434,14 @@ MODULE_DEVICE_TABLE(x86cpu, amd64_cpuids); static int __init amd64_edac_init(void) { + const char *owner; int err = -ENODEV; int i; + owner = edac_get_owner(); + if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) + return -EBUSY; + if (!x86_match_cpu(amd64_cpuids)) return -ENODEV; diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index 4395c84cdcbf..df28b65358d2 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -45,6 +45,8 @@ #include "edac_module.h" #include "pnd2_edac.h" +#define EDAC_MOD_STR "pnd2_edac" + #define APL_NUM_CHANNELS 4 #define DNV_NUM_CHANNELS 2 #define DNV_MAX_DIMMS 2 /* Max DIMMs per channel */ @@ -1355,7 +1357,7 @@ static int pnd2_register_mci(struct mem_ctl_info **ppmci) pvt = mci->pvt_info; memset(pvt, 0, sizeof(*pvt)); - mci->mod_name = "pnd2_edac.c"; + mci->mod_name = EDAC_MOD_STR; mci->dev_name = ops->name; mci->ctl_name = "Pondicherry2"; @@ -1547,10 +1549,15 @@ MODULE_DEVICE_TABLE(x86cpu, pnd2_cpuids); static int __init pnd2_init(void) { const struct x86_cpu_id *id; + const char *owner; int rc; edac_dbg(2, "\n"); + owner = edac_get_owner(); + if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) + return -EBUSY; + id = x86_match_cpu(pnd2_cpuids); if (!id) return -ENODEV; diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 2078ee414568..3ea90fc5978b 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -36,7 +36,7 @@ static LIST_HEAD(sbridge_edac_list); * Alter this version for the module when modifications are made */ #define SBRIDGE_REVISION " Ver: 1.1.2 " -#define EDAC_MOD_STR "sbridge_edac" +#define EDAC_MOD_STR "sb_edac" /* * Debug macros @@ -3155,7 +3155,7 @@ static int sbridge_register_mci(struct sbridge_dev 
*sbridge_dev, enum type type) MEM_FLAG_DDR4 : MEM_FLAG_DDR3; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; - mci->mod_name = "sb_edac.c"; + mci->mod_name = EDAC_MOD_STR; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; @@ -3407,10 +3407,15 @@ static void sbridge_remove(void) static int __init sbridge_init(void) { const struct x86_cpu_id *id; + const char *owner; int rc; edac_dbg(2, "\n"); + owner = edac_get_owner(); + if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) + return -EBUSY; + id = x86_match_cpu(sbridge_cpuids); if (!id) return -ENODEV; diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index 3fe85b0ac274..e8e570e73796 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -31,6 +31,8 @@ #include "edac_module.h" +#define EDAC_MOD_STR "skx_edac" + /* * Debug macros */ @@ -473,7 +475,7 @@ static int skx_register_mci(struct skx_imc *imc) mci->mtype_cap = MEM_FLAG_DDR4; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; - mci->mod_name = "skx_edac.c"; + mci->mod_name = EDAC_MOD_STR; mci->dev_name = pci_name(imc->chan[0].cdev); mci->ctl_page_to_phys = NULL; @@ -1044,12 +1046,17 @@ static int __init skx_init(void) { const struct x86_cpu_id *id; const struct munit *m; + const char *owner; int rc = 0, i; u8 mc = 0, src_id, node_id; struct skx_dev *d; edac_dbg(2, "\n"); + owner = edac_get_owner(); + if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) + return -EBUSY; + id = x86_match_cpu(skx_cpuids); if (!id) return -ENODEV; From 0114a8e87772c4172bf7fefa4c7a9a6bbc52d2ab Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:44 +0800 Subject: [PATCH 0066/1085] x86/apic: Construct a selector for the interrupt delivery mode There are quite some switches which are used to determine the final interrupt delivery mode, as shown below: 1) Kconfig: CONFIG_X86_64; CONFIG_X86_LOCAL_APIC; CONFIG_x86_IO_APIC 2) Command line options: disable_apic; skip_ioapic_setup 3) CPU Capability: boot_cpu_has(X86_FEATURE_APIC) 4) MP table: smp_found_config 5) ACPI: acpi_lapic; acpi_ioapic; nr_ioapic These switches are disordered and scattered and there are also some dependencies between them. These make the code difficult to maintain and read. Construct a selector to unify them into a single function, then, Use this selector to get an interrupt delivery mode directly. 
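As a self-contained sketch of the pattern (illustrative stand-ins for the switches listed above, not the kernel implementation itself):

#include <stdbool.h>

/* Unify scattered mode switches behind one enum-returning selector. */
enum intr_mode {
	MODE_PIC,		/* no usable local APIC */
	MODE_VIRTUAL_WIRE,	/* local APIC, but no MP/MADT configuration */
	MODE_SYMMETRIC_IO,	/* full APIC interrupt routing */
};

static enum intr_mode intr_mode_select(bool apic_disabled, bool have_lapic,
				       bool have_config)
{
	/* Every former ad-hoc check becomes one ordered test here. */
	if (apic_disabled || !have_lapic)
		return MODE_PIC;
	if (!have_config)
		return MODE_VIRTUAL_WIRE;
	return MODE_SYMMETRIC_IO;
}

Callers then branch on a single return value instead of re-testing disable_apic, smp_found_config, acpi_lapic and friends at each site.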
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-2-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/apic/apic.c | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d705c769f77d..39cb8c1ad98e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1218,6 +1218,58 @@ void __init sync_Arb_IDs(void) APIC_INT_LEVELTRIG | APIC_DM_INIT); } +enum apic_intr_mode { + APIC_PIC, + APIC_VIRTUAL_WIRE, + APIC_SYMMETRIC_IO, +}; + +static int __init apic_intr_mode_select(void) +{ + /* Check kernel option */ + if (disable_apic) { + pr_info("APIC disabled via kernel command line\n"); + return APIC_PIC; + } + + /* Check BIOS */ +#ifdef CONFIG_X86_64 + /* On 64-bit, the APIC must be integrated, Check local APIC only */ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + disable_apic = 1; + pr_info("APIC disabled by BIOS\n"); + return APIC_PIC; + } +#else + /* On 32-bit, the APIC may be integrated APIC or 82489DX */ + + /* Neither 82489DX nor integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) { + disable_apic = 1; + return APIC_PIC; + } + + /* If the BIOS pretends there is an integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && + APIC_INTEGRATED(boot_cpu_apic_version)) { + disable_apic = 1; + pr_err(FW_BUG "Local APIC %d not detected, force emulation\n", + boot_cpu_physical_apicid); + return APIC_PIC; + } +#endif + + /* Check MP table or ACPI MADT configuration */ + if (!smp_found_config) { + disable_ioapic_support(); + if (!acpi_lapic) + pr_info("APIC: ACPI MADT or MP tables are not detected\n"); + return APIC_VIRTUAL_WIRE; + } + + return APIC_SYMMETRIC_IO; +} + /* * An initial setup of the virtual wire mode. */ From 4b1669e8d1e4e6cb65b3b114fced6ca9bc39ddea Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:45 +0800 Subject: [PATCH 0067/1085] x86/apic: Prepare for unifying the interrupt delivery modes setup There are three places which initialize the interrupt delivery modes: 1) init_bsp_APIC() which is called early might setup the through-local-APIC virtual wire mode on non SMP systems. 2) In an SMP-capable system, native_smp_prepare_cpus() tries to switch to symmetric I/O model. 3) In UP system with UP_LATE_INIT=y, the local APIC and I/O APIC are set up in smp_init(). There is no technical reason to make these initializations at random places and run the kernel with the potentially wrong mode through the early boot stage, but it has a problematic side effect: The late switch to symmetric I/O mode causes dump-capture kernel to hang when the kernel command line option 'notsc' is active. Provide a new function to unify that three positions. Preparatory patch to initialize an interrupt mode directly. 
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-3-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/include/asm/apic.h | 2 ++ arch/x86/kernel/apic/apic.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5f01671c68f2..1a970f5a6e75 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -128,6 +128,7 @@ extern void disable_local_APIC(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); +extern void apic_intr_mode_init(void); extern void setup_local_APIC(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); @@ -170,6 +171,7 @@ static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop static inline void lapic_update_tsc_freq(void) { } +static inline void apic_intr_mode_init(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39cb8c1ad98e..08585bcbb38f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1319,6 +1319,22 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } +/* Init the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_init(void) +{ + switch (apic_intr_mode_select()) { + case APIC_PIC: + pr_info("APIC: Keep in PIC mode(8259)\n"); + return; + case APIC_VIRTUAL_WIRE: + pr_info("APIC: Switch to virtual wire mode setup\n"); + return; + case APIC_SYMMETRIC_IO: + pr_info("APIC: Switch to symmectic I/O mode setup\n"); + return; + } +} + static void lapic_setup_esr(void) { unsigned int oldvalue, value, maxlvt; From a2510d156eae9cf85c928d428471e44edd82c5ca Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:46 +0800 Subject: [PATCH 0068/1085] x86/apic: Split local APIC timer setup from the APIC setup apic_bsp_setup() sets up the local APIC, I/O APIC and APIC timer. The local APIC and I/O APIC setup belongs to interrupt delivery mode setup. Setting up the local APIC timer for booting CPU is another job and has nothing to do with interrupt delivery mode setup. Split local APIC timer setup from the APIC setup, keep it in the original position for SMP and UP kernel for now. 
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-4-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/apic/apic.c | 4 ++-- arch/x86/kernel/smpboot.c | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 08585bcbb38f..ad373243c7b3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2397,8 +2397,6 @@ int __init apic_bsp_setup(bool upmode) end_local_APIC_setup(); irq_remap_enable_fault_handling(); setup_IO_APIC(); - /* Setup local timer */ - x86_init.timers.setup_percpu_clockev(); return id; } @@ -2438,6 +2436,8 @@ int __init APIC_init_uniprocessor(void) default_setup_apic_routing(); apic_bsp_setup(true); + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); return 0; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ad59edd84de7..dad0a099e433 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1340,6 +1340,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) case SMP_FORCE_UP: disable_smp(); apic_bsp_setup(false); + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); return; case SMP_OK: break; @@ -1354,6 +1356,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); + pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); From 4b1244b45c16cef63fa3282e5bb1cc4fa1aef06a Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:47 +0800 Subject: [PATCH 0069/1085] x86/apic: Move logical APIC ID away from apic_bsp_setup() apic_bsp_setup() sets and returns logical APIC ID for initializing cpu0_logical_apicid in a SMP-capable system. The id has nothing to do with the initialization of local APIC and I/O APIC. And apic_bsp_setup() should be called for interrupt mode setup only. Move the id setup into a separate helper function for cleanup and mark apic_bsp_setup() void. 
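For background, the two reads differ because of the LDR layout: in xAPIC mode the logical APIC ID sits in bits 31:24 of the register (which is what GET_APIC_LOGICAL_ID extracts), while x2APIC mode reports a full 32-bit logical ID. A hedged sketch (illustrative helper, not kernel code; see the SDM for the authoritative layouts):

#include <stdbool.h>
#include <stdint.h>

#define XAPIC_LDR_SHIFT	24	/* xAPIC: logical ID in LDR bits 31:24 */

static uint32_t logical_id_from_ldr(uint32_t ldr, bool x2apic)
{
	/* x2APIC: the whole register is the logical ID */
	return x2apic ? ldr : (ldr >> XAPIC_LDR_SHIFT) & 0xff;
}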
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-5-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/apic.c | 10 +--------- arch/x86/kernel/smpboot.c | 12 +++++++++++- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1a970f5a6e75..4e550c742130 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -146,7 +146,7 @@ static inline int apic_force_enable(unsigned long addr) extern int apic_force_enable(unsigned long addr); #endif -extern int apic_bsp_setup(bool upmode); +extern void apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); /* diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ad373243c7b3..eafed8fbf340 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2379,25 +2379,17 @@ static void __init apic_bsp_up_setup(void) * Returns: * apic_id of BSP APIC */ -int __init apic_bsp_setup(bool upmode) +void __init apic_bsp_setup(bool upmode) { - int id; - connect_bsp_APIC(); if (upmode) apic_bsp_up_setup(); setup_local_APIC(); - if (x2apic_mode) - id = apic_read(APIC_LDR); - else - id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - enable_IO_APIC(); end_local_APIC_setup(); irq_remap_enable_fault_handling(); setup_IO_APIC(); - return id; } /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index dad0a099e433..d367ddbec5d0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1294,6 +1294,14 @@ static void __init smp_cpu_index_default(void) } } +static void __init smp_get_logical_apicid(void) +{ + if (x2apic_mode) + cpu0_logical_apicid = apic_read(APIC_LDR); + else + cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); +} + /* * Prepare for SMP bootup. The MP table or ACPI has been read * earlier. Just do some sanity checking here and enable APIC mode. @@ -1354,11 +1362,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } default_setup_apic_routing(); - cpu0_logical_apicid = apic_bsp_setup(false); + apic_bsp_setup(false); /* Setup local timer */ x86_init.timers.setup_percpu_clockev(); + smp_get_logical_apicid(); + pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); From 3e730dad3b6da42d21c05007445ca1bfd219d7ce Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:48 +0800 Subject: [PATCH 0070/1085] x86/apic: Unify interrupt mode setup for SMP-capable system On a SMP-capable system, the kernel enables and sets up the APIC interrupt delivery mode in native_smp_prepare_cpus(). The decision how to setup the APIC is intermingled with the decision of setting up SMP or not. Split the initialization of the APIC interrupt mode independent from other decisions and have a separate apic_intr_mode_init() function for it. The invocation time stays the same for now. 
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-6-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/apic/apic.c | 38 ++++++++++++++++++++++++++++++++++--- arch/x86/kernel/smpboot.c | 14 ++------------ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index eafed8fbf340..7ae97c26d23c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1221,7 +1221,9 @@ void __init sync_Arb_IDs(void) enum apic_intr_mode { APIC_PIC, APIC_VIRTUAL_WIRE, + APIC_VIRTUAL_WIRE_NO_CONFIG, APIC_SYMMETRIC_IO, + APIC_SYMMETRIC_IO_NO_ROUTING, }; static int __init apic_intr_mode_select(void) @@ -1262,11 +1264,27 @@ static int __init apic_intr_mode_select(void) /* Check MP table or ACPI MADT configuration */ if (!smp_found_config) { disable_ioapic_support(); - if (!acpi_lapic) + if (!acpi_lapic) { pr_info("APIC: ACPI MADT or MP tables are not detected\n"); + return APIC_VIRTUAL_WIRE_NO_CONFIG; + } return APIC_VIRTUAL_WIRE; } +#ifdef CONFIG_SMP + /* If SMP should be disabled, then really disable it! */ + if (!setup_max_cpus) { + pr_info("APIC: SMP mode deactivated\n"); + return APIC_SYMMETRIC_IO_NO_ROUTING; + } + + if (read_apic_id() != boot_cpu_physical_apicid) { + panic("Boot APIC ID in local APIC unexpected (%d vs %d)", + read_apic_id(), boot_cpu_physical_apicid); + /* Or can we switch back to PIC here? */ + } +#endif + return APIC_SYMMETRIC_IO; } @@ -1322,17 +1340,31 @@ void __init init_bsp_APIC(void) /* Init the interrupt delivery mode for the BSP */ void __init apic_intr_mode_init(void) { + bool upmode = false; + switch (apic_intr_mode_select()) { case APIC_PIC: pr_info("APIC: Keep in PIC mode(8259)\n"); return; case APIC_VIRTUAL_WIRE: pr_info("APIC: Switch to virtual wire mode setup\n"); - return; + default_setup_apic_routing(); + break; + case APIC_VIRTUAL_WIRE_NO_CONFIG: + pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); + upmode = true; + default_setup_apic_routing(); + break; case APIC_SYMMETRIC_IO: pr_info("APIC: Switch to symmectic I/O mode setup\n"); - return; + default_setup_apic_routing(); + break; + case APIC_SYMMETRIC_IO_NO_ROUTING: + pr_info("APIC: Switch to symmectic I/O mode setup in no SMP routine\n"); + break; } + + apic_bsp_setup(upmode); } static void lapic_setup_esr(void) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d367ddbec5d0..d0a1d28c23e8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1336,18 +1336,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); + apic_intr_mode_init(); + switch (smp_sanity_check(max_cpus)) { case SMP_NO_CONFIG: disable_smp(); - if (APIC_init_uniprocessor()) - pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); return; case SMP_NO_APIC: disable_smp(); return; case SMP_FORCE_UP: disable_smp(); - apic_bsp_setup(false); /* Setup local timer */ x86_init.timers.setup_percpu_clockev(); return; @@ -1355,15 +1354,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) break; } - if (read_apic_id() != boot_cpu_physical_apicid) { - panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - read_apic_id(), boot_cpu_physical_apicid); - /* Or can we switch back to PIC here? 
*/ - } - - default_setup_apic_routing(); - apic_bsp_setup(false); - /* Setup local timer */ x86_init.timers.setup_percpu_clockev(); From 4f45ed9f848f0721967e2f79e5409b6538894a43 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:49 +0800 Subject: [PATCH 0071/1085] x86/apic: Mark the apic_intr_mode extern for sanity check cleanup native_smp_prepare_cpus(), which prepares for SMP bootup, does some sanity checking, enables APIC mode and may disable the SMP feature. Now that APIC mode setup has been unified in apic_intr_mode_init(), some of those sanity checks are redundant and need to be cleaned up. Make apic_intr_mode extern to refine the switch and remove the redundant sanity checks. Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-7-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/include/asm/apic.h | 9 ++++++ arch/x86/kernel/apic/apic.c | 16 ++++------- arch/x86/kernel/smpboot.c | 57 ++++++------------------------------- 3 files changed, 24 insertions(+), 58 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4e550c742130..01f3fc8f8691 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -53,6 +53,15 @@ extern int local_apic_timer_c2_ok; extern int disable_apic; extern unsigned int lapic_timer_frequency; +extern enum apic_intr_mode_id apic_intr_mode; +enum apic_intr_mode_id { + APIC_PIC, + APIC_VIRTUAL_WIRE, + APIC_VIRTUAL_WIRE_NO_CONFIG, + APIC_SYMMETRIC_IO, + APIC_SYMMETRIC_IO_NO_ROUTING +}; + #ifdef CONFIG_SMP extern void __inquire_remote_apic(int apicid); #else /* CONFIG_SMP */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7ae97c26d23c..21d584d82f1f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1218,13 +1218,7 @@ void __init sync_Arb_IDs(void) APIC_INT_LEVELTRIG | APIC_DM_INIT); } -enum apic_intr_mode { - APIC_PIC, - APIC_VIRTUAL_WIRE, - APIC_VIRTUAL_WIRE_NO_CONFIG, - APIC_SYMMETRIC_IO, - APIC_SYMMETRIC_IO_NO_ROUTING, -}; +enum apic_intr_mode_id apic_intr_mode; static int __init apic_intr_mode_select(void) { @@ -1342,7 +1336,9 @@ void __init apic_intr_mode_init(void) { bool upmode = false; - switch (apic_intr_mode_select()) { + apic_intr_mode = apic_intr_mode_select(); + + switch (apic_intr_mode) { case APIC_PIC: pr_info("APIC: Keep in PIC mode(8259)\n"); return; @@ -1974,8 +1970,8 @@ void __init init_apic_mappings(void) * yeah -- we lie about apic_version * in case if apic was disabled via boot option * but it's not a problem for SMP compiled kernel - * since smp_sanity_check is prepared for such a case - * and disable smp mode + * since apic_intr_mode_select is prepared for such + * a case and disable smp mode */ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d0a1d28c23e8..161935c49166 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1190,17 +1190,10 @@ static __init void disable_smp(void) cpumask_set_cpu(0, topology_core_cpumask(0)); } -enum { - SMP_OK, - SMP_NO_CONFIG, - SMP_NO_APIC, - SMP_FORCE_UP, -}; - /* * Various sanity checks.
*/ -static int __init smp_sanity_check(unsigned max_cpus) +static void __init smp_sanity_check(void) { preempt_disable(); @@ -1237,16 +1230,6 @@ static int __init smp_sanity_check(unsigned max_cpus) physid_set(hard_smp_processor_id(), phys_cpu_present_map); } - /* - * If we couldn't find an SMP configuration at boot time, - * get out of here now! - */ - if (!smp_found_config && !acpi_lapic) { - preempt_enable(); - pr_notice("SMP motherboard not detected\n"); - return SMP_NO_CONFIG; - } - /* * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. @@ -1257,29 +1240,6 @@ static int __init smp_sanity_check(unsigned max_cpus) physid_set(hard_smp_processor_id(), phys_cpu_present_map); } preempt_enable(); - - /* - * If we couldn't find a local APIC, then get out of here now! - */ - if (APIC_INTEGRATED(boot_cpu_apic_version) && - !boot_cpu_has(X86_FEATURE_APIC)) { - if (!disable_apic) { - pr_err("BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); - } - return SMP_NO_APIC; - } - - /* - * If SMP should be disabled, then really disable it! - */ - if (!max_cpus) { - pr_info("SMP mode deactivated\n"); - return SMP_FORCE_UP; - } - - return SMP_OK; } static void __init smp_cpu_index_default(void) @@ -1338,19 +1298,20 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) apic_intr_mode_init(); - switch (smp_sanity_check(max_cpus)) { - case SMP_NO_CONFIG: + smp_sanity_check(); + + switch (apic_intr_mode) { + case APIC_PIC: + case APIC_VIRTUAL_WIRE_NO_CONFIG: disable_smp(); return; - case SMP_NO_APIC: - disable_smp(); - return; - case SMP_FORCE_UP: + case APIC_SYMMETRIC_IO_NO_ROUTING: disable_smp(); /* Setup local timer */ x86_init.timers.setup_percpu_clockev(); return; - case SMP_OK: + case APIC_VIRTUAL_WIRE: + case APIC_SYMMETRIC_IO: break; } From 0c759131ae568f2e620485662104ab8c1e770c81 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:50 +0800 Subject: [PATCH 0072/1085] x86/apic: Unify interrupt mode setup for UP system In UniProcessor kernel with UP_LATE_INIT=y, the interrupt delivery mode is initialized in up_late_init(). Use the new unified apic_intr_mode_init() function and remove APIC_init_uniprocessor(). 
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-8-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/include/asm/apic.h | 1 - arch/x86/kernel/apic/apic.c | 51 ++++++------------------------------- 2 files changed, 8 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 01f3fc8f8691..983a0dc564b3 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -144,7 +144,6 @@ void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern void lapic_update_tsc_freq(void); -extern int APIC_init_uniprocessor(void); #ifdef CONFIG_X86_64 static inline int apic_force_enable(unsigned long addr) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 21d584d82f1f..efc5fbd1c40c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1334,7 +1334,7 @@ void __init init_bsp_APIC(void) /* Init the interrupt delivery mode for the BSP */ void __init apic_intr_mode_init(void) { - bool upmode = false; + bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT); apic_intr_mode = apic_intr_mode_select(); @@ -2420,51 +2420,16 @@ void __init apic_bsp_setup(bool upmode) setup_IO_APIC(); } -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor(void) -{ - if (disable_apic) { - pr_info("Apic disabled\n"); - return -1; - } -#ifdef CONFIG_X86_64 - if (!boot_cpu_has(X86_FEATURE_APIC)) { - disable_apic = 1; - pr_info("Apic disabled by BIOS\n"); - return -1; - } -#else - if (!smp_found_config && !boot_cpu_has(X86_FEATURE_APIC)) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(boot_cpu_apic_version)) { - pr_err("BIOS bug, local APIC 0x%x not detected!...\n", - boot_cpu_physical_apicid); - return -1; - } -#endif - - if (!smp_found_config) - disable_ioapic_support(); - - default_setup_apic_routing(); - apic_bsp_setup(true); - /* Setup local timer */ - x86_init.timers.setup_percpu_clockev(); - return 0; -} - #ifdef CONFIG_UP_LATE_INIT void __init up_late_init(void) { - APIC_init_uniprocessor(); + apic_intr_mode_init(); + + if (apic_intr_mode == APIC_PIC) + return; + + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); } #endif From ca7c6076baed396737e31e33b87a637d70e9fc5f Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:51 +0800 Subject: [PATCH 0073/1085] x86/ioapic: Refactor the delay logic in timer_irq_works() timer_irq_works() is used to detect the timer IRQs. It calls mdelay(10) to delay ten ticks and checks whether the timer IRQ works or not. mdelay() depends on loops_per_jiffy, which is set up in calibrate_delay(), but the delay calibration depends on a working timer interrupt, which causes a chicken and egg problem. The correct solution is to set up the interrupt mode and make sure that the timer interrupt is delivered correctly before invoking calibrate_delay(). That means that mdelay() cannot be used in timer_irq_works(). Provide helper functions to make a rough delay estimate which is good enough to prove that the timer interrupt is working. Either use the TSC or a simple delay loop and assume that 4GHz is the maximum CPU frequency to base the delay calculation on.
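[ note: the 4GHz ceiling makes the cycle budget checkable by hand: 40000000000/HZ TSC cycles last exactly 10 jiffies at 4GHz and proportionally longer on any slower CPU, so the wait is always sufficient. A user-space check of that arithmetic, with an assumed HZ:

#include <stdio.h>

#define HZ 250	/* assumed tick rate, for illustration only */

int main(void)
{
	unsigned long long budget = 40000000000ULL / HZ;

	/* elapsed jiffies = cycles / frequency * HZ */
	printf("budget: %llu cycles\n", budget);
	printf("at 4 GHz: %.0f jiffies\n", (double)budget / 4e9 * HZ);
	printf("at 1 GHz: %.0f jiffies\n", (double)budget / 1e9 * HZ);
	return 0;
}

This prints 160000000 cycles, 10 jiffies and 40 jiffies respectively. ]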
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-9-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/apic/io_apic.c | 45 ++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 70e48aa6af98..f8f248749c56 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1585,6 +1585,43 @@ static int __init notimercheck(char *s) } __setup("no_timer_check", notimercheck); +static void __init delay_with_tsc(void) +{ + unsigned long long start, now; + unsigned long end = jiffies + 4; + + start = rdtsc(); + + /* + * We don't know the TSC frequency yet, but waiting for + * 40000000000/HZ TSC cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + */ + do { + rep_nop(); + now = rdtsc(); + } while ((now - start) < 40000000000UL / HZ && + time_before_eq(jiffies, end)); +} + +static void __init delay_without_tsc(void) +{ + unsigned long end = jiffies + 4; + int band = 1; + + /* + * We don't know any frequency yet, but waiting for + * 40940000000/HZ cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094 + */ + do { + __delay(((1U << band++) * 10000000UL) / HZ); + } while (band < 12 && time_before_eq(jiffies, end)); +} + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1603,8 +1640,12 @@ static int __init timer_irq_works(void) local_save_flags(flags); local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); + + if (boot_cpu_has(X86_FEATURE_TSC)) + delay_with_tsc(); + else + delay_without_tsc(); + local_irq_restore(flags); /* From 34fba3e6b1e5d42c81fc00ede715e0cdd2ebfada Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:52 +0800 Subject: [PATCH 0074/1085] x86/init: Add intr_mode_init to x86_init_ops X86 and XEN initialize interrupt delivery mode in different way. To avoid conditionals, add a new x86_init_ops function which defaults to the standard function and can be overridden by the early XEN platform code. [ tglx: Folded the XEN part which was a separate patch to preserve bisectability ] Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-10-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/x86_init.c | 1 + arch/x86/xen/enlighten_pv.c | 1 + 5 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 7ba7e90a9ad6..f45acdf45957 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -50,11 +50,13 @@ struct x86_init_resources { * are set up. 
* @intr_init: interrupt init code * @trap_init: platform specific trap setup + * @intr_mode_init: interrupt delivery mode setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); void (*trap_init)(void); + void (*intr_mode_init)(void); }; /** diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index efc5fbd1c40c..8dbcff2f96eb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2423,7 +2423,7 @@ void __init apic_bsp_setup(bool upmode) #ifdef CONFIG_UP_LATE_INIT void __init up_late_init(void) { - apic_intr_mode_init(); + x86_init.irqs.intr_mode_init(); if (apic_intr_mode == APIC_PIC) return; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 161935c49166..3d045e82352d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1296,7 +1296,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); - apic_intr_mode_init(); + x86_init.irqs.intr_mode_init(); smp_sanity_check(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a088b2c47f73..a7889b93e438 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -55,6 +55,7 @@ struct x86_init_ops x86_init __initdata = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .trap_init = x86_init_noop, + .intr_mode_init = apic_intr_mode_init }, .oem = { diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 69b9deff7e5c..73f809a6ca87 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1230,6 +1230,7 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_platform.get_nmi_reason = xen_get_nmi_reason; x86_init.resources.memory_setup = xen_memory_setup; + x86_init.irqs.intr_mode_init = x86_init_noop; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; From 935356cecda851d94381e1c6fea9dec443f908fe Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:54 +0800 Subject: [PATCH 0075/1085] x86/apic: Initialize interrupt mode after timer init A cold or warm boot through BIOS sets the APIC in default interrupt delivery mode. A dump-capture kernel will not go through a BIOS reset and leave the interrupt delivery mode in the state which was active on the crashed kernel, but the dump kernel startup code assumes default delivery mode which can result in interrupt delivery/handling to fail. To solve this problem, it's required to set up the final interrupt delivery mode as soon as possible. As IOAPIC setup needs the timer initialized for verifying the timer interrupt delivery mode, the earliest point is right after timer setup in late_time_init(). That results in the following init order: 1) Set up the legacy timer, if applicable on the platform 2) Set up APIC/IOAPIC which includes the verification of the legacy timer interrupt delivery. 
3) TSC calibration 4) Local APIC timer setup Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-12-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/smpboot.c | 7 +++---- arch/x86/kernel/time.c | 5 +++++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8dbcff2f96eb..3d08649acec6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2423,8 +2423,6 @@ void __init apic_bsp_setup(bool upmode) #ifdef CONFIG_UP_LATE_INIT void __init up_late_init(void) { - x86_init.irqs.intr_mode_init(); - if (apic_intr_mode == APIC_PIC) return; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3d045e82352d..81652e3b8c17 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1263,8 +1263,9 @@ static void __init smp_get_logical_apicid(void) } /* - * Prepare for SMP bootup. The MP table or ACPI has been read - * earlier. Just do some sanity checking here and enable APIC mode. + * Prepare for SMP bootup. + * @max_cpus: configured maximum number of CPUs, It is a legacy parameter + * for common interface support. */ void __init native_smp_prepare_cpus(unsigned int max_cpus) { @@ -1296,8 +1297,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); - x86_init.irqs.intr_mode_init(); - smp_sanity_check(); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index e0754cdbad37..3ceb834233c8 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -84,6 +84,11 @@ void __init hpet_time_init(void) static __init void x86_late_time_init(void) { x86_init.timers.timer_init(); + /* + * After PIT/HPET timers init, select and setup + * the final interrupt mode for delivering IRQs. + */ + x86_init.irqs.intr_mode_init(); tsc_init(); } From b371ae0d4a194b178817b0edfb6a7395c7aec37a Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:12:55 +0800 Subject: [PATCH 0076/1085] x86/apic: Remove init_bsp_APIC() init_bsp_APIC(), which sets up the virtual wire mode, is used in the ISA irq initialization at boot time. With the new APIC interrupt delivery mode scheme, which initializes the APIC before the first interrupt is expected, init_bsp_APIC() is no longer required and can be removed.
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1505293975-26005-13-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/include/asm/apic.h | 1 - arch/x86/kernel/apic/apic.c | 49 ------------------------------------- arch/x86/kernel/irqinit.c | 3 --- 3 files changed, 53 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 983a0dc564b3..7d247b2d8c54 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -136,7 +136,6 @@ extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); -extern void init_bsp_APIC(void); extern void apic_intr_mode_init(void); extern void setup_local_APIC(void); extern void init_apic_mappings(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3d08649acec6..a4ee36706999 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1282,55 +1282,6 @@ static int __init apic_intr_mode_select(void) return APIC_SYMMETRIC_IO; } -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) - return; - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - -#ifdef CONFIG_X86_32 - /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - (boot_cpu_data.x86 == 15)) - value &= ~APIC_SPIV_FOCUS_DISABLED; - else -#endif - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!lapic_is_integrated()) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - if (apic_extnmi == APIC_EXTNMI_NONE) - value |= APIC_LVT_MASKED; - apic_write(APIC_LVT1, value); -} - /* Init the interrupt delivery mode for the BSP */ void __init apic_intr_mode_init(void) { diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1add9e08e83e..beafcf584e44 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -60,9 +60,6 @@ void __init init_ISA_irqs(void) struct irq_chip *chip = legacy_pic->chip; int i; -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - init_bsp_APIC(); -#endif legacy_pic->init(0); for (i = 0; i < nr_legacy_irqs(); i++) From e3cccbce146fdc61e0f7ffc4cdda2b408b23cf3a Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 7 Sep 2017 16:49:20 +0800 Subject: [PATCH 0077/1085] x86/apic: Remove duplicate X86_64 conditional in lapic_is_integrated() The macro APIC_INTEGRATED(x) is already wrapped by CONFIG_X86_32. So it can be invoked unconditionally. Remove the extra "#ifdef CONFIG_X86_64...". No functional change. 
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1504774161-7137-1-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/apic/apic.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a4ee36706999..6708e25a09f5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -211,11 +211,7 @@ static inline int lapic_get_version(void) */ static inline int lapic_is_integrated(void) { -#ifdef CONFIG_X86_64 - return 1; -#else return APIC_INTEGRATED(lapic_get_version()); -#endif } /* From ae41a2a40ed4253b9e1e111df409bbecab0f9800 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 7 Sep 2017 16:49:21 +0800 Subject: [PATCH 0078/1085] x86/apic: Use lapic_is_integrated() consistently lapic_is_integrated() is a wrapper around APIC_INTEGRATED(), but not used consistently. Replace the direct usage of APIC_INTEGRATED() and fix up a hard-to-read tail comment. No functional change. [ tglx: Made it compile and work .... ] Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/1504774161-7137-2-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/apic/apic.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6708e25a09f5..ffcd7556795f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -294,14 +294,11 @@ int get_physical_broadcast(void) */ int lapic_get_maxlvt(void) { - unsigned int v; - - v = apic_read(APIC_LVR); /* * - we always have APIC integrated on 64bit mode * - 82489DXs do not report # of LVT entries */ - return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; + return lapic_is_integrated() ? GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2; } /* @@ -1531,7 +1528,9 @@ void setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!lapic_is_integrated()) /* 82489DX */ + + /* Is 82489DX ? */ + if (!lapic_is_integrated()) value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); From eb496063c9904ce682253ee445b9acb9b6257581 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 14 Jul 2017 11:34:06 +0800 Subject: [PATCH 0079/1085] x86/timers: Move the simple udelay calibration to tsc.h Commit dd759d93f4dd ("x86/timers: Add simple udelay calibration") added a static function to the x86 boot-time initialization code. But this function is actually related to the TSC, so it should be maintained in tsc.c, not in setup.c. Move simple_udelay_calibration() from setup.c to tsc.c and rename it to tsc_early_delay_calibrate() for readability.
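[ note: the calibration boils down to one division: loops_per_jiffy is the number of TSC cycles per timer tick, i.e. tsc_khz * 1000 / HZ. A user-space check with assumed values:

#include <stdio.h>

#define HZ 250	/* assumed tick rate, for illustration only */

int main(void)
{
	unsigned long tsc_khz = 2400000;	/* assumed 2.4 GHz TSC */
	unsigned long lpj = tsc_khz * 1000UL / HZ;

	printf("loops_per_jiffy = %lu\n", lpj);	/* 9600000 */
	return 0;
}
]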
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1500003247-17368-1-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/include/asm/tsc.h | 1 + arch/x86/kernel/setup.c | 22 +--------------------- arch/x86/kernel/tsc.c | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index f5e6f1c417df..d0509c75e150 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -31,6 +31,7 @@ static inline cycles_t get_cycles(void) extern struct system_counterval_t convert_art_to_tsc(u64 art); +extern void tsc_early_delay_calibrate(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0957dd73d127..c63d2b4dd5c8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -822,26 +822,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } -static void __init simple_udelay_calibration(void) -{ - unsigned int tsc_khz, cpu_khz; - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? : cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -1049,7 +1029,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); - simple_udelay_calibration(); + tsc_early_delay_calibrate(); x86_init.resources.probe_roms(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 796d96bb0821..c173dcef1f46 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1263,6 +1263,25 @@ static int __init init_tsc_clocksource(void) */ device_initcall(init_tsc_clocksource); +void __init tsc_early_delay_calibrate(void) +{ + unsigned long lpj; + + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + + tsc_khz = tsc_khz ? : cpu_khz; + if (!tsc_khz) + return; + + lpj = tsc_khz * 1000; + do_div(lpj, HZ); + loops_per_jiffy = lpj; +} + void __init tsc_init(void) { u64 lpj, cyc; From af5768507c051ceb9fe12bee59202bd83115c073 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 14 Jul 2017 11:34:07 +0800 Subject: [PATCH 0080/1085] x86/timers: Make recalibrate_cpu_khz() void recalibrate_cpu_khz() is called from powernow K7 and Pentium 4/Xeon CPU freq driver. It recalibrates cpu frequency in case of SMP = n and doesn't need to return anything. Mark it void, also remove the #else branch. 
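[ note: what remains in the SMP=n case is the proportional rescale of loops_per_jiffy when the measured frequency changed; a minimal model of that rescale (cpufreq_scale() in the kernel, frequencies here assumed):

#include <stdio.h>

/* rescale a calibrated value when the frequency changes */
static unsigned long rescale(unsigned long val, unsigned long old_khz,
			     unsigned long new_khz)
{
	return (unsigned long)((unsigned long long)val * new_khz / old_khz);
}

int main(void)
{
	/* CPU re-measured at 2.6 GHz after calibrating at 2.4 GHz */
	printf("lpj = %lu\n", rescale(9600000UL, 2400000UL, 2600000UL));
	return 0;
}
]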
Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1500003247-17368-2-git-send-email-douly.fnst@cn.fujitsu.com --- arch/x86/include/asm/timer.h | 2 +- arch/x86/kernel/tsc.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 2016962103df..e90edbee313a 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -8,7 +8,7 @@ #define TICK_SIZE (tick_nsec / 1000) unsigned long long native_sched_clock(void); -extern int recalibrate_cpu_khz(void); +extern void recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c173dcef1f46..896dbe31b407 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -812,13 +812,13 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } -int recalibrate_cpu_khz(void) +void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) - return -ENODEV; + return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); @@ -828,10 +828,6 @@ int recalibrate_cpu_khz(void) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); - - return 0; -#else - return -ENODEV; #endif } From ccb64941f375a6eb21b1b20136730eb7d1716068 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 11 Sep 2017 14:51:11 -0400 Subject: [PATCH 0081/1085] x86/timers: Move simple_udelay_calibration() past kvmclock_init() simple_udelay_calibration() relies on x86_platform's calibration ops. For KVM these ops are set late in setup_arch() and so simple_udelay_calibration() ends up using native version. Besides being possibly incorrect, this significantly increases kernel boot time. For example, on my laptop executing start_kernel() by a guest takes ~10 times more than when KVM's ops are used. Since early_xdbc_setup_hardware() relies on calibration having been performed move it too. Signed-off-by: Boris Ostrovsky Cc: baolu.lu@linux.intel.com Link: https://lkml.kernel.org/r/20170911185111.20636-1-boris.ostrovsky@oracle.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/setup.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c63d2b4dd5c8..b40311027c15 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1029,8 +1029,6 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); - tsc_early_delay_calibrate(); - x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1115,9 +1113,6 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); - if (!early_xdbc_setup_hardware()) - early_xdbc_register_console(); - reserve_bios_regions(); if (efi_enabled(EFI_MEMMAP)) { @@ -1223,6 +1218,10 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif + tsc_early_delay_calibrate(); + if (!early_xdbc_setup_hardware()) + early_xdbc_register_console(); + x86_init.paging.pagetable_init(); kasan_init(); From 04063a011f2f35c37e1146c736e6d4ad402a8557 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 25 Sep 2017 13:21:33 +0300 Subject: [PATCH 0082/1085] spi: sprd-adi: checking for NULL instead of IS_ERR() devm_ioremap_resource() returns error pointers, it never returns NULL. 
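[ note: the rule behind the fix: devm_ioremap_resource() reports failure through an encoded error pointer, never NULL, so a NULL test silently accepts the error value and the first register access faults. A minimal probe sketch of the correct idiom -- driver name hypothetical:

#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))		/* never NULL on failure */
		return PTR_ERR(base);	/* e.g. -EBUSY or -EINVAL */

	/* base is a valid mapping from here on */
	return 0;
}
]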
Fixes: 7e2903cb91df ("spi: Add ADI driver for Spreadtrum platform") Signed-off-by: Dan Carpenter Signed-off-by: Mark Brown --- drivers/spi/spi-sprd-adi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-sprd-adi.c b/drivers/spi/spi-sprd-adi.c index 0d481f8a46c2..bff6ef1caad7 100644 --- a/drivers/spi/spi-sprd-adi.c +++ b/drivers/spi/spi-sprd-adi.c @@ -341,8 +341,8 @@ static int sprd_adi_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); sadi->base = devm_ioremap_resource(&pdev->dev, res); - if (!sadi->base) { - ret = -ENOMEM; + if (IS_ERR(sadi->base)) { + ret = PTR_ERR(sadi->base); goto put_ctlr; } From b0d6e097b922ac7f538623c52794d9d63d6ee378 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 22 Sep 2017 23:48:08 +0300 Subject: [PATCH 0083/1085] spi: sprd-adi: silence an uninitialized variable warning If of_get_property() fails then "size" is uninitialized and it leads to a static checker warning: drivers/spi/spi-sprd-adi.c:288 sprd_adi_hw_init() error: uninitialized symbol 'size'. We can silence the warning by re-arranging the order of these checks. It obviously doesn't affect runtime at all. Signed-off-by: Dan Carpenter Signed-off-by: Mark Brown --- drivers/spi/spi-sprd-adi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-sprd-adi.c b/drivers/spi/spi-sprd-adi.c index bff6ef1caad7..1324463244d3 100644 --- a/drivers/spi/spi-sprd-adi.c +++ b/drivers/spi/spi-sprd-adi.c @@ -285,7 +285,7 @@ static void sprd_adi_hw_init(struct sprd_adi *sadi) /* Set hardware channels setting */ list = of_get_property(np, "sprd,hw-channels", &size); - if (!size || !list) { + if (!list || !size) { dev_info(sadi->dev, "no hw channels setting in node\n"); return; } From bdacfc7b6216dd30d07c10732fd4c0a660c62853 Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Mon, 25 Sep 2017 09:54:19 +0100 Subject: [PATCH 0084/1085] spi: sh-msiof: Add compatible strings for r8a774[35] Signed-off-by: Fabrizio Castro Signed-off-by: Mark Brown --- drivers/spi/spi-sh-msiof.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index 0eb1e9583485..00808448cf35 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -1021,6 +1021,8 @@ static const struct sh_msiof_chipdata rcar_gen3_data = { static const struct of_device_id sh_msiof_match[] = { { .compatible = "renesas,sh-mobile-msiof", .data = &sh_data }, + { .compatible = "renesas,msiof-r8a7743", .data = &rcar_gen2_data }, + { .compatible = "renesas,msiof-r8a7745", .data = &rcar_gen2_data }, { .compatible = "renesas,msiof-r8a7790", .data = &rcar_gen2_data }, { .compatible = "renesas,msiof-r8a7791", .data = &rcar_gen2_data }, { .compatible = "renesas,msiof-r8a7792", .data = &rcar_gen2_data }, From 4702f4b23a2fc6196abacf515a959e69176da40e Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Mon, 25 Sep 2017 09:54:20 +0100 Subject: [PATCH 0085/1085] spi: sh-msiof: Add r8a774[35] to the compatible list Signed-off-by: Fabrizio Castro Reviewed-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/sh-msiof.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/spi/sh-msiof.txt b/Documentation/devicetree/bindings/spi/sh-msiof.txt index e865855726a2..bdd83959019c 100644 --- a/Documentation/devicetree/bindings/spi/sh-msiof.txt +++ b/Documentation/devicetree/bindings/spi/sh-msiof.txt @@ -1,7 +1,9 @@ Renesas MSIOF 
spi controller Required properties: -- compatible : "renesas,msiof-r8a7790" (R-Car H2) +- compatible : "renesas,msiof-r8a7743" (RZ/G1M) + "renesas,msiof-r8a7745" (RZ/G1E) + "renesas,msiof-r8a7790" (R-Car H2) "renesas,msiof-r8a7791" (R-Car M2-W) "renesas,msiof-r8a7792" (R-Car V2H) "renesas,msiof-r8a7793" (R-Car M2-N) @@ -10,7 +12,7 @@ Required properties: "renesas,msiof-r8a7796" (R-Car M3-W) "renesas,msiof-sh73a0" (SH-Mobile AG5) "renesas,sh-mobile-msiof" (generic SH-Mobile compatibile device) - "renesas,rcar-gen2-msiof" (generic R-Car Gen2 compatible device) + "renesas,rcar-gen2-msiof" (generic R-Car Gen2 and RZ/G1 compatible device) "renesas,rcar-gen3-msiof" (generic R-Car Gen3 compatible device) "renesas,sh-msiof" (deprecated) From e0b477941d130576d9daf31160dab6e34335b10a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:04 +0200 Subject: [PATCH 0086/1085] genirq/debugfs: Show debug information for all irq descriptors Currently the debugfs shows only information about actively used interrupts like /proc/irq/ does. That's fine for most cases, but not helpful when internals of allocated, but unused interrupt descriptors have to debugged. It's also useful to provide information about all descriptors so leaks can be debugged in a simpler way. Move the debugfs registration to the descriptor allocation code. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.355525908@linutronix.de --- kernel/irq/irqdesc.c | 1 + kernel/irq/manage.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 82afb7ed369f..0f6805b75fc8 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -462,6 +462,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); + irq_add_debugfs_entry(start + i, desc); } bitmap_set(allocated_irqs, start, cnt); return start; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d00132b5c325..0e8b48315f3c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1400,7 +1400,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) wake_up_process(new->secondary->thread); register_irq_proc(irq, desc); - irq_add_debugfs_entry(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; From 07557ccb8c83f315e409ddee181e9ffbf87c6ad1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:05 +0200 Subject: [PATCH 0087/1085] genirq/msi: Capture device name for debugfs For debugging the allocation of unused or potentially leaked interrupt descriptor it's helpful to have some information about the site which allocated them. In case of MSI this is simple because the caller hands the device struct pointer into the domain allocation function. Duplicate the device name and show it in the debugfs entry of the interrupt descriptor. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.433038426@linutronix.de --- include/linux/irqdesc.h | 1 + kernel/irq/debugfs.c | 10 ++++++++++ kernel/irq/internals.h | 5 +++++ kernel/irq/msi.c | 6 +++++- 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 3e90a094798d..b55b113c049b 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -93,6 +93,7 @@ struct irq_desc { #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS struct dentry *debugfs_file; + const char *dev_name; #endif #ifdef CONFIG_SPARSE_IRQ struct rcu_head rcu; diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c3fdb36dec30..b7d1023b9b22 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -149,6 +149,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, ARRAY_SIZE(irqdesc_states)); @@ -226,6 +227,15 @@ static const struct file_operations dfs_irq_ops = { .release = single_release, }; +void irq_debugfs_copy_devname(int irq, struct device *dev) +{ + struct irq_desc *desc = irq_to_desc(irq); + const char *name = dev_name(dev); + + if (name) + desc->dev_name = kstrdup(name, GFP_KERNEL); +} + void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) { char name [10]; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a4aa39009f0d..cfaec2669093 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -443,7 +443,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); static inline void irq_remove_debugfs_entry(struct irq_desc *desc) { debugfs_remove(desc->debugfs_file); + kfree(desc->dev_name); } +void irq_debugfs_copy_devname(int irq, struct device *dev); # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else @@ -458,4 +460,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) static inline void irq_remove_debugfs_entry(struct irq_desc *d) { } +static inline void irq_debugfs_copy_devname(int irq, struct device *dev) +{ +} #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3fa4bd59f569..94ecc0293844 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -16,6 +16,8 @@ #include #include +#include "internals.h" + /** * alloc_msi_entry - Allocate an initialize msi_entry * @dev: Pointer to the device for which this is allocated @@ -373,8 +375,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return ret; } - for (i = 0; i < desc->nvec_used; i++) + for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); + irq_debugfs_copy_devname(virq + i, dev); + } } if (ops->msi_finish) From c3e7239a7f43ce1ff407df5f5944bf0d42dc21bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:06 +0200 Subject: [PATCH 0088/1085] irqdomain/debugfs: Provide domain specific debug callback Some interrupt domains like the X86 vector domain has special requirements for debugging, like showing the vector usage on the CPUs. 
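[ note: concretely, a domain opts in by filling the new irq_domain_ops member; the core invokes it from the domain debug file with irqd == NULL and from the per-interrupt file with d == NULL, as the wiring below shows. A hypothetical sketch of such a callback:

#include <linux/irqdomain.h>
#include <linux/seq_file.h>

#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
static void foo_domain_debug_show(struct seq_file *m, struct irq_domain *d,
				  struct irq_data *irqd, int ind)
{
	if (irqd)	/* per-interrupt file */
		seq_printf(m, "%*shwirq state: 0x%lx\n", ind, "", irqd->hwirq);
	else		/* domain file */
		seq_printf(m, "%*sdomain-wide state\n", ind, "");
}
#endif

static const struct irq_domain_ops foo_domain_ops = {
	/* .alloc, .free, .activate, ... */
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
	.debug_show	= foo_domain_debug_show,
#endif
};
]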
Add a callback to the irqdomain ops which can be filled in by domains which require it and add conditional invocations to the irqdomain and the per irq debug files. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.512937505@linutronix.de --- include/linux/irqdomain.h | 6 +++++- kernel/irq/debugfs.c | 2 ++ kernel/irq/irqdomain.c | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 81e4889ca6dd..1fb50438a868 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -40,6 +40,7 @@ struct of_device_id; struct irq_chip; struct irq_data; struct cpumask; +struct seq_file; /* Number of irqs reserved for a legacy isa controller */ #define NUM_ISA_INTERRUPTS 16 @@ -104,7 +105,6 @@ struct irq_domain_ops { int (*xlate)(struct irq_domain *d, struct device_node *node, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type); - #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* extended V2 interfaces to support hierarchy irq_domains */ int (*alloc)(struct irq_domain *d, unsigned int virq, @@ -116,6 +116,10 @@ struct irq_domain_ops { int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); #endif +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + void (*debug_show)(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind); +#endif }; extern struct irq_domain_ops irq_generic_chip_ops; diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index b7d1023b9b22..7f608ac39653 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -81,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind) data->domain ? data->domain->name : ""); seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq); irq_debug_show_chip(m, data, ind + 1); + if (data->domain && data->domain->ops && data->domain->ops->debug_show) + data->domain->ops->debug_show(m, NULL, data, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!data->parent_data) return; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..1a75a00ee01f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1810,6 +1810,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) d->revmap_size + d->revmap_direct_max_irq); seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount); seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); + if (d->ops && d->ops->debug_show) + d->ops->debug_show(m, d, NULL, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!d->parent) return; From 457f6d35072f395508255ef09fd08f824382cf85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:07 +0200 Subject: [PATCH 0089/1085] genirq: Make state consistent for !IRQ_DOMAIN_HIERARCHY In the !IRQ_DOMAIN_HIERARCHY cas the activation stubs are not setting/clearing the activation status bits. This is not a problem at the moment, but upcoming changes require a correct status. Add the set/clear incovations to the stub functions and move them to the core internal header to avoid duplication and visibility outside the core. 
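[ note: the invariant the stubs keep is that irqd_is_activated() stays truthful on every config: activate sets the status bit and deactivate clears it, even when there is no hierarchy to walk. A reduced user-space model of the stub variant:

#include <stdbool.h>
#include <stdio.h>

#define IRQD_ACTIVATED	0x1

static unsigned int irqd_state;

static bool irqd_is_activated(void)
{
	return irqd_state & IRQD_ACTIVATED;
}

/* the !IRQ_DOMAIN_HIERARCHY stubs: no hardware work, state bit only */
static void irq_domain_activate_irq(void)
{
	irqd_state |= IRQD_ACTIVATED;
}

static void irq_domain_deactivate_irq(void)
{
	irqd_state &= ~IRQD_ACTIVATED;
}

int main(void)
{
	irq_domain_activate_irq();
	printf("activated: %d\n", irqd_is_activated());	/* 1 */
	irq_domain_deactivate_irq();
	printf("activated: %d\n", irqd_is_activated());	/* 0 */
	return 0;
}
]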
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.591985591@linutronix.de --- include/linux/irqdomain.h | 4 ---- kernel/irq/internals.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 1fb50438a868..de06ce992a2d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -511,8 +511,6 @@ static inline bool irq_domain_is_msi_remap(struct irq_domain *domain) extern bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain); #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -static inline void irq_domain_activate_irq(struct irq_data *data) { } -static inline void irq_domain_deactivate_irq(struct irq_data *data) { } static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) { @@ -561,8 +559,6 @@ irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain) #else /* CONFIG_IRQ_DOMAIN */ static inline void irq_dispose_mapping(unsigned int virq) { } -static inline void irq_domain_activate_irq(struct irq_data *data) { } -static inline void irq_domain_deactivate_irq(struct irq_data *data) { } static inline struct irq_domain *irq_find_matching_fwnode( struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token) { diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index cfaec2669093..72b8da2a7c39 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -436,6 +436,17 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ +#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) +static inline void irq_domain_activate_irq(struct irq_data *data) +{ + irqd_set_activated(data); +} +static inline void irq_domain_deactivate_irq(struct irq_data *data) +{ + irqd_clr_activated(data); +} +#endif + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include From 239306fee8a59beb728faf8f42b13b2250b24841 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:08 +0200 Subject: [PATCH 0090/1085] genirq: Set managed shut down flag at init Managed interrupts should start up in managed shutdown mode. Set the status flag when initialising the irq descriptor. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.669687742@linutronix.de --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0f6805b75fc8..982a3576fb01 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -448,7 +448,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, } } - flags = affinity ? IRQD_AFFINITY_MANAGED : 0; + flags = affinity ? 
IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0; mask = NULL; for (i = 0; i < cnt; i++) { From c942cee46bba761ce97ee6d4fc71892e064e8628 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:09 +0200 Subject: [PATCH 0091/1085] genirq: Separate activation and startup Activation and startup of an interrupt are currently combined functionality. That works so far, but upcoming changes require a strict separation because the activation can fail in the future. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.754334077@linutronix.de --- kernel/irq/autoprobe.c | 2 +- kernel/irq/chip.c | 30 ++++++++++++++++++++++++------ kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 17 ++++++++++++++++- 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index d30a0dd5cc02..6608d03efb23 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE); + irq_activate_and_startup(desc, IRQ_NORESEND); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6fc89fd93824..37dd34d922f4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -207,20 +207,19 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt * installment or irq auto probing should not happen on - * managed irqs either. Emit a warning, break the affinity - * and start it up as a normal interrupt. + * managed irqs either. */ if (WARN_ON_ONCE(force)) - return IRQ_STARTUP_NORMAL; + return IRQ_STARTUP_ABORT; /* * The interrupt was requested, but there is no online CPU * in it's affinity mask. Put it into managed shutdown * state and let the cpu hotplug mechanism start it up once * a CPU in the mask becomes available.
*/ - irqd_set_managed_shutdown(d); return IRQ_STARTUP_ABORT; } + irq_domain_activate_irq(d); return IRQ_STARTUP_MANAGED; } #else @@ -236,7 +235,9 @@ static int __irq_startup(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); int ret = 0; - irq_domain_activate_irq(d); + /* Warn if this interrupt is not activated but try nevertheless */ + WARN_ON_ONCE(!irqd_is_activated(d)); + if (d->chip->irq_startup) { ret = d->chip->irq_startup(d); irq_state_clr_disabled(desc); @@ -269,6 +270,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) irq_set_affinity_locked(d, aff, false); break; case IRQ_STARTUP_ABORT: + irqd_set_managed_shutdown(d); return 0; } } @@ -278,6 +280,22 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) return ret; } +int irq_activate(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + + if (!irqd_affinity_is_managed(d)) + irq_domain_activate_irq(d); + return 0; +} + +void irq_activate_and_startup(struct irq_desc *desc, bool resend) +{ + if (WARN_ON(irq_activate(desc))) + return; + irq_startup(desc, resend, IRQ_START_FORCE); +} + static void __irq_disable(struct irq_desc *desc, bool mask); void irq_shutdown(struct irq_desc *desc) @@ -953,7 +971,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; - irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); + irq_activate_and_startup(desc, IRQ_RESEND); } } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 72b8da2a7c39..8bd131e4c944 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -74,6 +74,8 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_FORCE true #define IRQ_START_COND false +extern int irq_activate(struct irq_desc *desc); +extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0e8b48315f3c..e667912d0e9c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -519,7 +519,7 @@ void __enable_irq(struct irq_desc *desc) * time. If it was already started up, then irq_startup() * will invoke irq_enable() under the hood. */ - irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: @@ -1325,6 +1325,21 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_unlock; } + /* + * Activate the interrupt. That activation must happen + * independently of IRQ_NOAUTOEN. request_irq() can fail + * and the callers are supposed to handle + * that. enable_irq() of an interrupt requested with + * IRQ_NOAUTOEN is not supposed to fail. The activation + * keeps it in shutdown mode, it merily associates + * resources if necessary and if that's not possible it + * fails. Interrupts which are in managed shutdown mode + * will simply ignore that activation request. 
+ */ + ret = irq_activate(desc); + if (ret) + goto out_unlock; + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_ONESHOT | IRQS_WAITING); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); From 72491643469aab736536ae71dcd199b19dabd891 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:10 +0200 Subject: [PATCH 0092/1085] genirq/irqdomain: Update irq_domain_ops.activate() signature The irq_domain_ops.activate() callback has no return value and no way to tell the function that the activation is early. The upcoming changes to support a reservation scheme which allows to assign interrupt vectors on x86 only when the interrupt is actually requested requires: - A return value, so activation can fail at request_irq() time - Information that the activate invocation is early, i.e. before request_irq(). Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.848490816@linutronix.de --- arch/x86/include/asm/irqdomain.h | 4 ++-- arch/x86/kernel/apic/htirq.c | 5 +++-- arch/x86/kernel/apic/io_apic.c | 5 +++-- arch/x86/platform/uv/uv_irq.c | 5 +++-- drivers/gpio/gpio-xgene-sb.c | 8 +++++--- drivers/iommu/amd_iommu.c | 5 +++-- drivers/iommu/intel_irq_remapping.c | 5 +++-- drivers/irqchip/irq-gic-v3-its.c | 10 ++++++---- drivers/pinctrl/stm32/pinctrl-stm32.c | 5 +++-- include/linux/irqdomain.h | 2 +- kernel/irq/irqdomain.c | 2 +- kernel/irq/msi.c | 5 +++-- 12 files changed, 36 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index d26075b52885..1d9091ffa140 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -41,8 +41,8 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg); extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); -extern void mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data); +extern int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early); extern void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 56ccf9346b08..b07075dce8b7 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -112,8 +112,8 @@ static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } -static void htirq_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct ht_irq_msg msg; struct irq_cfg *cfg = irqd_cfg(irq_data); @@ -132,6 +132,7 @@ static void htirq_domain_activate(struct irq_domain *domain, HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; write_ht_irq_msg(irq_data->irq, &msg); + return 0; } static void htirq_domain_deactivate(struct irq_domain *domain, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 
70e48aa6af98..d50e46757f6d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2977,8 +2977,8 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } -void mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { unsigned long flags; struct irq_pin_list *entry; @@ -2988,6 +2988,7 @@ void mp_irqdomain_activate(struct irq_domain *domain, for_each_irq_pin(entry, data->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, data->entry); raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return 0; } void mp_irqdomain_deactivate(struct irq_domain *domain, diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 03fc397335b7..5f6fd860820a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -127,10 +127,11 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq, * Re-target the irq to the specified CPU and enable the specified MMR located * on the specified blade to allow the sending of MSIs to the specified CPU. */ -static void uv_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int uv_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); + return 0; } /* diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index 033258634b8c..b5843fe6a44d 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -140,8 +140,9 @@ static int xgene_gpio_sb_to_irq(struct gpio_chip *gc, u32 gpio) return irq_create_fwspec_mapping(&fwspec); } -static void xgene_gpio_sb_domain_activate(struct irq_domain *d, - struct irq_data *irq_data) +static int xgene_gpio_sb_domain_activate(struct irq_domain *d, + struct irq_data *irq_data, + bool early) { struct xgene_gpio_sb *priv = d->host_data; u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq); @@ -150,11 +151,12 @@ static void xgene_gpio_sb_domain_activate(struct irq_domain *d, dev_err(priv->gc.parent, "Unable to configure XGene GPIO standby pin %d as IRQ\n", gpio); - return; + return -ENOSPC; } xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO, gpio * 2, 1); + return 0; } static void xgene_gpio_sb_domain_deactivate(struct irq_domain *d, diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 51f8215877f5..ea03f4138f5f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4170,8 +4170,8 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_common(domain, virq, nr_irqs); } -static void irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int irq_remapping_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; @@ -4180,6 +4180,7 @@ static void irq_remapping_activate(struct irq_domain *domain, if (iommu) iommu->irte_ops->activate(data->entry, irte_info->devid, irte_info->index); + return 0; } static void irq_remapping_deactivate(struct irq_domain *domain, diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index a5b89f6bcdbf..762d84713b7a 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -1389,12 +1389,13 @@ static 
void intel_irq_remapping_free(struct irq_domain *domain, irq_domain_free_irqs_common(domain, virq, nr_irqs); } -static void intel_irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int intel_irq_remapping_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct intel_ir_data *data = irq_data->chip_data; modify_irte(&data->irq_2_iommu, &data->irte_entry); + return 0; } static void intel_irq_remapping_deactivate(struct irq_domain *domain, diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index e8d89343d613..20e2b5fac7b9 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2186,8 +2186,8 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, return 0; } -static void its_irq_domain_activate(struct irq_domain *domain, - struct irq_data *d) +static int its_irq_domain_activate(struct irq_domain *domain, + struct irq_data *d, bool early) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); @@ -2205,6 +2205,7 @@ static void its_irq_domain_activate(struct irq_domain *domain, /* Map the GIC IRQ and event to the device */ its_send_mapti(its_dev, d->hwirq, event); + return 0; } static void its_irq_domain_deactivate(struct irq_domain *domain, @@ -2678,8 +2679,8 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq return err; } -static void its_vpe_irq_domain_activate(struct irq_domain *domain, - struct irq_data *d) +static int its_vpe_irq_domain_activate(struct irq_domain *domain, + struct irq_data *d, bool early) { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); @@ -2687,6 +2688,7 @@ static void its_vpe_irq_domain_activate(struct irq_domain *domain, vpe->col_idx = cpumask_first(cpu_online_mask); its_send_vmapp(vpe, true); its_send_vinvall(vpe); + return 0; } static void its_vpe_irq_domain_deactivate(struct irq_domain *domain, diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 50299ad96659..02b66588cac6 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -289,13 +289,14 @@ static int stm32_gpio_domain_translate(struct irq_domain *d, return 0; } -static void stm32_gpio_domain_activate(struct irq_domain *d, - struct irq_data *irq_data) +static int stm32_gpio_domain_activate(struct irq_domain *d, + struct irq_data *irq_data, bool early) { struct stm32_gpio_bank *bank = d->host_data; struct stm32_pinctrl *pctl = dev_get_drvdata(bank->gpio_chip.parent); regmap_field_write(pctl->irqmux[irq_data->hwirq], bank->bank_nr); + return 0; } static int stm32_gpio_domain_alloc(struct irq_domain *d, diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index de06ce992a2d..9bc07f59b090 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -111,7 +111,7 @@ struct irq_domain_ops { unsigned int nr_irqs, void *arg); void (*free)(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs); - void (*activate)(struct irq_domain *d, struct irq_data *irq_data); + int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early); void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1a75a00ee01f..1423f8a6c75d 100644 --- a/kernel/irq/irqdomain.c +++ 
b/kernel/irq/irqdomain.c @@ -1690,7 +1690,7 @@ static void __irq_domain_activate_irq(struct irq_data *irq_data) if (irq_data->parent_data) __irq_domain_activate_irq(irq_data->parent_data); if (domain->ops->activate) - domain->ops->activate(domain, irq_data); + domain->ops->activate(domain, irq_data, false); } } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 94ecc0293844..6155fff9f647 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -102,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data, return ret; } -static void msi_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int msi_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct msi_msg msg; BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); irq_chip_write_msi_msg(irq_data, &msg); + return 0; } static void msi_domain_deactivate(struct irq_domain *domain, From bb9b428a5c832d7abb494fbabac37c515c01c6c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:11 +0200 Subject: [PATCH 0093/1085] genirq/irqdomain: Allow irq_domain_activate_irq() to fail Allow irq_domain_activate_irq() to fail. This is required to support a reservation and late vector assignment scheme. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.933882227@linutronix.de --- include/linux/irqdomain.h | 2 +- kernel/irq/chip.c | 9 +++++++-- kernel/irq/internals.h | 3 ++- kernel/irq/irqdomain.c | 42 ++++++++++++++++++++++++--------------- kernel/irq/msi.c | 19 ++++++++++++++++-- 5 files changed, 53 insertions(+), 22 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9bc07f59b090..8fb877121984 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -441,7 +441,7 @@ extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct cpumask *affinity); extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); -extern void irq_domain_activate_irq(struct irq_data *irq_data); +extern int irq_domain_activate_irq(struct irq_data *irq_data); extern void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 37dd34d922f4..cd5b3eb38082 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -219,7 +219,12 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) */ return IRQ_STARTUP_ABORT; } - irq_domain_activate_irq(d); + /* + * Managed interrupts have reserved resources, so this should not + * happen. 
+ */ + if (WARN_ON(irq_domain_activate_irq(d))) + return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } #else @@ -285,7 +290,7 @@ int irq_activate(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) - irq_domain_activate_irq(d); + return irq_domain_activate_irq(d); return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8bd131e4c944..e84d0e3899f6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -439,9 +439,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline void irq_domain_activate_irq(struct irq_data *data) +static inline int irq_domain_activate_irq(struct irq_data *data) { irqd_set_activated(data); + return 0; } static inline void irq_domain_deactivate_irq(struct irq_data *data) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1423f8a6c75d..47e8ddd9e8cf 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1682,18 +1682,6 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); -static void __irq_domain_activate_irq(struct irq_data *irq_data) -{ - if (irq_data && irq_data->domain) { - struct irq_domain *domain = irq_data->domain; - - if (irq_data->parent_data) - __irq_domain_activate_irq(irq_data->parent_data); - if (domain->ops->activate) - domain->ops->activate(domain, irq_data, false); - } -} - static void __irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irq_data && irq_data->domain) { @@ -1706,6 +1694,25 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } +static int __irq_domain_activate_irq(struct irq_data *irqd) +{ + int ret = 0; + + if (irqd && irqd->domain) { + struct irq_domain *domain = irqd->domain; + + if (irqd->parent_data) + ret = __irq_domain_activate_irq(irqd->parent_data); + if (!ret && domain->ops->activate) { + ret = domain->ops->activate(domain, irqd, false); + /* Rollback in case of error */ + if (ret && irqd->parent_data) + __irq_domain_deactivate_irq(irqd->parent_data); + } + } + return ret; +} + /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt @@ -1714,12 +1721,15 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. 
*/ -void irq_domain_activate_irq(struct irq_data *irq_data) +int irq_domain_activate_irq(struct irq_data *irq_data) { - if (!irqd_is_activated(irq_data)) { - __irq_domain_activate_irq(irq_data); + int ret = 0; + + if (!irqd_is_activated(irq_data)) + ret = __irq_domain_activate_irq(irq_data); + if (!ret) irqd_set_activated(irq_data); - } + return ret; } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6155fff9f647..5ece369950ec 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -401,11 +401,26 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct irq_data *irq_data; irq_data = irq_domain_get_irq_data(domain, desc->irq); - irq_domain_activate_irq(irq_data); + ret = irq_domain_activate_irq(irq_data); + if (ret) + goto cleanup; } } - return 0; + +cleanup: + for_each_msi_entry(desc, dev) { + struct irq_data *irqd; + + if (desc->irq == virq) + break; + + irqd = irq_domain_get_irq_data(domain, desc->irq); + if (irqd_is_activated(irqd)) + irq_domain_deactivate_irq(irqd); + } + msi_domain_free_irqs(domain, dev); + return ret; } /** From 42e1cc2dc5b698181ab1ffb7972bd880230c506e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:12 +0200 Subject: [PATCH 0094/1085] genirq/irqdomain: Propagate early activation Propagate the early activation mode to the irqdomain activate() callbacks. This is required for the upcoming reservation, late vector assignment scheme, so that the early activation call can act accordingly. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.028353660@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 4 ++-- include/linux/irqdomain.h | 2 +- kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 2 +- kernel/irq/irqdomain.c | 11 ++++++----- kernel/irq/msi.c | 2 +- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d50e46757f6d..6f1007fd9783 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2096,7 +2096,7 @@ static inline void __init check_timer(void) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); - irq_domain_activate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2118,7 +2118,7 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); irq_domain_deactivate_irq(irq_data); - irq_domain_activate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... 
works.\n"); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 8fb877121984..7d0c6c144708 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -441,7 +441,7 @@ extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct cpumask *affinity); extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); -extern int irq_domain_activate_irq(struct irq_data *irq_data); +extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early); extern void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cd5b3eb38082..82333835ac66 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -223,7 +223,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) * Managed interrupts have reserved resources, so this should not * happen. */ - if (WARN_ON(irq_domain_activate_irq(d))) + if (WARN_ON(irq_domain_activate_irq(d, false))) return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } @@ -290,7 +290,7 @@ int irq_activate(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) - return irq_domain_activate_irq(d); + return irq_domain_activate_irq(d, false); return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e84d0e3899f6..a0327136e469 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -439,7 +439,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data) +static inline int irq_domain_activate_irq(struct irq_data *data, bool early) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 47e8ddd9e8cf..b50f737574ae 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1694,7 +1694,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) { int ret = 0; @@ -1702,9 +1702,10 @@ static int __irq_domain_activate_irq(struct irq_data *irqd) struct irq_domain *domain = irqd->domain; if (irqd->parent_data) - ret = __irq_domain_activate_irq(irqd->parent_data); + ret = __irq_domain_activate_irq(irqd->parent_data, + early); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, false); + ret = domain->ops->activate(domain, irqd, early); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1721,12 +1722,12 @@ static int __irq_domain_activate_irq(struct irq_data *irqd) * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. 
*/ -int irq_domain_activate_irq(struct irq_data *irq_data) +int irq_domain_activate_irq(struct irq_data *irq_data, bool early) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data); + ret = __irq_domain_activate_irq(irq_data, early); if (!ret) irqd_set_activated(irq_data); return ret; } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6155fff9f647..d7553d015175 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -401,7 +401,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct irq_data *irq_data; irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data); + ret = irq_domain_activate_irq(irq_data, true); if (ret) goto cleanup; } From 22d0b12f3560d3b3264ee79faa1c05a5060fb916 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:13 +0200 Subject: [PATCH 0095/1085] genirq/irqdomain: Add force reactivation flag to irq domains Allow irqdomains to tell the core code that, after early activation, the interrupt needs to be reactivated at request_irq() time. This allows reservation of vectors at early activation time and actual vector assignment at request_irq() time. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.106242536@linutronix.de --- include/linux/msi.h | 5 +++++ kernel/irq/msi.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/include/linux/msi.h b/include/linux/msi.h index 80e3b562bef6..eff16ef81f43 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -283,6 +283,11 @@ enum { MSI_FLAG_PCI_MSIX = (1 << 3), /* Needs early activate, required for PCI */ MSI_FLAG_ACTIVATE_EARLY = (1 << 4), + /* + * Must reactivate when irq is started even when + * MSI_FLAG_ACTIVATE_EARLY has been set. + */ + MSI_FLAG_MUST_REACTIVATE = (1 << 5), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index d7553d015175..edb987b2c58d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -404,6 +404,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ret = irq_domain_activate_irq(irq_data, true); if (ret) goto cleanup; + if (info->flags & MSI_FLAG_MUST_REACTIVATE) + irqd_clr_activated(irq_data); } } return 0; From 2f75d9e1c90511bff6d1ce4de94503cc28fec032 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:14 +0200 Subject: [PATCH 0096/1085] genirq: Implement bitmap matrix allocator Implement the infrastructure for a simple bitmap based allocator, which will replace the x86 vector allocator. It's in the core code as other architectures might be able to reuse/extend it. For now it only implements allocations for single CPUs, but it's simple to add multi-CPU allocation support if required. The concept is rather simple. Global information: the system_vector bitmap and global accounting. Per-CPU information: an allocation bitmap, a managed allocation bitmap and local accounting. The system vector bitmap is used to exclude vectors system-wide from the allocation space. The allocation bitmap is used to keep track of per-CPU used vectors.
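For illustration only (not part of the patch), the regular allocation rule described below can be sketched in C roughly like this; the 256-bit map size, the function name and the bare bitmap parameters are assumptions made for the example, while the real code sizes its maps via IRQ_MATRIX_BITS and performs the search in matrix_alloc_area():

	/*
	 * Sketch: find a free vector bit outside the system, allocated
	 * and managed maps. Assumes linux/bitmap.h, linux/bitops.h and
	 * linux/errno.h.
	 */
	static int sketch_alloc_vector(unsigned long *system_map,
				       unsigned long *alloc_map,
				       unsigned long *managed_map)
	{
		DECLARE_BITMAP(tmpmap, 256);
		unsigned int bit;

		/* OR the exclusion maps together; zero bits are available */
		bitmap_or(tmpmap, system_map, alloc_map, 256);
		bitmap_or(tmpmap, tmpmap, managed_map, 256);
		bit = find_first_zero_bit(tmpmap, 256);
		if (bit >= 256)
			return -ENOSPC;
		set_bit(bit, alloc_map);	/* account the allocation */
		return bit;
	}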
The managed allocation bitmap is used to reserve vectors for managed interrupts. When a regular (non-managed) interrupt allocation happens, the following rule applies: tmpmap = system_map | alloc_map | managed_map find_zero_bit(tmpmap) ORing the bitmaps together gives the real available space. The same rule applies for reserving a managed interrupt vector. But contrary to regular interrupts, the reservation only marks the bit in the managed map and therefore excludes it from the regular allocations. The managed map is only cleaned out when a managed interrupt is completely released and it stays alive across CPU offline/online operations. For managed interrupt allocations, the rule is: tmpmap = managed_map & ~alloc_map find_first_bit(tmpmap) This returns the first bit which is in the managed map, but not yet allocated in the allocation map. The allocation marks it in the allocation map and hands it back to the caller for use. The rest of the code consists of helper functions which handle the various requirements and the accounting necessary to replace the x86 vector allocation code. The result is a single patch as the evolution of this infrastructure cannot be represented in bits and pieces. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Chris Metcalf Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.185437174@linutronix.de --- include/linux/irq.h | 22 +++ kernel/irq/Kconfig | 3 + kernel/irq/Makefile | 1 + kernel/irq/matrix.c | 428 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 454 insertions(+) create mode 100644 kernel/irq/matrix.c diff --git a/include/linux/irq.h b/include/linux/irq.h index d4728bf6a537..fda8da7c45e7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1113,6 +1113,28 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, return readl(gc->reg_base + reg_offset); } +struct irq_matrix; +struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, + unsigned int alloc_start, + unsigned int alloc_end); +void irq_matrix_online(struct irq_matrix *m); +void irq_matrix_offline(struct irq_matrix *m); +void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace); +int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk); +void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk); +int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu); +void irq_matrix_reserve(struct irq_matrix *m); +void irq_matrix_remove_reserved(struct irq_matrix *m); +int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, + bool reserved, unsigned int *mapped_cpu); +void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, + unsigned int bit, bool managed); +void irq_matrix_assign(struct irq_matrix *m, unsigned int bit); +unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown); +unsigned int irq_matrix_allocated(struct irq_matrix *m); +unsigned int irq_matrix_reserved(struct irq_matrix *m); +void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind); + /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned
int irq, unsigned int cpu); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a117adf7084b..ac1a3e29d3b9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -97,6 +97,9 @@ config HANDLE_DOMAIN_IRQ config IRQ_TIMINGS bool +config GENERIC_IRQ_MATRIX_ALLOCATOR + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1970cafe8f2a..329c9193b4bf 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o +obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c new file mode 100644 index 000000000000..7b2b4fbde1e2 --- /dev/null +++ b/kernel/irq/matrix.c @@ -0,0 +1,428 @@ +/* + * Copyright (C) 2017 Thomas Gleixner + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include +#include +#include +#include +#include +#include + +#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long)) + +struct cpumap { + unsigned int available; + unsigned int allocated; + unsigned int managed; + bool online; + unsigned long alloc_map[IRQ_MATRIX_SIZE]; + unsigned long managed_map[IRQ_MATRIX_SIZE]; +}; + +struct irq_matrix { + unsigned int matrix_bits; + unsigned int alloc_start; + unsigned int alloc_end; + unsigned int alloc_size; + unsigned int global_available; + unsigned int global_reserved; + unsigned int systembits_inalloc; + unsigned int total_allocated; + unsigned int online_maps; + struct cpumap __percpu *maps; + unsigned long scratch_map[IRQ_MATRIX_SIZE]; + unsigned long system_map[IRQ_MATRIX_SIZE]; +}; + +/** + * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it + * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS + * @alloc_start: From which bit the allocation search starts + * @alloc_end: At which bit the allocation search ends, i.e first + * invalid bit + */ +__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, + unsigned int alloc_start, + unsigned int alloc_end) +{ + struct irq_matrix *m; + + if (matrix_bits > IRQ_MATRIX_BITS) + return NULL; + + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + return NULL; + + m->matrix_bits = matrix_bits; + m->alloc_start = alloc_start; + m->alloc_end = alloc_end; + m->alloc_size = alloc_end - alloc_start; + m->maps = alloc_percpu(*m->maps); + if (!m->maps) { + kfree(m); + return NULL; + } + return m; +} + +/** + * irq_matrix_online - Bring the local CPU matrix online + * @m: Matrix pointer + */ +void irq_matrix_online(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + BUG_ON(cm->online); + + bitmap_zero(cm->alloc_map, m->matrix_bits); + cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); + cm->allocated = 0; + m->global_available += cm->available; + cm->online = true; + m->online_maps++; +} + +/** + * irq_matrix_offline - Bring the local CPU matrix offline + * @m: Matrix pointer + */ +void irq_matrix_offline(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + /* Update the global available size */ + m->global_available -= cm->available; + cm->online = false; + m->online_maps--; +} + +static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm, + unsigned int num, bool managed) +{ + unsigned int area, start = m->alloc_start; + unsigned int end = 
m->alloc_end; + + bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end); + bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end); + area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0); + if (area >= end) + return area; + if (managed) + bitmap_set(cm->managed_map, area, num); + else + bitmap_set(cm->alloc_map, area, num); + return area; +} + +/** + * irq_matrix_assign_system - Assign system wide entry in the matrix + * @m: Matrix pointer + * @bit: Which bit to reserve + * @replace: Replace an already allocated vector with a system + * vector at the same bit position. + * + * The BUG_ON()s below are on purpose. If this goes wrong in the + * early boot process, then the chance to survive is about zero. + * If this happens when the system is live, it's not much better. + */ +void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, + bool replace) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + BUG_ON(bit > m->matrix_bits); + BUG_ON(m->online_maps > 1 || (m->online_maps && !replace)); + + set_bit(bit, m->system_map); + if (replace) { + BUG_ON(!test_and_clear_bit(bit, cm->alloc_map)); + cm->allocated--; + m->total_allocated--; + } + if (bit >= m->alloc_start && bit < m->alloc_end) + m->systembits_inalloc++; +} + +/** + * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map + * @m: Matrix pointer + * @msk: On which CPUs the bits should be reserved. + * + * Can be called for offline CPUs. Note, this will only reserve one bit + * on all CPUs in @msk, but it's not guaranteed that the bits are at the + * same offset on all CPUs + */ +int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk) +{ + unsigned int cpu, failed_cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit; + + bit = matrix_alloc_area(m, cm, 1, true); + if (bit >= m->alloc_end) + goto cleanup; + cm->managed++; + if (cm->online) { + cm->available--; + m->global_available--; + } + } + return 0; +cleanup: + failed_cpu = cpu; + for_each_cpu(cpu, msk) { + if (cpu == failed_cpu) + break; + irq_matrix_remove_managed(m, cpumask_of(cpu)); + } + return -ENOSPC; +} + +/** + * irq_matrix_remove_managed - Remove managed interrupts in a CPU map + * @m: Matrix pointer + * @msk: On which CPUs the bits should be removed + * + * Can be called for offline CPUs + * + * This removes unallocated managed interrupts from the map. It does + * not matter which one because the managed interrupts free their + * allocation when they shut down. If not, the accounting is screwed, + * but all that can be done at this point is warn about it.
+ */ +void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) +{ + unsigned int cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit, end = m->alloc_end; + + if (WARN_ON_ONCE(!cm->managed)) + continue; + + /* Get managed bits which are not allocated */ + bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); + + bit = find_first_bit(m->scratch_map, end); + if (WARN_ON_ONCE(bit >= end)) + continue; + + clear_bit(bit, cm->managed_map); + + cm->managed--; + if (cm->online) { + cm->available++; + m->global_available++; + } + } +} + +/** + * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map + * @m: Matrix pointer + * @cpu: On which CPU the interrupt should be allocated + */ +int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu) +{ + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit, end = m->alloc_end; + + /* Get managed bits which are not allocated */ + bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); + bit = find_first_bit(m->scratch_map, end); + if (bit >= end) + return -ENOSPC; + set_bit(bit, cm->alloc_map); + cm->allocated++; + m->total_allocated++; + return bit; +} + +/** + * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map + * @m: Matrix pointer + * @bit: Which bit to mark + * + * This should only be used to mark preallocated vectors + */ +void irq_matrix_assign(struct irq_matrix *m, unsigned int bit) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) + return; + if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map))) + return; + cm->allocated++; + m->total_allocated++; + cm->available--; + m->global_available--; +} + +/** + * irq_matrix_reserve - Reserve interrupts + * @m: Matrix pointer + * + * This is merely a bookkeeping call. It increments the number of globally + * reserved interrupt bits w/o actually allocating them. This allows setting + * up interrupt descriptors w/o assigning low level resources to them. + * The actual allocation happens when the interrupt gets activated. + */ +void irq_matrix_reserve(struct irq_matrix *m) +{ + if (m->global_reserved <= m->global_available && + m->global_reserved + 1 > m->global_available) + pr_warn("Interrupt reservation exceeds available resources\n"); + + m->global_reserved++; +} + +/** + * irq_matrix_remove_reserved - Remove interrupt reservation + * @m: Matrix pointer + * + * This is merely a bookkeeping call. It decrements the number of globally + * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the + * interrupt was never in use and no real vector was allocated, which would + * have undone the reservation.
+ */ +void irq_matrix_remove_reserved(struct irq_matrix *m) +{ + m->global_reserved--; +} + +/** + * irq_matrix_alloc - Allocate a regular interrupt in a CPU map + * @m: Matrix pointer + * @msk: Which CPUs to search in + * @reserved: Allocate previously reserved interrupts + * @mapped_cpu: Pointer to store the CPU for which the irq was allocated + */ +int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, + bool reserved, unsigned int *mapped_cpu) +{ + unsigned int cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit; + + if (!cm->online) + continue; + + bit = matrix_alloc_area(m, cm, 1, false); + if (bit < m->alloc_end) { + cm->allocated++; + cm->available--; + m->total_allocated++; + m->global_available--; + if (reserved) + m->global_reserved--; + *mapped_cpu = cpu; + return bit; + } + } + return -ENOSPC; +} + +/** + * irq_matrix_free - Free allocated interrupt in the matrix + * @m: Matrix pointer + * @cpu: Which CPU map needs to be updated + * @bit: The bit to remove + * @managed: If true, the interrupt is managed and not accounted + * as available. + */ +void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, + unsigned int bit, bool managed) +{ + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + + if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) + return; + + if (cm->online) { + clear_bit(bit, cm->alloc_map); + cm->allocated--; + m->total_allocated--; + if (!managed) { + cm->available++; + m->global_available++; + } + } +} + +/** + * irq_matrix_available - Get the number of globally available irqs + * @m: Pointer to the matrix to query + * @cpudown: If true, the local CPU is about to go down, adjust + * the number of available irqs accordingly + */ +unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + return m->global_available - (cpudown ? cm->available : 0); +} + +/** + * irq_matrix_reserved - Get the number of globally reserved irqs + * @m: Pointer to the matrix to query + */ +unsigned int irq_matrix_reserved(struct irq_matrix *m) +{ + return m->global_reserved; +} + +/** + * irq_matrix_allocated - Get the number of allocated irqs on the local cpu + * @m: Pointer to the matrix to search + * + * This returns the number of allocated irqs + */ +unsigned int irq_matrix_allocated(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + return cm->allocated; +} + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +/** + * irq_matrix_debug_show - Show detailed allocation information + * @sf: Pointer to the seq_file to print to + * @m: Pointer to the matrix allocator + * @ind: Indentation for the print format + * + * Note, this is a lockless snapshot.
+ */ +void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind) +{ + unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits); + int cpu; + + seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps); + seq_printf(sf, "Global available: %6u\n", m->global_available); + seq_printf(sf, "Global reserved: %6u\n", m->global_reserved); + seq_printf(sf, "Total allocated: %6u\n", m->total_allocated); + seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits, + m->system_map); + seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " "); + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + + seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ", + cpu, cm->available, cm->managed, cm->allocated, + m->matrix_bits, cm->alloc_map); + } + cpus_read_unlock(); +} +#endif From ec0f7cd273dc41ab28bba703cac82690ea5f2863 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:15 +0200 Subject: [PATCH 0097/1085] genirq/matrix: Add tracepoints Add tracepoints for the irq bitmap matrix allocator. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.279468022@linutronix.de --- include/trace/events/irq_matrix.h | 201 ++++++++++++++++++++++++++++++ kernel/irq/matrix.c | 15 +++ 2 files changed, 216 insertions(+) create mode 100644 include/trace/events/irq_matrix.h diff --git a/include/trace/events/irq_matrix.h b/include/trace/events/irq_matrix.h new file mode 100644 index 000000000000..267d4cbbf360 --- /dev/null +++ b/include/trace/events/irq_matrix.h @@ -0,0 +1,201 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq_matrix + +#if !defined(_TRACE_IRQ_MATRIX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IRQ_MATRIX_H + +#include + +struct irq_matrix; +struct cpumap; + +DECLARE_EVENT_CLASS(irq_matrix_global, + + TP_PROTO(struct irq_matrix *matrix), + + TP_ARGS(matrix), + + TP_STRUCT__entry( + __field( unsigned int, online_maps ) + __field( unsigned int, global_available ) + __field( unsigned int, global_reserved ) + __field( unsigned int, total_allocated ) + ), + + TP_fast_assign( + __entry->online_maps = matrix->online_maps; + __entry->global_available = matrix->global_available; + __entry->global_reserved = matrix->global_reserved; + __entry->total_allocated = matrix->total_allocated; + ), + + TP_printk("online_maps=%d global_avl=%u, global_rsvd=%u, total_alloc=%u", + __entry->online_maps, __entry->global_available, + __entry->global_reserved, __entry->total_allocated) +); + +DECLARE_EVENT_CLASS(irq_matrix_global_update, + + TP_PROTO(int bit, struct irq_matrix *matrix), + + TP_ARGS(bit, matrix), + + TP_STRUCT__entry( + __field( int, bit ) + __field( unsigned int, online_maps ) + __field( unsigned int, global_available ) + __field( unsigned int, global_reserved ) + __field( unsigned int, total_allocated ) + ), + + TP_fast_assign( + __entry->bit = bit; + __entry->online_maps = matrix->online_maps; + __entry->global_available = matrix->global_available; + __entry->global_reserved = matrix->global_reserved; + __entry->total_allocated = matrix->total_allocated; + ), + + TP_printk("bit=%d online_maps=%d global_avl=%u, 
global_rsvd=%u, total_alloc=%u", + __entry->bit, __entry->online_maps, + __entry->global_available, __entry->global_reserved, + __entry->total_allocated) +); + +DECLARE_EVENT_CLASS(irq_matrix_cpu, + + TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix, + struct cpumap *cmap), + + TP_ARGS(bit, cpu, matrix, cmap), + + TP_STRUCT__entry( + __field( int, bit ) + __field( unsigned int, cpu ) + __field( bool, online ) + __field( unsigned int, available ) + __field( unsigned int, allocated ) + __field( unsigned int, managed ) + __field( unsigned int, online_maps ) + __field( unsigned int, global_available ) + __field( unsigned int, global_reserved ) + __field( unsigned int, total_allocated ) + ), + + TP_fast_assign( + __entry->bit = bit; + __entry->cpu = cpu; + __entry->online = cmap->online; + __entry->available = cmap->available; + __entry->allocated = cmap->allocated; + __entry->managed = cmap->managed; + __entry->online_maps = matrix->online_maps; + __entry->global_available = matrix->global_available; + __entry->global_reserved = matrix->global_reserved; + __entry->total_allocated = matrix->total_allocated; + ), + + TP_printk("bit=%d cpu=%u online=%d avl=%u alloc=%u managed=%u online_maps=%u global_avl=%u, global_rsvd=%u, total_alloc=%u", + __entry->bit, __entry->cpu, __entry->online, + __entry->available, __entry->allocated, + __entry->managed, __entry->online_maps, + __entry->global_available, __entry->global_reserved, + __entry->total_allocated) +); + +DEFINE_EVENT(irq_matrix_global, irq_matrix_online, + + TP_PROTO(struct irq_matrix *matrix), + + TP_ARGS(matrix) +); + +DEFINE_EVENT(irq_matrix_global, irq_matrix_offline, + + TP_PROTO(struct irq_matrix *matrix), + + TP_ARGS(matrix) +); + +DEFINE_EVENT(irq_matrix_global, irq_matrix_reserve, + + TP_PROTO(struct irq_matrix *matrix), + + TP_ARGS(matrix) +); + +DEFINE_EVENT(irq_matrix_global, irq_matrix_remove_reserved, + + TP_PROTO(struct irq_matrix *matrix), + + TP_ARGS(matrix) +); + +DEFINE_EVENT(irq_matrix_global_update, irq_matrix_assign_system, + + TP_PROTO(int bit, struct irq_matrix *matrix), + + TP_ARGS(bit, matrix) +); + +DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc_reserved, + + TP_PROTO(int bit, unsigned int cpu, + struct irq_matrix *matrix, struct cpumap *cmap), + + TP_ARGS(bit, cpu, matrix, cmap) +); + +DEFINE_EVENT(irq_matrix_cpu, irq_matrix_reserve_managed, + + TP_PROTO(int bit, unsigned int cpu, + struct irq_matrix *matrix, struct cpumap *cmap), + + TP_ARGS(bit, cpu, matrix, cmap) +); + +DEFINE_EVENT(irq_matrix_cpu, irq_matrix_remove_managed, + + TP_PROTO(int bit, unsigned int cpu, + struct irq_matrix *matrix, struct cpumap *cmap), + + TP_ARGS(bit, cpu, matrix, cmap) +); + +DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc_managed, + + TP_PROTO(int bit, unsigned int cpu, + struct irq_matrix *matrix, struct cpumap *cmap), + + TP_ARGS(bit, cpu, matrix, cmap) +); + +DEFINE_EVENT(irq_matrix_cpu, irq_matrix_assign, + + TP_PROTO(int bit, unsigned int cpu, + struct irq_matrix *matrix, struct cpumap *cmap), + + TP_ARGS(bit, cpu, matrix, cmap) +); + +DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc, + + TP_PROTO(int bit, unsigned int cpu, + struct irq_matrix *matrix, struct cpumap *cmap), + + TP_ARGS(bit, cpu, matrix, cmap) +); + +DEFINE_EVENT(irq_matrix_cpu, irq_matrix_free, + + TP_PROTO(int bit, unsigned int cpu, + struct irq_matrix *matrix, struct cpumap *cmap), + + TP_ARGS(bit, cpu, matrix, cmap) +); + + +#endif /* _TRACE_IRQ_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/irq/matrix.c 
b/kernel/irq/matrix.c index 7b2b4fbde1e2..a3cbbc8191c5 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -36,6 +36,9 @@ struct irq_matrix { unsigned long system_map[IRQ_MATRIX_SIZE]; }; +#define CREATE_TRACE_POINTS +#include + /** * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS @@ -84,6 +87,7 @@ void irq_matrix_online(struct irq_matrix *m) m->global_available += cm->available; cm->online = true; m->online_maps++; + trace_irq_matrix_online(m); } /** @@ -98,6 +102,7 @@ void irq_matrix_offline(struct irq_matrix *m) m->global_available -= cm->available; cm->online = false; m->online_maps--; + trace_irq_matrix_offline(m); } static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm, @@ -145,6 +150,8 @@ void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, } if (bit >= m->alloc_start && bit < m->alloc_end) m->systembits_inalloc++; + + trace_irq_matrix_assign_system(bit, m); } /** @@ -172,6 +179,7 @@ int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk) cm->available--; m->global_available--; } + trace_irq_matrix_reserve_managed(bit, cpu, m, cm); } return 0; cleanup: @@ -221,6 +229,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) cm->available++; m->global_available++; } + trace_irq_matrix_remove_managed(bit, cpu, m, cm); } } @@ -242,6 +251,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu) set_bit(bit, cm->alloc_map); cm->allocated++; m->total_allocated++; + trace_irq_matrix_alloc_managed(bit, cpu, m, cm); return bit; } @@ -264,6 +274,7 @@ void irq_matrix_assign(struct irq_matrix *m, unsigned int bit) m->total_allocated++; cm->available--; m->global_available--; + trace_irq_matrix_assign(bit, smp_processor_id(), m, cm); } /** @@ -282,6 +293,7 @@ void irq_matrix_reserve(struct irq_matrix *m) pr_warn("Interrupt reservation exceeds available resources\n"); m->global_reserved++; + trace_irq_matrix_reserve(m); } /** @@ -296,6 +308,7 @@ void irq_matrix_reserve(struct irq_matrix *m) void irq_matrix_remove_reserved(struct irq_matrix *m) { m->global_reserved--; + trace_irq_matrix_remove_reserved(m); } /** @@ -326,6 +339,7 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, if (reserved) m->global_reserved--; *mapped_cpu = cpu; + trace_irq_matrix_alloc(bit, cpu, m, cm); return bit; } } @@ -357,6 +371,7 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, m->global_available++; } } + trace_irq_matrix_free(bit, cpu, m, cm); } /** From 981c2eac1cb97c7db64acf567950e7e81019dd33 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:16 +0200 Subject: [PATCH 0098/1085] x86/apic: Deinline x2apic functions These inline functions are used in both the cluster and the physical x2apic code to fill in the function pointers of the apic structure. That means the code is generated twice for no reason. Move it to a C code and reuse it. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.358954066@linutronix.de --- arch/x86/include/asm/x2apic.h | 49 --------------------------- arch/x86/kernel/apic/x2apic.h | 9 +++++ arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 40 +++++++++++++++++++++- 4 files changed, 49 insertions(+), 51 deletions(-) delete mode 100644 arch/x86/include/asm/x2apic.h create mode 100644 arch/x86/kernel/apic/x2apic.h diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h deleted file mode 100644 index f90f0a587c66..000000000000 --- a/arch/x86/include/asm/x2apic.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Common bits for X2APIC cluster/physical modes. - */ - -#ifndef _ASM_X86_X2APIC_H -#define _ASM_X86_X2APIC_H - -#include -#include -#include - -static int x2apic_apic_id_valid(int apicid) -{ - return 1; -} - -static int x2apic_apic_id_registered(void) -{ - return 1; -} - -static void -__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) -{ - unsigned long cfg = __prepare_ICR(0, vector, dest); - native_x2apic_icr_write(cfg, apicid); -} - -static unsigned int x2apic_get_apic_id(unsigned long id) -{ - return id; -} - -static unsigned long x2apic_set_apic_id(unsigned int id) -{ - return id; -} - -static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) -{ - return initial_apicid >> index_msb; -} - -static void x2apic_send_IPI_self(int vector) -{ - apic_write(APIC_SELF_IPI, vector); -} - -#endif /* _ASM_X86_X2APIC_H */ diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h new file mode 100644 index 000000000000..4c38c2328948 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic.h @@ -0,0 +1,9 @@ +/* Common bits for X2APIC cluster/physical modes. 
*/ + +int x2apic_apic_id_valid(int apicid); +int x2apic_apic_id_registered(void); +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); +unsigned int x2apic_get_apic_id(unsigned long id); +unsigned long x2apic_set_apic_id(unsigned int id); +int x2apic_phys_pkg_id(int initial_apicid, int index_msb); +void x2apic_send_IPI_self(int vector); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 481237cb1544..d7f5132ba5ca 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -8,7 +8,7 @@ #include #include -#include +#include "x2apic.h" static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 3baf0c3dc875..a4f28212dc5d 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -6,7 +6,8 @@ #include #include -#include +#include +#include "x2apic.h" int x2apic_phys; @@ -98,6 +99,43 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } +/* Common x2apic functions, also used by x2apic_cluster */ +int x2apic_apic_id_valid(int apicid) +{ + return 1; +} + +int x2apic_apic_id_registered(void) +{ + return 1; +} + +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +{ + unsigned long cfg = __prepare_ICR(0, vector, dest); + native_x2apic_icr_write(cfg, apicid); +} + +unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +unsigned long x2apic_set_apic_id(unsigned int id) +{ + return id; +} + +int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +{ + return initial_apicid >> index_msb; +} + +void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", From 727657e6205d201e9acdb5d2c25bc1cd63c0ab16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:17 +0200 Subject: [PATCH 0099/1085] x86/apic: Sanitize return value of apic.set_apic_id() The set_apic_id() callback returns an unsigned long value which is handed in to apic_write() as the value argument u32. Adjust the return value so it returns u32 right away. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.437208268@linutronix.de --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/apic/apic_numachip.c | 4 ++-- arch/x86/kernel/apic/x2apic.h | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- arch/x86/xen/apic.c | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 7d247b2d8c54..86a3a359e603 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -305,7 +305,7 @@ struct apic { unsigned int (*get_apic_id)(unsigned long x); /* Can't be NULL on 64-bit */ - unsigned long (*set_apic_id)(unsigned int id); + u32 (*set_apic_id)(unsigned int id); int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, struct irq_data *irqdata, diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index dedd5a41ba48..6543648f0b81 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -119,7 +119,7 @@ static unsigned int flat_get_apic_id(unsigned long x) return (x >> 24) & 0xFF; } -static unsigned long set_apic_id(unsigned int id) +static u32 set_apic_id(unsigned int id) { return (id & 0xFF) << 24; } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 2fda912219a6..d77c8cc4afc2 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -38,7 +38,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) return id; } -static unsigned long numachip1_set_apic_id(unsigned int id) +static u32 numachip1_set_apic_id(unsigned int id) { return (id & 0xff) << 24; } @@ -51,7 +51,7 @@ static unsigned int numachip2_get_apic_id(unsigned long x) return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } -static unsigned long numachip2_set_apic_id(unsigned int id) +static u32 numachip2_set_apic_id(unsigned int id) { return id << 24; } diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h index 4c38c2328948..b107de381cb5 100644 --- a/arch/x86/kernel/apic/x2apic.h +++ b/arch/x86/kernel/apic/x2apic.h @@ -4,6 +4,6 @@ int x2apic_apic_id_valid(int apicid); int x2apic_apic_id_registered(void); void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); unsigned int x2apic_get_apic_id(unsigned long id); -unsigned long x2apic_set_apic_id(unsigned int id); +u32 x2apic_set_apic_id(unsigned int id); int x2apic_phys_pkg_id(int initial_apicid, int index_msb); void x2apic_send_IPI_self(int vector); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a4f28212dc5d..315c44eda399 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -121,7 +121,7 @@ unsigned int x2apic_get_apic_id(unsigned long id) return id; } -unsigned long x2apic_set_apic_id(unsigned int id) +u32 x2apic_set_apic_id(unsigned int id) { return id; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0d57bb9079c9..1b23b9e0a8af 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -547,7 +547,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x) return id; } -static unsigned long set_apic_id(unsigned int id) +static u32 set_apic_id(unsigned int id) { /* CHECKME: Do we need to mask out the xapic extra bits? 
*/ return id; diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index b5e48da7fbff..652d62458d8d 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -30,7 +30,7 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) return 0xfd; } -static unsigned long xen_set_apic_id(unsigned int x) +static u32 xen_set_apic_id(unsigned int x) { WARN_ON(1); return x; From 57e0aa446176493f69a8f8e270e9c4addca80772 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:18 +0200 Subject: [PATCH 0100/1085] x86/apic: Sanitize return value of check_apicid_used() The check is boolean, but the function returns unsigned long for no value. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.516730518@linutronix.de --- arch/x86/include/asm/apic.h | 4 ++-- arch/x86/kernel/apic/bigsmp_32.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 86a3a359e603..63f4ad5123cc 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -289,7 +289,7 @@ struct apic { int disable_esr; int dest_logical; - unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); + bool (*check_apicid_used)(physid_mask_t *map, int apicid); void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, const struct cpumask *mask); @@ -581,7 +581,7 @@ default_vector_allocation_domain(int cpu, struct cpumask *retmask, cpumask_copy(retmask, cpumask_of(cpu)); } -static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) +static inline bool default_check_apicid_used(physid_mask_t *map, int apicid) { return physid_isset(apicid, *map); } diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 456e45e8bf84..6eb5f10f1599 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -26,9 +26,9 @@ static int bigsmp_apic_id_registered(void) return 1; } -static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) +static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { - return 0; + return false; } static int bigsmp_early_logical_apicid(int cpu) From 0801bbaac00b2c729adb1b1b0e0945ca8bbea088 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:19 +0200 Subject: [PATCH 0101/1085] x86/apic: Move probe32 specific APIC functions The apic functions which are used in probe_32.c are implemented as inlines or in apic.c. There is no reason to have them at random places. Move them to the actual usage site and make them static. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.596768194@linutronix.de --- arch/x86/include/asm/apic.h | 21 --------------------- arch/x86/kernel/apic/apic.c | 10 ---------- arch/x86/kernel/apic/probe_32.c | 25 +++++++++++++++++++++++++ 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 63f4ad5123cc..06a023b2dca1 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -524,31 +524,10 @@ extern void default_setup_apic_routing(void); extern struct apic apic_noop; #ifdef CONFIG_X86_32 - static inline int noop_x86_32_early_logical_apicid(int cpu) { return BAD_APICID; } - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -extern void default_init_apic_ldr(void); - -static inline int default_apic_id_registered(void) -{ - return physid_isset(read_apic_id(), phys_cpu_present_map); -} - -static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - #endif extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ffcd7556795f..1b1aeda189d7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2273,16 +2273,6 @@ int hard_smp_processor_id(void) return read_apic_id(); } -void default_init_apic_ldr(void) -{ - unsigned long val; - - apic_write(APIC_DFR, APIC_DFR_VALUE); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); - apic_write(APIC_LDR, val); -} - int default_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, unsigned int *apicid) diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 63287659adb6..12d171204c8a 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -66,6 +66,31 @@ static void setup_apic_flat_routing(void) #endif } +static int default_apic_id_registered(void) +{ + return physid_isset(read_apic_id(), phys_cpu_present_map); +} + +/* + * Set up the logical destination ID. Intel recommends to set DFR, LDR and + * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual" + * (Intel document number 292116). + */ +static void default_init_apic_ldr(void) +{ + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); +} + +static int default_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + /* should be called last. */ static int probe_default(void) { From 1da91779e1fb79aaed3de118a156b7040f6147c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:20 +0200 Subject: [PATCH 0102/1085] x86/apic: Move APIC noop specific functions Move more inlines to the place where they belong. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.677743545@linutronix.de --- arch/x86/include/asm/apic.h | 7 ------- arch/x86/kernel/apic/apic_noop.c | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 06a023b2dca1..cf10be9afde0 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -523,13 +523,6 @@ extern void default_setup_apic_routing(void); extern struct apic apic_noop; -#ifdef CONFIG_X86_32 -static inline int noop_x86_32_early_logical_apicid(int cpu) -{ - return BAD_APICID; -} -#endif - extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask, struct irq_data *irqdata, unsigned int *apicid); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 6599f437b4ab..c2c6bac28e79 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -108,6 +108,13 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } +#ifdef CONFIG_X86_32 +static int noop_x86_32_early_logical_apicid(int cpu) +{ + return BAD_APICID; +} +#endif + struct apic apic_noop __ro_after_init = { .name = "noop", .probe = noop_probe, From 64063505835663c67cf18524c46e1eb70d30fb54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:21 +0200 Subject: [PATCH 0103/1085] x86/apic: Sanitize 32/64bit APIC callbacks The 32bit and the 64bit implementation of default_cpu_present_to_apicid() and default_check_phys_apicid_present() are exactly the same, but implemented and located differently. Move them to common apic code and get rid of the pointless difference. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.757329991@linutronix.de --- arch/x86/include/asm/apic.h | 30 ------------------------------ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kernel/apic/Makefile | 2 +- arch/x86/kernel/apic/apic_common.c | 20 ++++++++++++++++++++ arch/x86/kernel/setup.c | 12 ------------ 5 files changed, 22 insertions(+), 44 deletions(-) create mode 100644 arch/x86/kernel/apic/apic_common.c diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index cf10be9afde0..6561ea088b6a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -472,9 +472,6 @@ static inline unsigned default_get_apic_id(unsigned long x) extern void apic_send_IPI_self(int vector); DECLARE_PER_CPU(int, x2apic_extra_bits); - -extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int phys_apicid); #endif extern void generic_bigsmp_probe(void); @@ -563,35 +560,8 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma *retmap = *phys_map; } -static inline int __default_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); - else - return BAD_APICID; -} - -static inline int -__default_check_phys_apicid_present(int phys_apicid) -{ - return physid_isset(phys_apicid, phys_cpu_present_map); -} - -#ifdef CONFIG_X86_32 -static inline int default_cpu_present_to_apicid(int mps_cpu) -{ - return __default_cpu_present_to_apicid(mps_cpu); -} - -static inline int -default_check_phys_apicid_present(int phys_apicid) -{ - return __default_check_phys_apicid_present(phys_apicid); -} -#else extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); -#endif #endif /* CONFIG_X86_LOCAL_APIC */ extern void irq_enter(void); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c73e493adf07..9d7d856b2d89 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1419,7 +1419,7 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC - return __default_cpu_present_to_apicid(mps_cpu); + return default_cpu_present_to_apicid(mps_cpu); #else WARN_ON_ONCE(1); return BAD_APICID; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 8e63ebdcbd0b..bd65ce2e768e 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -6,7 +6,7 @@ # In particualr, smp_apic_timer_interrupt() is called in random places. 
KCOV_INSTRUMENT := n -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c new file mode 100644 index 000000000000..2a084d48db37 --- /dev/null +++ b/arch/x86/kernel/apic/apic_common.c @@ -0,0 +1,20 @@ +/* + * Common functions shared between the various APIC flavours + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include +#include + +int default_cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + else + return BAD_APICID; +} + +int default_check_phys_apicid_present(int phys_apicid) +{ + return physid_isset(phys_apicid, phys_cpu_present_map); +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0957dd73d127..82559867e0a9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -136,18 +136,6 @@ RESERVE_BRK(dmi_alloc, 65536); static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; -#ifdef CONFIG_X86_64 -int default_cpu_present_to_apicid(int mps_cpu) -{ - return __default_cpu_present_to_apicid(mps_cpu); -} - -int default_check_phys_apicid_present(int phys_apicid) -{ - return __default_check_phys_apicid_present(phys_apicid); -} -#endif - struct boot_params boot_params; /* From 83a105229c59e433409e4d86e9bb915ca281235c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:22 +0200 Subject: [PATCH 0104/1085] x86/apic: Move common APIC callbacks Move more apic struct specific functions out of the header and the apic management code into the common source file. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.834421893@linutronix.de --- arch/x86/include/asm/apic.h | 73 +++++----------------------- arch/x86/kernel/apic/apic.c | 28 ----------- arch/x86/kernel/apic/apic_common.c | 78 ++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 89 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 6561ea088b6a..1081cfb4f159 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -476,94 +476,45 @@ DECLARE_PER_CPU(int, x2apic_extra_bits); extern void generic_bigsmp_probe(void); - #ifdef CONFIG_X86_LOCAL_APIC #include #define APIC_DFR_VALUE (APIC_DFR_FLAT) -static inline const struct cpumask *default_target_cpus(void) -{ -#ifdef CONFIG_SMP - return cpu_online_mask; -#else - return cpumask_of(0); -#endif -} - -static inline const struct cpumask *online_target_cpus(void) -{ - return cpu_online_mask; -} - DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); +extern struct apic apic_noop; static inline unsigned int read_apic_id(void) { - unsigned int reg; - - reg = apic_read(APIC_ID); + unsigned int reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } -static inline int default_apic_id_valid(int apicid) -{ - return (apicid < 255); -} - +extern const struct cpumask *default_target_cpus(void); +extern const struct cpumask *online_target_cpus(void); +extern int default_apic_id_valid(int apicid); extern int default_acpi_madt_oem_check(char *, char *); - extern void default_setup_apic_routing(void); - -extern struct apic apic_noop; - extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask, struct irq_data *irqdata, unsigned int *apicid); extern int default_cpu_mask_to_apicid(const struct cpumask *cpumask, struct irq_data *irqdata, unsigned int *apicid); - -static inline void -flat_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. 
- */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - -static inline void -default_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - cpumask_copy(retmask, cpumask_of(cpu)); -} - -static inline bool default_check_apicid_used(physid_mask_t *map, int apicid) -{ - return physid_isset(apicid, *map); -} - -static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - *retmap = *phys_map; -} - +extern bool default_check_apicid_used(physid_mask_t *map, int apicid); +extern void flat_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask); +extern void default_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask); +extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); #endif /* CONFIG_X86_LOCAL_APIC */ + extern void irq_enter(void); extern void irq_exit(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1b1aeda189d7..ca5ec3fddc49 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2273,34 +2273,6 @@ int hard_smp_processor_id(void) return read_apic_id(); } -int default_cpu_mask_to_apicid(const struct cpumask *mask, - struct irq_data *irqdata, - unsigned int *apicid) -{ - unsigned int cpu = cpumask_first(mask); - - if (cpu >= nr_cpu_ids) - return -EINVAL; - *apicid = per_cpu(x86_cpu_to_apicid, cpu); - irq_data_update_effective_affinity(irqdata, cpumask_of(cpu)); - return 0; -} - -int flat_cpu_mask_to_apicid(const struct cpumask *mask, - struct irq_data *irqdata, - unsigned int *apicid) - -{ - struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); - unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS; - - if (!cpu_mask) - return -EINVAL; - *apicid = (unsigned int)cpu_mask; - cpumask_bits(effmsk)[0] = cpu_mask; - return 0; -} - /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 2a084d48db37..43f9eac53437 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -6,6 +6,64 @@ #include #include +int default_cpu_mask_to_apicid(const struct cpumask *msk, struct irq_data *irqd, + unsigned int *apicid) +{ + unsigned int cpu = cpumask_first(msk); + + if (cpu >= nr_cpu_ids) + return -EINVAL; + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); + return 0; +} + +int flat_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqd, + unsigned int *apicid) + +{ + struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqd); + unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS; + + if (!cpu_mask) + return -EINVAL; + *apicid = (unsigned int)cpu_mask; + cpumask_bits(effmsk)[0] = cpu_mask; + return 0; +} + +bool default_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +void flat_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + /* + * Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. 
+ * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; +} + +void default_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + cpumask_copy(retmask, cpumask_of(cpu)); +} + +void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) +{ + *retmap = *phys_map; +} + int default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) @@ -13,8 +71,28 @@ int default_cpu_present_to_apicid(int mps_cpu) else return BAD_APICID; } +EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); int default_check_phys_apicid_present(int phys_apicid) { return physid_isset(phys_apicid, phys_cpu_present_map); } + +const struct cpumask *default_target_cpus(void) +{ +#ifdef CONFIG_SMP + return cpu_online_mask; +#else + return cpumask_of(0); +#endif +} + +const struct cpumask *online_target_cpus(void) +{ + return cpu_online_mask; +} + +int default_apic_id_valid(int apicid) +{ + return (apicid < 255); +} From 72f48a38505de105e798d4783942df073aeab7ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:23 +0200 Subject: [PATCH 0105/1085] x86/apic: Reorganize struct apic struct apic has just grown over time by adding function pointers in random places. Reorganize it so it becomes more cache line friendly. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.913642524@linutronix.de --- arch/x86/include/asm/apic.h | 103 ++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1081cfb4f159..e3e0883fa96f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -274,73 +274,63 @@ struct irq_data; * James Cleverdon. 
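The reordering below follows a common layout pattern: the hot-path function pointers go at the front of the structure so they share the leading cache line(s), while probe/setup members and debug data move to the tail. A toy illustration of the principle using offsetof() (the member names are invented; only the layout idea matches the patch):

#include <stdio.h>
#include <stddef.h>

/* Hot-path callbacks first, init-only and debug members last. */
struct ops_hot_first {
        void (*eoi)(unsigned int reg, unsigned int val);   /* hot path */
        void (*write)(unsigned int reg, unsigned int val); /* hot path */
        unsigned int (*read)(unsigned int reg);            /* hot path */
        int (*probe)(void);                                /* init only */
        const char *name;                                  /* debug only */
};

int main(void)
{
        printf("eoi   at offset %zu\n", offsetof(struct ops_hot_first, eoi));
        printf("probe at offset %zu\n", offsetof(struct ops_hot_first, probe));
        printf("name  at offset %zu\n", offsetof(struct ops_hot_first, name));
        return 0;
}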
*/ struct apic { - char *name; + /* Hotpath functions first */ + void (*eoi_write)(u32 reg, u32 v); + void (*native_eoi_write)(u32 reg, u32 v); + void (*write)(u32 reg, u32 v); + u32 (*read)(u32 reg); - int (*probe)(void); - int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - int (*apic_id_valid)(int apicid); - int (*apic_id_registered)(void); + /* IPI related functions */ + void (*wait_icr_idle)(void); + u32 (*safe_wait_icr_idle)(void); - u32 irq_delivery_mode; - u32 irq_dest_mode; + void (*send_IPI)(int cpu, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); + void (*send_IPI_allbutself)(int vector); + void (*send_IPI_all)(int vector); + void (*send_IPI_self)(int vector); + /* dest_logical is used by the IPI functions */ + u32 dest_logical; + u32 disable_esr; + u32 irq_delivery_mode; + u32 irq_dest_mode; + + /* Functions and data related to vector allocation */ const struct cpumask *(*target_cpus)(void); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, + const struct cpumask *mask); + int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, + struct irq_data *irqdata, + unsigned int *apicid); - int disable_esr; + /* ICR related functions */ + u64 (*icr_read)(void); + void (*icr_write)(u32 low, u32 high); - int dest_logical; - bool (*check_apicid_used)(physid_mask_t *map, int apicid); + /* Probe, setup and smpboot functions */ + int (*probe)(void); + int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); + int (*apic_id_valid)(int apicid); + int (*apic_id_registered)(void); - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, - const struct cpumask *mask); - void (*init_apic_ldr)(void); + bool (*check_apicid_used)(physid_mask_t *map, int apicid); + void (*init_apic_ldr)(void); + void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); + void (*setup_apic_routing)(void); + int (*cpu_present_to_apicid)(int mps_cpu); + void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); + int (*check_phys_apicid_present)(int phys_apicid); + int (*phys_pkg_id)(int cpuid_apic, int index_msb); - void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); - - void (*setup_apic_routing)(void); - int (*cpu_present_to_apicid)(int mps_cpu); - void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); - int (*check_phys_apicid_present)(int phys_apicid); - int (*phys_pkg_id)(int cpuid_apic, int index_msb); - - unsigned int (*get_apic_id)(unsigned long x); - /* Can't be NULL on 64-bit */ - u32 (*set_apic_id)(unsigned int id); - - int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, - struct irq_data *irqdata, - unsigned int *apicid); - - /* ipi */ - void (*send_IPI)(int cpu, int vector); - void (*send_IPI_mask)(const struct cpumask *mask, int vector); - void (*send_IPI_mask_allbutself)(const struct cpumask *mask, - int vector); - void (*send_IPI_allbutself)(int vector); - void (*send_IPI_all)(int vector); - void (*send_IPI_self)(int vector); + u32 (*get_apic_id)(unsigned long x); + u32 (*set_apic_id)(unsigned int id); /* wakeup_secondary_cpu */ - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - void (*inquire_remote_apic)(int apicid); - - /* apic ops */ - u32 (*read)(u32 reg); - void (*write)(u32 reg, u32 v); - /* - * ->eoi_write() has the same signature as ->write(). 
- * - * Drivers can support both ->eoi_write() and ->write() by passing the same - * callback value. Kernel can override ->eoi_write() and fall back - * on write for EOI. - */ - void (*eoi_write)(u32 reg, u32 v); - void (*native_eoi_write)(u32 reg, u32 v); - u64 (*icr_read)(void); - void (*icr_write)(u32 low, u32 high); - void (*wait_icr_idle)(void); - u32 (*safe_wait_icr_idle)(void); + void (*inquire_remote_apic)(int apicid); #ifdef CONFIG_X86_32 /* @@ -355,6 +345,7 @@ struct apic { */ int (*x86_32_early_logical_apicid)(int cpu); #endif + char *name; }; /* From 023a611748fd58d46c8aa049cf4f22ebada983f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:24 +0200 Subject: [PATCH 0106/1085] x86/apic/x2apic: Simplify cluster management The cluster management code creates a cluster mask per cpu, which requires that on cpu on/offline all cluster masks have to be iterated and updated. Other information about the cluster is in different per cpu variables. Create a data structure which holds all information about a cluster and fill it in when the first CPU of a cluster comes online. If another CPU of a cluster comes online it just finds the pointer to the existing cluster structure and reuses it. That simplifies all usage sites and gets rid of quite some pointless iterations over the online cpus to find the cpus which belong to the cluster. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.992629420@linutronix.de --- arch/x86/kernel/apic/x2apic_cluster.c | 154 +++++++++++++------------- 1 file changed, 76 insertions(+), 78 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index d7f5132ba5ca..729c0a512b72 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,20 +10,22 @@ #include #include "x2apic.h" +struct cluster_mask { + unsigned int clusterid; + int node; + struct cpumask mask; +}; + static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); -static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster); static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); +static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks); +static struct cluster_mask *cluster_hotplug_mask; static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled(); } -static inline u32 x2apic_cluster(int cpu) -{ - return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; -} - static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); @@ -35,49 +37,34 @@ static void x2apic_send_IPI(int cpu, int vector) static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { - struct cpumask *cpus_in_cluster_ptr; - struct cpumask *ipi_mask_ptr; - unsigned int cpu, this_cpu; + unsigned int cpu, clustercpu; + struct cpumask *tmpmsk; unsigned long flags; u32 dest; x2apic_wrmsr_fence(); - local_irq_save(flags); - this_cpu = smp_processor_id(); + tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); + cpumask_copy(tmpmsk, mask); + /* If IPI should not be sent to self, clear current CPU */ + if (apic_dest != APIC_DEST_ALLINC) + 
cpumask_clear_cpu(smp_processor_id(), tmpmsk); - /* - * We are to modify mask, so we need an own copy - * and be sure it's manipulated with irq off. - */ - ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask); - cpumask_copy(ipi_mask_ptr, mask); + /* Collapse cpus in a cluster so a single IPI per cluster is sent */ + for_each_cpu(cpu, tmpmsk) { + struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); - /* - * The idea is to send one IPI per cluster. - */ - for_each_cpu(cpu, ipi_mask_ptr) { - unsigned long i; - - cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu); dest = 0; - - /* Collect cpus in cluster. */ - for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) { - if (apic_dest == APIC_DEST_ALLINC || i != this_cpu) - dest |= per_cpu(x86_cpu_to_logical_apicid, i); - } + for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) + dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu); if (!dest) continue; __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); - /* - * Cluster sibling cpus should be discared now so - * we would not send IPI them second time. - */ - cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr); + /* Remove cluster CPUs from tmpmask */ + cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } local_irq_restore(flags); @@ -109,91 +96,100 @@ x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, unsigned int *apicid) { struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); + struct cluster_mask *cmsk; unsigned int cpu; u32 dest = 0; - u16 cluster; cpu = cpumask_first(mask); if (cpu >= nr_cpu_ids) return -EINVAL; - dest = per_cpu(x86_cpu_to_logical_apicid, cpu); - cluster = x2apic_cluster(cpu); - + cmsk = per_cpu(cluster_masks, cpu); cpumask_clear(effmsk); - for_each_cpu(cpu, mask) { - if (cluster != x2apic_cluster(cpu)) - continue; + for_each_cpu_and(cpu, &cmsk->mask, mask) { dest |= per_cpu(x86_cpu_to_logical_apicid, cpu); cpumask_set_cpu(cpu, effmsk); } - *apicid = dest; return 0; } static void init_x2apic_ldr(void) { - unsigned int this_cpu = smp_processor_id(); + struct cluster_mask *cmsk = this_cpu_read(cluster_masks); + u32 cluster, apicid = apic_read(APIC_LDR); unsigned int cpu; - per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); + this_cpu_write(x86_cpu_to_logical_apicid, apicid); - cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + if (cmsk) + goto update; + + cluster = apicid >> 16; for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); + cmsk = per_cpu(cluster_masks, cpu); + /* Matching cluster found. Link and update it. */ + if (cmsk && cmsk->clusterid == cluster) + goto update; } + cmsk = cluster_hotplug_mask; + cluster_hotplug_mask = NULL; +update: + this_cpu_write(cluster_masks, cmsk); + cpumask_set_cpu(smp_processor_id(), &cmsk->mask); } -/* - * At CPU state changes, update the x2apic cluster sibling info. - */ -static int x2apic_prepare_cpu(unsigned int cpu) +static int alloc_clustermask(unsigned int cpu, int node) { - if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) { - free_cpumask_var(per_cpu(cpus_in_cluster, cpu)); - return -ENOMEM; + if (per_cpu(cluster_masks, cpu)) + return 0; + /* + * If a hotplug spare mask exists, check whether it's on the right + * node. If not, free it and allocate a new one. 
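A compact user-space model of this allocate-or-reuse scheme: the first CPU of a cluster to come online claims the preallocated spare, and later siblings just link to the existing structure. All names, the 64-bit mask, and the linear lookup are illustrative assumptions, not the kernel implementation:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define NR_CPUS 8

struct cluster {
        unsigned int id;
        uint64_t mask;
};

static struct cluster *cpu_cluster[NR_CPUS];
static struct cluster *spare;

static void cpu_online(unsigned int cpu, unsigned int clusterid)
{
        struct cluster *c = NULL;

        for (unsigned int i = 0; i < NR_CPUS; i++) {
                if (cpu_cluster[i] && cpu_cluster[i]->id == clusterid) {
                        c = cpu_cluster[i];   /* reuse existing cluster */
                        break;
                }
        }
        if (!c) {
                c = spare;                    /* claim the hotplug spare */
                spare = NULL;
                c->id = clusterid;
        }
        cpu_cluster[cpu] = c;
        c->mask |= 1ull << cpu;               /* add this CPU to the cluster */
}

int main(void)
{
        for (unsigned int cpu = 0; cpu < 4; cpu++) {
                if (!spare)
                        spare = calloc(1, sizeof(*spare)); /* "prepare" stage */
                cpu_online(cpu, cpu / 2);                  /* 2 CPUs per cluster */
                printf("cpu %u -> cluster %u mask %#llx\n", cpu,
                       cpu_cluster[cpu]->id,
                       (unsigned long long)cpu_cluster[cpu]->mask);
        }
        return 0;
}

The payoff is visible in the IPI path: collapsing a destination mask per cluster becomes a single pointer dereference instead of an iteration over all online CPUs.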
+ */ + if (cluster_hotplug_mask) { + if (cluster_hotplug_mask->node == node) + return 0; + kfree(cluster_hotplug_mask); } + cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask), + GFP_KERNEL, node); + if (!cluster_hotplug_mask) + return -ENOMEM; + cluster_hotplug_mask->node = node; return 0; } -static int x2apic_dead_cpu(unsigned int this_cpu) +static int x2apic_prepare_cpu(unsigned int cpu) { - int cpu; + if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0) + return -ENOMEM; + if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) + return -ENOMEM; + return 0; +} - for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); - } - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - free_cpumask_var(per_cpu(ipi_mask, this_cpu)); +static int x2apic_dead_cpu(unsigned int dead_cpu) +{ + struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); + + cpumask_clear_cpu(smp_processor_id(), &cmsk->mask); + free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } static int x2apic_cluster_probe(void) { - int cpu = smp_processor_id(); - int ret; - if (!x2apic_mode) return 0; - ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", - x2apic_prepare_cpu, x2apic_dead_cpu); - if (ret < 0) { + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", + x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); return 0; } - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); + init_x2apic_ldr(); return 1; } @@ -208,6 +204,8 @@ static const struct cpumask *x2apic_cluster_target_cpus(void) static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, const struct cpumask *mask) { + struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); + /* * To minimize vector pressure, default case of boot, device bringup * etc will use a single cpu for the interrupt destination. @@ -220,7 +218,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, if (mask == x2apic_cluster_target_cpus()) cpumask_copy(retmask, cpumask_of(cpu)); else - cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); + cpumask_and(retmask, mask, &cmsk->mask); } static struct apic apic_x2apic_cluster __ro_after_init = { From c1d1ee9ac1793d939ba1a1322767cc5f77a5b8fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:25 +0200 Subject: [PATCH 0107/1085] x86/apic: Get rid of apic->target_cpus The target_cpus() callback of the apic struct is not really useful. Some APICs return cpu_online_mask and others cpus_all_mask. The latter is bogus as it does not take holes in the cpus_possible_mask into account. Replace it with cpus_online_mask which makes the most sense and remove the callback. The usage sites will be removed in a later step anyway, so get rid of it now to have incremental changes. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.070850916@linutronix.de --- arch/x86/include/asm/apic.h | 3 --- arch/x86/kernel/apic/apic_common.c | 14 -------------- arch/x86/kernel/apic/apic_flat_64.c | 2 -- arch/x86/kernel/apic/apic_noop.c | 7 ------- arch/x86/kernel/apic/apic_numachip.c | 2 -- arch/x86/kernel/apic/bigsmp_32.c | 1 - arch/x86/kernel/apic/io_apic.c | 7 +++---- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/apic/x2apic_cluster.c | 8 +------- arch/x86/kernel/apic/x2apic_phys.c | 1 - arch/x86/kernel/apic/x2apic_uv_x.c | 1 - arch/x86/xen/apic.c | 1 - 13 files changed, 5 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index e3e0883fa96f..ff0bddabaa04 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -298,7 +298,6 @@ struct apic { u32 irq_dest_mode; /* Functions and data related to vector allocation */ - const struct cpumask *(*target_cpus)(void); void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, const struct cpumask *mask); int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, @@ -484,8 +483,6 @@ static inline unsigned int read_apic_id(void) return apic->get_apic_id(reg); } -extern const struct cpumask *default_target_cpus(void); -extern const struct cpumask *online_target_cpus(void); extern int default_apic_id_valid(int apicid); extern int default_acpi_madt_oem_check(char *, char *); extern void default_setup_apic_routing(void); diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 43f9eac53437..4791654cdeb2 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -78,20 +78,6 @@ int default_check_phys_apicid_present(int phys_apicid) return physid_isset(phys_apicid, phys_cpu_present_map); } -const struct cpumask *default_target_cpus(void) -{ -#ifdef CONFIG_SMP - return cpu_online_mask; -#else - return cpumask_of(0); -#endif -} - -const struct cpumask *online_target_cpus(void) -{ - return cpu_online_mask; -} - int default_apic_id_valid(int apicid) { return (apicid < 255); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 6543648f0b81..7ca354dee8af 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -154,7 +154,6 @@ static struct apic apic_flat __ro_after_init = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, @@ -249,7 +248,6 @@ static struct apic apic_physflat __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index c2c6bac28e79..a8a7cb1347dc 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -83,12 +83,6 @@ static int noop_apic_id_registered(void) return physid_isset(0, phys_cpu_present_map); } -static const struct cpumask *noop_target_cpus(void) -{ - /* only BSP here */ - return cpumask_of(0); -} - static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, const struct cpumask *mask) { @@ -127,7 +121,6 @@ struct apic apic_noop __ro_after_init = { /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 
1, - .target_cpus = noop_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index d77c8cc4afc2..cc2f8843391f 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -249,7 +249,6 @@ static const struct apic apic_numachip1 __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, @@ -300,7 +299,6 @@ static const struct apic apic_numachip2 __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 6eb5f10f1599..72a1a0385549 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -154,7 +154,6 @@ static struct apic apic_bigsmp __ro_after_init = { /* phys delivery to target CPU: */ .irq_dest_mode = 0, - .target_cpus = default_target_cpus, .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 11702d92407d..d8c75d61f766 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2553,9 +2553,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) } /* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be apic->target_cpus() + * This function updates target affinity of IOAPIC interrupts to include + * the CPUs which came online during SMP bringup. 
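The affinity fallback that replaces apic->target_cpus() in the hunk below is simply: keep an affinity that was explicitly set during early boot, otherwise fall back to the default. In sketch form (integer masks and hypothetical helper names, for illustration only):

#include <stdio.h>
#include <stdint.h>

static uint64_t irq_default_affinity = 0x1;     /* e.g. the boot CPU */

/* was_set plays the role of irqd_affinity_was_set(); both names here
   are stand-ins for the example. */
static uint64_t effective_affinity(int was_set, uint64_t configured)
{
        return was_set ? configured : irq_default_affinity;
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)effective_affinity(1, 0x4));
        printf("%#llx\n", (unsigned long long)effective_affinity(0, 0x4));
        return 0;
}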
*/ #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) @@ -2588,7 +2587,7 @@ void __init setup_ioapic_dest(void) if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) mask = irq_data_get_affinity_mask(idata); else - mask = apic->target_cpus(); + mask = irq_default_affinity; chip = irq_data_get_irq_chip(idata); /* Might be lapic_chip for irq 0 */ diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 12d171204c8a..95125bfb4e09 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -109,7 +109,6 @@ static struct apic apic_default __ro_after_init = { /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, - .target_cpus = default_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 88c214e75a6b..b6b963e42028 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -255,7 +255,7 @@ static int assign_irq_vector_policy(int irq, int node, if (node != NUMA_NO_NODE && assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0) return 0; - return assign_irq_vector(irq, data, apic->target_cpus(), irqdata); + return assign_irq_vector(irq, data, cpu_online_mask, irqdata); } static void clear_irq_vector(int irq, struct apic_chip_data *data) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 729c0a512b72..c1684f27226e 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -193,11 +193,6 @@ static int x2apic_cluster_probe(void) return 1; } -static const struct cpumask *x2apic_cluster_target_cpus(void) -{ - return cpu_all_mask; -} - /* * Each x2apic cluster is an allocation domain. */ @@ -215,7 +210,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, * derived from the first cpu in the mask) members specified * in the mask. 
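The decision this comment describes condenses to two cases: a request for "all online CPUs" (the default at boot and device bringup) is narrowed to a single CPU to conserve vectors, while an explicit affinity mask is intersected with the CPU's cluster. A user-space condensation of that logic (integer masks standing in for cpumasks, illustrative only):

#include <stdio.h>
#include <stdint.h>

static uint64_t alloc_domain(int cpu, uint64_t requested,
                             uint64_t online, uint64_t cluster)
{
        if (requested == online)        /* default boot/bringup request */
                return 1ull << cpu;     /* minimize vector pressure */
        return requested & cluster;     /* honor an explicit affinity */
}

int main(void)
{
        uint64_t online = 0xFF, cluster = 0x0F;

        printf("%#llx\n", (unsigned long long)alloc_domain(2, online, online, cluster));
        printf("%#llx\n", (unsigned long long)alloc_domain(2, 0x3C, online, cluster));
        return 0;
}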
*/ - if (mask == x2apic_cluster_target_cpus()) + if (cpumask_equal(mask, cpu_online_mask)) cpumask_copy(retmask, cpumask_of(cpu)); else cpumask_and(retmask, mask, &cmsk->mask); @@ -232,7 +227,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = x2apic_cluster_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 315c44eda399..6903e69a7b60 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -147,7 +147,6 @@ static struct apic apic_x2apic_phys __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1b23b9e0a8af..9f6d551deeb4 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -584,7 +584,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* Physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 652d62458d8d..58776bcf4251 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -160,7 +160,6 @@ static struct apic xen_pv_apic = { /* .irq_delivery_mode - used in native_compose_msi_msg only */ /* .irq_dest_mode - used in native_compose_msi_msg only */ - .target_cpus = default_target_cpus, .disable_esr = 0, /* .dest_logical - default_send_IPI_ use it but we use our own. */ .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ From 7854f82293e99f6bb3df793a2f579db4670ba71b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:26 +0200 Subject: [PATCH 0108/1085] x86/vector: Rename used_vectors to system_vectors used_vectors is a misnomer as it only has the system vectors which are excluded from the regular vector allocation marked. It's not what the name suggests: storage for the actually used vectors. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.150209009@linutronix.de --- arch/x86/include/asm/desc.h | 2 +- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/idt.c | 12 ++++++------ arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/traps.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 9d0e13738ed3..c474bf4971d9 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -392,7 +392,7 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) void update_intr_gate(unsigned int n, const void *addr); void alloc_intr_gate(unsigned int n, const void *addr); -extern unsigned long used_vectors[]; +extern unsigned long system_vectors[]; #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b6b963e42028..67d20ee60e33 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -175,7 +175,7 @@ next: if (unlikely(current_vector == vector)) goto next_cpu; - if (test_bit(vector, used_vectors)) + if (test_bit(vector, system_vectors)) goto next; for_each_cpu(new_cpu, vector_searchmask) { diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 6107ee1cb8d5..723fa9782186 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -225,7 +225,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy idt_init_desc(&desc, t); write_idt_entry(idt, t->vector, &desc); if (sys) - set_bit(t->vector, used_vectors); + set_bit(t->vector, system_vectors); } } @@ -313,14 +313,14 @@ void __init idt_setup_apic_and_irq_gates(void) idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); - for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) { entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); set_intr_gate(i, entry); } - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { + for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { #ifdef CONFIG_X86_LOCAL_APIC - set_bit(i, used_vectors); + set_bit(i, system_vectors); set_intr_gate(i, spurious_interrupt); #else entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); @@ -358,7 +358,7 @@ void idt_invalidate(void *addr) void __init update_intr_gate(unsigned int n, const void *addr) { - if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + if (WARN_ON_ONCE(!test_bit(n, system_vectors))) return; set_intr_gate(n, addr); } @@ -366,6 +366,6 @@ void __init update_intr_gate(unsigned int n, const void *addr) void alloc_intr_gate(unsigned int n, const void *addr) { BUG_ON(n < FIRST_SYSTEM_VECTOR); - if (!test_and_set_bit(n, used_vectors)) + if (!test_and_set_bit(n, system_vectors)) set_intr_gate(n, addr); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 52089c043160..188990c3a514 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -134,7 +134,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) - if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) { + if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { seq_printf(p, "%*s: ", prec, "HYP"); for_each_online_cpu(j) seq_printf(p, "%10u ", @@ -416,7 +416,7 @@ int check_irq_vectors_for_cpu_disable(void) */ for (vector = FIRST_EXTERNAL_VECTOR; vector < 
FIRST_SYSTEM_VECTOR; vector++) { - if (!test_bit(vector, used_vectors) && + if (!test_bit(vector, system_vectors) && IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { if (++count == this_count) return 0; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 34ea3651362e..321240f712e1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -71,7 +71,7 @@ #include #endif -DECLARE_BITMAP(used_vectors, NR_VECTORS); +DECLARE_BITMAP(system_vectors, NR_VECTORS); static inline void cond_local_irq_enable(struct pt_regs *regs) { From fdba46ffb4c203b6e6794163493fd310f98bb4be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:27 +0200 Subject: [PATCH 0109/1085] x86/apic: Get rid of multi CPU affinity Setting the interrupt affinity of a single interrupt to multiple CPUs has a dubious value. 1) This only works on machines where the APIC uses logical destination mode. If the APIC uses physical destination mode then it is already restricted to a single CPU 2) Experiments have shown, that the benefit of multi CPU affinity is close to zero and in some test even worse than setting the affinity to a single CPU. The reason for this is that the delivery targets the APIC with the lowest ID first and only if that APIC is busy (servicing an interrupt, i.e. ISR is not empty) it hands it over to the next APIC. In the conducted tests the vast majority of interrupts ends up on the APIC with the lowest ID anyway, so there is no natural spreading of the interrupts possible. Supporting multi CPU affinities adds a lot of complexity to the code, which can turn the allocation search into a worst case of nr_vectors * nr_online_cpus * nr_bits_in_target_mask As a first step disable it by restricting the vector search to a single CPU. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.228824430@linutronix.de --- arch/x86/kernel/apic/vector.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 67d20ee60e33..93edc2236282 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -136,8 +136,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, while (cpu < nr_cpu_ids) { int new_cpu, offset; - /* Get the possible target cpus for @mask/@cpu from the apic */ - apic->vector_allocation_domain(cpu, vector_cpumask, mask); + cpumask_copy(vector_cpumask, cpumask_of(cpu)); /* * Clear the offline cpus from @vector_cpumask for searching @@ -367,17 +366,11 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; + irqd_set_single_target(irq_data); err = assign_irq_vector_policy(virq + i, node, data, info, irq_data); if (err) goto error; - /* - * If the apic destination mode is physical, then the - * effective affinity is restricted to a single target - * CPU. Mark the interrupt accordingly. 
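With the search domain fixed to cpumask_of(cpu), the allocator's inner problem degenerates to scanning a single CPU's vector table for a free slot, which is what removes the nr_vectors * nr_online_cpus * nr_bits_in_target_mask worst case cited above. A skeletal model of the reduced search (user-space, constants and names assumed for illustration):

#include <stdio.h>

#define NR_VECTORS 256
#define FIRST_EXTERNAL_VECTOR 32

/* One CPU's vector table; nonzero means the vector is taken. With the
   domain fixed to a single CPU this linear scan is the whole search. */
static int find_free_vector(const int used[NR_VECTORS])
{
        for (int v = FIRST_EXTERNAL_VECTOR; v < NR_VECTORS; v++)
                if (!used[v])
                        return v;
        return -1;
}

int main(void)
{
        int used[NR_VECTORS] = { 0 };

        used[32] = used[33] = 1;
        printf("allocated vector %d\n", find_free_vector(used));
        return 0;
}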
- */ - if (!apic->irq_dest_mode) - irqd_set_single_target(irq_data); } return 0; @@ -434,7 +427,7 @@ static void __init init_legacy_irqs(void) BUG_ON(!data); data->cfg.vector = ISA_IRQ_VECTOR(i); - cpumask_setall(data->domain); + cpumask_copy(data->domain, cpumask_of(0)); irq_set_chip_data(i, data); } } From ef9e56d894eab99a33a06b96ba8057afa67d3702 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:28 +0200 Subject: [PATCH 0110/1085] x86/ioapic: Remove obsolete post hotplug update With single CPU affinities the post SMP boot vector update is pointless as it will just leave the affinities on the same vectors and the same CPUs. Remove it. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.308697243@linutronix.de --- arch/x86/include/asm/io_apic.h | 2 -- arch/x86/kernel/apic/io_apic.c | 42 ---------------------------------- arch/x86/kernel/smpboot.c | 1 - 3 files changed, 45 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 6cbf2cfb3f8a..731c686de37c 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -192,7 +192,6 @@ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); -extern void setup_ioapic_dest(void); extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin); extern void print_IO_APICs(void); #else /* !CONFIG_X86_IO_APIC */ @@ -232,7 +231,6 @@ static inline void io_apic_init_mappings(void) { } static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } -static inline void setup_ioapic_dest(void) { } #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d8c75d61f766..81f35ae3f884 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2556,48 +2556,6 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) * This function updates target affinity of IOAPIC interrupts to include * the CPUs which came online during SMP bringup. 
*/ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - const struct cpumask *mask; - struct irq_desc *desc; - struct irq_data *idata; - struct irq_chip *chip; - - if (skip_ioapic_setup == 1) - return; - - for_each_ioapic_pin(ioapic, pin) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - - irq = pin_2_irq(irq_entry, ioapic, pin, 0); - if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq)) - continue; - - desc = irq_to_desc(irq); - raw_spin_lock_irq(&desc->lock); - idata = irq_desc_get_irq_data(desc); - - /* - * Honour affinities which have been set in early boot - */ - if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) - mask = irq_data_get_affinity_mask(idata); - else - mask = irq_default_affinity; - - chip = irq_data_get_irq_chip(idata); - /* Might be lapic_chip for irq 0 */ - if (chip->irq_set_affinity) - chip->irq_set_affinity(idata, mask, false); - raw_spin_unlock_irq(&desc->lock); - } -} -#endif - #define IOAPIC_RESOURCE_NAME_SIZE 11 static struct resource *ioapic_resources; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 81652e3b8c17..d8cef3222887 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1360,7 +1360,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) nmi_selftest(); impress_friends(); - setup_ioapic_dest(); mtrr_aps_init(); } From f0cc6ccaf7ba42a1247fe5a9244b6009a3beddd5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:29 +0200 Subject: [PATCH 0111/1085] x86/vector: Simplify the CPU hotplug vector update With single CPU affinities it's no longer required to scan all interrupts for potential destination masks which contain the newly booting CPU. Reduce it to install the active legacy PIC vectors on the newly booting CPU as those cannot be affinity controlled by the kernel and can potentially end up at any CPU in the system. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.388040204@linutronix.de --- arch/x86/kernel/apic/vector.c | 64 +++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 93edc2236282..b45020364cc0 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -459,33 +459,32 @@ int __init arch_early_irq_init(void) return arch_early_ioapic_init(); } -/* Initialize vector_irq on a new cpu */ -static void __setup_vector_irq(int cpu) +/* Temporary hack to keep things working */ +static void vector_update_shutdown_irqs(void) { - struct apic_chip_data *data; struct irq_desc *desc; - int irq, vector; + int irq; - /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { - struct irq_data *idata = irq_desc_get_irq_data(desc); + struct irq_data *irqd = irq_desc_get_irq_data(desc); + struct apic_chip_data *ad = apic_chip_data(irqd); - data = apic_chip_data(idata); - if (!data || !cpumask_test_cpu(cpu, data->domain)) - continue; - vector = data->cfg.vector; - per_cpu(vector_irq, cpu)[vector] = desc; + if (ad && cpumask_test_cpu(cpu, ad->domain) && ad->cfg.vector) + this_cpu_write(vector_irq[ad->cfg.vector], desc); } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - desc = per_cpu(vector_irq, cpu)[vector]; - if (IS_ERR_OR_NULL(desc)) - continue; +} - data = apic_chip_data(irq_desc_get_irq_data(desc)); - if (!cpumask_test_cpu(cpu, data->domain)) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - } +static struct irq_desc *__setup_vector_irq(int vector) +{ + int isairq = vector - ISA_IRQ_VECTOR(0); + + /* Check whether the irq is in the legacy space */ + if (isairq < 0 || isairq >= nr_legacy_irqs()) + return VECTOR_UNUSED; + /* Check whether the irq is handled by the IOAPIC */ + if (test_bit(isairq, &io_apic_irqs)) + return VECTOR_UNUSED; + return irq_to_desc(isairq); } /* @@ -493,20 +492,27 @@ static void __setup_vector_irq(int cpu) */ void setup_vector_irq(int cpu) { - int irq; + unsigned int vector; lockdep_assert_held(&vector_lock); /* - * On most of the platforms, legacy PIC delivers the interrupts on the - * boot cpu. But there are certain platforms where PIC interrupts are - * delivered to multiple cpu's. If the legacy IRQ is handled by the - * legacy PIC, for the new cpu that is coming online, setup the static - * legacy vector to irq mapping: + * The interrupt affinity logic never targets interrupts to offline + * CPUs. The exception are the legacy PIC interrupts. In general + * they are only targeted to CPU0, but depending on the platform + * they can be distributed to any online CPU in hardware. The + * kernel has no influence on that. So all active legacy vectors + * must be installed on all CPUs. All non legacy interrupts can be + * cleared. */ - for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq); + for (vector = 0; vector < NR_VECTORS; vector++) + this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); - __setup_vector_irq(cpu); + /* + * Until the rewrite of the managed interrupt management is in + * place it's necessary to walk the irq descriptors and check for + * interrupts which are targeted at this CPU. 
+ */ + vector_update_shutdown_irqs(); } static int apic_retrigger_irq(struct irq_data *irq_data) From 86ba65514f8730d58e2c11fb6e25caa537d6bc93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:30 +0200 Subject: [PATCH 0112/1085] x86/vector: Cleanup variable names The naming convention of variables with the types irq_data and apic_chip_data are inconsistent and confusing. Before reworking the whole vector management make them consistent so irq_data pointers are named 'irqd' and apic_chip_data are named 'apicd' all over the place. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.465731667@linutronix.de --- arch/x86/kernel/apic/vector.c | 228 +++++++++++++++++----------------- 1 file changed, 114 insertions(+), 114 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b45020364cc0..a7f7c3730a09 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -50,22 +50,22 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) +static struct apic_chip_data *apic_chip_data(struct irq_data *irqd) { - if (!irq_data) + if (!irqd) return NULL; - while (irq_data->parent_data) - irq_data = irq_data->parent_data; + while (irqd->parent_data) + irqd = irqd->parent_data; - return irq_data->chip_data; + return irqd->chip_data; } -struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +struct irq_cfg *irqd_cfg(struct irq_data *irqd) { - struct apic_chip_data *data = apic_chip_data(irq_data); + struct apic_chip_data *apicd = apic_chip_data(irqd); - return data ? &data->cfg : NULL; + return apicd ? &apicd->cfg : NULL; } EXPORT_SYMBOL_GPL(irqd_cfg); @@ -76,35 +76,35 @@ struct irq_cfg *irq_cfg(unsigned int irq) static struct apic_chip_data *alloc_apic_chip_data(int node) { - struct apic_chip_data *data; + struct apic_chip_data *apicd; - data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); - if (!data) + apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node); + if (!apicd) return NULL; - if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) + if (!zalloc_cpumask_var_node(&apicd->domain, GFP_KERNEL, node)) goto out_data; - if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) + if (!zalloc_cpumask_var_node(&apicd->old_domain, GFP_KERNEL, node)) goto out_domain; - return data; + return apicd; out_domain: - free_cpumask_var(data->domain); + free_cpumask_var(apicd->domain); out_data: - kfree(data); + kfree(apicd); return NULL; } -static void free_apic_chip_data(struct apic_chip_data *data) +static void free_apic_chip_data(struct apic_chip_data *apicd) { - if (data) { - free_cpumask_var(data->domain); - free_cpumask_var(data->old_domain); - kfree(data); + if (apicd) { + free_cpumask_var(apicd->domain); + free_cpumask_var(apicd->old_domain); + kfree(apicd); } } static int __assign_irq_vector(int irq, struct apic_chip_data *d, const struct cpumask *mask, - struct irq_data *irqdata) + struct irq_data *irqd) { /* * NOTE! The local APIC isn't very good at handling @@ -226,62 +226,62 @@ success: * cpus masked out. 
*/ cpumask_and(vector_searchmask, vector_searchmask, mask); - BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqdata, + BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqd, &d->cfg.dest_apicid)); return 0; } -static int assign_irq_vector(int irq, struct apic_chip_data *data, +static int assign_irq_vector(int irq, struct apic_chip_data *apicd, const struct cpumask *mask, - struct irq_data *irqdata) + struct irq_data *irqd) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, data, mask, irqdata); + err = __assign_irq_vector(irq, apicd, mask, irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } static int assign_irq_vector_policy(int irq, int node, - struct apic_chip_data *data, + struct apic_chip_data *apicd, struct irq_alloc_info *info, - struct irq_data *irqdata) + struct irq_data *irqd) { if (info && info->mask) - return assign_irq_vector(irq, data, info->mask, irqdata); + return assign_irq_vector(irq, apicd, info->mask, irqd); if (node != NUMA_NO_NODE && - assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0) + assign_irq_vector(irq, apicd, cpumask_of_node(node), irqd) == 0) return 0; - return assign_irq_vector(irq, data, cpu_online_mask, irqdata); + return assign_irq_vector(irq, apicd, cpu_online_mask, irqd); } -static void clear_irq_vector(int irq, struct apic_chip_data *data) +static void clear_irq_vector(int irq, struct apic_chip_data *apicd) { struct irq_desc *desc; int cpu, vector; - if (!data->cfg.vector) + if (!apicd->cfg.vector) return; - vector = data->cfg.vector; - for_each_cpu_and(cpu, data->domain, cpu_online_mask) + vector = apicd->cfg.vector; + for_each_cpu_and(cpu, apicd->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - data->cfg.vector = 0; - cpumask_clear(data->domain); + apicd->cfg.vector = 0; + cpumask_clear(apicd->domain); /* * If move is in progress or the old_domain mask is not empty, * i.e. the cleanup IPI has not been processed yet, we need to remove * the old references to desc from all cpus vector tables. 
*/ - if (!data->move_in_progress && cpumask_empty(data->old_domain)) + if (!apicd->move_in_progress && cpumask_empty(apicd->old_domain)) return; desc = irq_to_desc(irq); - for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { + for_each_cpu_and(cpu, apicd->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != desc) @@ -290,7 +290,7 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data) break; } } - data->move_in_progress = 0; + apicd->move_in_progress = 0; } void init_irq_alloc_info(struct irq_alloc_info *info, @@ -311,20 +311,20 @@ void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { - struct apic_chip_data *apic_data; - struct irq_data *irq_data; + struct apic_chip_data *apicd; + struct irq_data *irqd; unsigned long flags; int i; for (i = 0; i < nr_irqs; i++) { - irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); - if (irq_data && irq_data->chip_data) { + irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irqd && irqd->chip_data) { raw_spin_lock_irqsave(&vector_lock, flags); - clear_irq_vector(virq + i, irq_data->chip_data); - apic_data = irq_data->chip_data; - irq_domain_reset_irq_data(irq_data); + clear_irq_vector(virq + i, irqd->chip_data); + apicd = irqd->chip_data; + irq_domain_reset_irq_data(irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); - free_apic_chip_data(apic_data); + free_apic_chip_data(apicd); #ifdef CONFIG_X86_IO_APIC if (virq + i < nr_legacy_irqs()) legacy_irq_data[virq + i] = NULL; @@ -337,8 +337,8 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_alloc_info *info = arg; - struct apic_chip_data *data; - struct irq_data *irq_data; + struct apic_chip_data *apicd; + struct irq_data *irqd; int i, err, node; if (disable_apic) @@ -349,26 +349,26 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, return -ENOSYS; for (i = 0; i < nr_irqs; i++) { - irq_data = irq_domain_get_irq_data(domain, virq + i); - BUG_ON(!irq_data); - node = irq_data_get_node(irq_data); + irqd = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irqd); + node = irq_data_get_node(irqd); #ifdef CONFIG_X86_IO_APIC if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) - data = legacy_irq_data[virq + i]; + apicd = legacy_irq_data[virq + i]; else #endif - data = alloc_apic_chip_data(node); - if (!data) { + apicd = alloc_apic_chip_data(node); + if (!apicd) { err = -ENOMEM; goto error; } - irq_data->chip = &lapic_controller; - irq_data->chip_data = data; - irq_data->hwirq = virq + i; - irqd_set_single_target(irq_data); - err = assign_irq_vector_policy(virq + i, node, data, info, - irq_data); + irqd->chip = &lapic_controller; + irqd->chip_data = apicd; + irqd->hwirq = virq + i; + irqd_set_single_target(irqd); + err = assign_irq_vector_policy(virq + i, node, apicd, info, + irqd); if (err) goto error; } @@ -416,19 +416,19 @@ int __init arch_probe_nr_irqs(void) static void __init init_legacy_irqs(void) { int i, node = cpu_to_node(0); - struct apic_chip_data *data; + struct apic_chip_data *apicd; /* * For legacy IRQ's, start with assigning irq0 to irq15 to * ISA_IRQ_VECTOR(i) for all cpu's. 
*/ for (i = 0; i < nr_legacy_irqs(); i++) { - data = legacy_irq_data[i] = alloc_apic_chip_data(node); - BUG_ON(!data); + apicd = legacy_irq_data[i] = alloc_apic_chip_data(node); + BUG_ON(!apicd); - data->cfg.vector = ISA_IRQ_VECTOR(i); - cpumask_copy(data->domain, cpumask_of(0)); - irq_set_chip_data(i, data); + apicd->cfg.vector = ISA_IRQ_VECTOR(i); + cpumask_copy(apicd->domain, cpumask_of(0)); + irq_set_chip_data(i, apicd); } } #else @@ -515,32 +515,32 @@ void setup_vector_irq(int cpu) vector_update_shutdown_irqs(); } -static int apic_retrigger_irq(struct irq_data *irq_data) +static int apic_retrigger_irq(struct irq_data *irqd) { - struct apic_chip_data *data = apic_chip_data(irq_data); + struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned long flags; int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(data->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); + cpu = cpumask_first_and(apicd->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), apicd->cfg.vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; } -void apic_ack_edge(struct irq_data *data) +void apic_ack_edge(struct irq_data *irqd) { - irq_complete_move(irqd_cfg(data)); - irq_move_irq(data); + irq_complete_move(irqd_cfg(irqd)); + irq_move_irq(irqd); ack_APIC_irq(); } -static int apic_set_affinity(struct irq_data *irq_data, +static int apic_set_affinity(struct irq_data *irqd, const struct cpumask *dest, bool force) { - struct apic_chip_data *data = irq_data->chip_data; - int err, irq = irq_data->irq; + struct apic_chip_data *apicd = irqd->chip_data; + int err, irq = irqd->irq; if (!IS_ENABLED(CONFIG_SMP)) return -EPERM; @@ -548,7 +548,7 @@ static int apic_set_affinity(struct irq_data *irq_data, if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, data, dest, irq_data); + err = assign_irq_vector(irq, apicd, dest, irqd); return err ? 
err : IRQ_SET_MASK_OK; } @@ -560,23 +560,23 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -static void __send_cleanup_vector(struct apic_chip_data *data) +static void __send_cleanup_vector(struct apic_chip_data *apicd) { raw_spin_lock(&vector_lock); - cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - data->move_in_progress = 0; - if (!cpumask_empty(data->old_domain)) - apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR); + cpumask_and(apicd->old_domain, apicd->old_domain, cpu_online_mask); + apicd->move_in_progress = 0; + if (!cpumask_empty(apicd->old_domain)) + apic->send_IPI_mask(apicd->old_domain, IRQ_MOVE_CLEANUP_VECTOR); raw_spin_unlock(&vector_lock); } void send_cleanup_vector(struct irq_cfg *cfg) { - struct apic_chip_data *data; + struct apic_chip_data *apicd; - data = container_of(cfg, struct apic_chip_data, cfg); - if (data->move_in_progress) - __send_cleanup_vector(data); + apicd = container_of(cfg, struct apic_chip_data, cfg); + if (apicd->move_in_progress) + __send_cleanup_vector(apicd); } asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) @@ -590,7 +590,7 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - struct apic_chip_data *data; + struct apic_chip_data *apicd; struct irq_desc *desc; unsigned int irr; @@ -606,16 +606,16 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) goto retry; } - data = apic_chip_data(irq_desc_get_irq_data(desc)); - if (!data) + apicd = apic_chip_data(irq_desc_get_irq_data(desc)); + if (!apicd) goto unlock; /* * Nothing to cleanup if irq migration is in progress * or this cpu is not set in the cleanup mask. */ - if (data->move_in_progress || - !cpumask_test_cpu(me, data->old_domain)) + if (apicd->move_in_progress || + !cpumask_test_cpu(me, apicd->old_domain)) goto unlock; /* @@ -630,8 +630,8 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) * this cpu is part of the target mask. We better leave that * one alone. 
*/ - if (vector == data->cfg.vector && - cpumask_test_cpu(me, data->domain)) + if (vector == apicd->cfg.vector && + cpumask_test_cpu(me, apicd->domain)) goto unlock; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -647,7 +647,7 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) goto unlock; } __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); - cpumask_clear_cpu(me, data->old_domain); + cpumask_clear_cpu(me, apicd->old_domain); unlock: raw_spin_unlock(&desc->lock); } @@ -660,15 +660,15 @@ unlock: static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { unsigned me; - struct apic_chip_data *data; + struct apic_chip_data *apicd; - data = container_of(cfg, struct apic_chip_data, cfg); - if (likely(!data->move_in_progress)) + apicd = container_of(cfg, struct apic_chip_data, cfg); + if (likely(!apicd->move_in_progress)) return; me = smp_processor_id(); - if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) - __send_cleanup_vector(data); + if (vector == apicd->cfg.vector && cpumask_test_cpu(me, apicd->domain)) + __send_cleanup_vector(apicd); } void irq_complete_move(struct irq_cfg *cfg) @@ -681,8 +681,8 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { - struct irq_data *irqdata; - struct apic_chip_data *data; + struct irq_data *irqd; + struct apic_chip_data *apicd; struct irq_cfg *cfg; unsigned int cpu; @@ -695,13 +695,13 @@ void irq_force_complete_move(struct irq_desc *desc) * Check first that the chip_data is what we expect * (apic_chip_data) before touching it any further. */ - irqdata = irq_domain_get_irq_data(x86_vector_domain, + irqd = irq_domain_get_irq_data(x86_vector_domain, irq_desc_get_irq(desc)); - if (!irqdata) + if (!irqd) return; - data = apic_chip_data(irqdata); - cfg = data ? &data->cfg : NULL; + apicd = apic_chip_data(irqd); + cfg = apicd ? &apicd->cfg : NULL; if (!cfg) return; @@ -719,14 +719,14 @@ void irq_force_complete_move(struct irq_desc *desc) * Clean out all offline cpus (including the outgoing one) from the * old_domain mask. */ - cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); + cpumask_and(apicd->old_domain, apicd->old_domain, cpu_online_mask); /* * If move_in_progress is cleared and the old_domain mask is empty, * then there is nothing to cleanup. fixup_irqs() will take care of * the stale vectors on the outgoing cpu. */ - if (!data->move_in_progress && cpumask_empty(data->old_domain)) { + if (!apicd->move_in_progress && cpumask_empty(apicd->old_domain)) { raw_spin_unlock(&vector_lock); return; } @@ -739,7 +739,7 @@ void irq_force_complete_move(struct irq_desc *desc) * 2) The interrupt has fired on the new vector, but the cleanup IPIs * have not been processed yet. */ - if (data->move_in_progress) { + if (apicd->move_in_progress) { /* * In theory there is a race: * @@ -773,18 +773,18 @@ void irq_force_complete_move(struct irq_desc *desc) * area arises. */ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", - irqdata->irq, cfg->old_vector); + irqd->irq, cfg->old_vector); } /* * If old_domain is not empty, then other cpus still have the irq * descriptor set in their vector array. Clean it up. 
*/ - for_each_cpu(cpu, data->old_domain) + for_each_cpu(cpu, apicd->old_domain) per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED; /* Cleanup the left overs of the (half finished) move */ - cpumask_clear(data->old_domain); - data->move_in_progress = 0; + cpumask_clear(apicd->old_domain); + apicd->move_in_progress = 0; raw_spin_unlock(&vector_lock); } #endif From 029c6e1c9df776fe1b2ba756a28fb65e9f9e9f69 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:31 +0200 Subject: [PATCH 0113/1085] x86/vector: Store the single CPU targets in apic data Now that the interrupt affinities are targeted at single CPUs storing them in a cpumask is overkill. Store them in a dedicated variable. This does not yet remove the domain cpumasks because the current allocator relies on them. Preparatory change for the allocator rework. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.544867277@linutronix.de --- arch/x86/kernel/apic/vector.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a7f7c3730a09..7a9e0c6dd756 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -23,6 +23,8 @@ struct apic_chip_data { struct irq_cfg cfg; + unsigned int cpu; + unsigned int prev_cpu; cpumask_var_t domain; cpumask_var_t old_domain; u8 move_in_progress : 1; @@ -214,6 +216,7 @@ update: cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); d->move_in_progress = !cpumask_empty(d->old_domain); d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0; + d->prev_cpu = d->cpu; d->cfg.vector = vector; cpumask_copy(d->domain, vector_cpumask); success: @@ -228,6 +231,7 @@ success: cpumask_and(vector_searchmask, vector_searchmask, mask); BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqd, &d->cfg.dest_apicid)); + d->cpu = cpumask_first(vector_searchmask); return 0; } @@ -428,6 +432,7 @@ static void __init init_legacy_irqs(void) apicd->cfg.vector = ISA_IRQ_VECTOR(i); cpumask_copy(apicd->domain, cpumask_of(0)); + apicd->cpu = 0; irq_set_chip_data(i, apicd); } } From dccfe3147b42b78458ab8e4440822c805ee76d72 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:32 +0200 Subject: [PATCH 0114/1085] x86/vector: Simplify vector move cleanup The vector move cleanup needs to walk the vector space and do a lot of sanity checks to find a vector to cleanup. With single CPU affinities this can be simplified and made more robust by queueing the vector configuration which needs to be cleaned up in a hlist on the CPU which was the previous target. That removes all the race conditions because the cleanup either finds a valid list entry or not. The latter happens when the interrupt was torn down before the cleanup handler was able to run. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.622727892@linutronix.de --- arch/x86/kernel/apic/vector.c | 233 ++++++++++++---------------------- 1 file changed, 83 insertions(+), 150 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 7a9e0c6dd756..68f885913927 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -25,6 +25,7 @@ struct apic_chip_data { struct irq_cfg cfg; unsigned int cpu; unsigned int prev_cpu; + struct hlist_node clist; cpumask_var_t domain; cpumask_var_t old_domain; u8 move_in_progress : 1; @@ -38,6 +39,9 @@ static struct irq_chip lapic_controller; #ifdef CONFIG_X86_IO_APIC static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; #endif +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct hlist_head, cleanup_list); +#endif void lock_vector_lock(void) { @@ -87,6 +91,7 @@ static struct apic_chip_data *alloc_apic_chip_data(int node) goto out_data; if (!zalloc_cpumask_var_node(&apicd->old_domain, GFP_KERNEL, node)) goto out_domain; + INIT_HLIST_NODE(&apicd->clist); return apicd; out_domain: free_cpumask_var(apicd->domain); @@ -127,8 +132,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, * If there is still a move in progress or the previous move has not * been cleaned up completely, tell the caller to come back later. */ - if (d->move_in_progress || - cpumask_intersects(d->old_domain, cpu_online_mask)) + if (d->cfg.old_vector) return -EBUSY; /* Only try and allocate irqs on cpus that are present */ @@ -263,38 +267,22 @@ static int assign_irq_vector_policy(int irq, int node, static void clear_irq_vector(int irq, struct apic_chip_data *apicd) { - struct irq_desc *desc; - int cpu, vector; + unsigned int vector = apicd->cfg.vector; - if (!apicd->cfg.vector) + if (!vector) return; - vector = apicd->cfg.vector; - for_each_cpu_and(cpu, apicd->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - + per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED; apicd->cfg.vector = 0; - cpumask_clear(apicd->domain); - /* - * If move is in progress or the old_domain mask is not empty, - * i.e. the cleanup IPI has not been processed yet, we need to remove - * the old references to desc from all cpus vector tables. 
- */ - if (!apicd->move_in_progress && cpumask_empty(apicd->old_domain)) + /* Clean up move in progress */ + vector = apicd->cfg.old_vector; + if (!vector) return; - desc = irq_to_desc(irq); - for_each_cpu_and(cpu, apicd->old_domain, cpu_online_mask) { - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; - vector++) { - if (per_cpu(vector_irq, cpu)[vector] != desc) - continue; - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - break; - } - } + per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; apicd->move_in_progress = 0; + hlist_del_init(&apicd->clist); } void init_irq_alloc_info(struct irq_alloc_info *info, @@ -474,7 +462,7 @@ static void vector_update_shutdown_irqs(void) struct irq_data *irqd = irq_desc_get_irq_data(desc); struct apic_chip_data *ad = apic_chip_data(irqd); - if (ad && cpumask_test_cpu(cpu, ad->domain) && ad->cfg.vector) + if (ad && ad->cfg.vector && ad->cpu == smp_processor_id()) this_cpu_write(vector_irq[ad->cfg.vector], desc); } } @@ -524,11 +512,9 @@ static int apic_retrigger_irq(struct irq_data *irqd) { struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned long flags; - int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(apicd->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), apicd->cfg.vector); + apic->send_IPI(apicd->cpu, apicd->cfg.vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -565,13 +551,56 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP + +asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) +{ + struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); + struct apic_chip_data *apicd; + struct hlist_node *tmp; + + entering_ack_irq(); + /* Prevent vectors vanishing under us */ + raw_spin_lock(&vector_lock); + + hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { + unsigned int irr, vector = apicd->cfg.old_vector; + + /* + * Paranoia: Check if the vector that needs to be cleaned + * up is registered at the APICs IRR. If so, then this is + * not the best time to clean it up. Clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest + * priority external vector, so on return from this + * interrupt the device interrupt will happen first. 
+ */ + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1U << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + continue; + } + hlist_del_init(&apicd->clist); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + apicd->cfg.old_vector = 0; + } + + raw_spin_unlock(&vector_lock); + exiting_irq(); +} + static void __send_cleanup_vector(struct apic_chip_data *apicd) { + unsigned int cpu; + raw_spin_lock(&vector_lock); - cpumask_and(apicd->old_domain, apicd->old_domain, cpu_online_mask); apicd->move_in_progress = 0; - if (!cpumask_empty(apicd->old_domain)) - apic->send_IPI_mask(apicd->old_domain, IRQ_MOVE_CLEANUP_VECTOR); + cpu = apicd->prev_cpu; + if (cpu_online(cpu)) { + hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); + apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); + } else { + apicd->cfg.old_vector = 0; + } raw_spin_unlock(&vector_lock); } @@ -584,95 +613,15 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(apicd); } -asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) -{ - unsigned vector, me; - - entering_ack_irq(); - - /* Prevent vectors vanishing under us */ - raw_spin_lock(&vector_lock); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - struct apic_chip_data *apicd; - struct irq_desc *desc; - unsigned int irr; - - retry: - desc = __this_cpu_read(vector_irq[vector]); - if (IS_ERR_OR_NULL(desc)) - continue; - - if (!raw_spin_trylock(&desc->lock)) { - raw_spin_unlock(&vector_lock); - cpu_relax(); - raw_spin_lock(&vector_lock); - goto retry; - } - - apicd = apic_chip_data(irq_desc_get_irq_data(desc)); - if (!apicd) - goto unlock; - - /* - * Nothing to cleanup if irq migration is in progress - * or this cpu is not set in the cleanup mask. - */ - if (apicd->move_in_progress || - !cpumask_test_cpu(me, apicd->old_domain)) - goto unlock; - - /* - * We have two cases to handle here: - * 1) vector is unchanged but the target mask got reduced - * 2) vector and the target mask has changed - * - * #1 is obvious, but in #2 we have two vectors with the same - * irq descriptor: the old and the new vector. So we need to - * make sure that we only cleanup the old vector. The new - * vector has the current @vector number in the config and - * this cpu is part of the target mask. We better leave that - * one alone. - */ - if (vector == apicd->cfg.vector && - cpumask_test_cpu(me, apicd->domain)) - goto unlock; - - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - /* - * Check if the vector that needs to be cleanedup is - * registered at the cpu's IRR. If so, then this is not - * the best time to clean it up. Lets clean it up in the - * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR - * to myself. 
- */ - if (irr & (1 << (vector % 32))) { - apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); - goto unlock; - } - __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); - cpumask_clear_cpu(me, apicd->old_domain); -unlock: - raw_spin_unlock(&desc->lock); - } - - raw_spin_unlock(&vector_lock); - - exiting_irq(); -} - static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { - unsigned me; struct apic_chip_data *apicd; apicd = container_of(cfg, struct apic_chip_data, cfg); if (likely(!apicd->move_in_progress)) return; - me = smp_processor_id(); - if (vector == apicd->cfg.vector && cpumask_test_cpu(me, apicd->domain)) + if (vector == apicd->cfg.vector && apicd->cpu == smp_processor_id()) __send_cleanup_vector(apicd); } @@ -686,10 +635,9 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { - struct irq_data *irqd; struct apic_chip_data *apicd; - struct irq_cfg *cfg; - unsigned int cpu; + struct irq_data *irqd; + unsigned int vector; /* * The function is called for all descriptors regardless of which @@ -701,42 +649,30 @@ void irq_force_complete_move(struct irq_desc *desc) * (apic_chip_data) before touching it any further. */ irqd = irq_domain_get_irq_data(x86_vector_domain, - irq_desc_get_irq(desc)); + irq_desc_get_irq(desc)); if (!irqd) return; + raw_spin_lock(&vector_lock); apicd = apic_chip_data(irqd); - cfg = apicd ? &apicd->cfg : NULL; - - if (!cfg) - return; + if (!apicd) + goto unlock; /* - * This is tricky. If the cleanup of @data->old_domain has not been + * If old_vector is empty, no action required. + */ + vector = apicd->cfg.old_vector; + if (!vector) + goto unlock; + + /* + * This is tricky. If the cleanup of the old vector has not been * done yet, then the following setaffinity call will fail with * -EBUSY. This can leave the interrupt in a stale state. * * All CPUs are stuck in stop machine with interrupts disabled so * calling __irq_complete_move() would be completely pointless. - */ - raw_spin_lock(&vector_lock); - /* - * Clean out all offline cpus (including the outgoing one) from the - * old_domain mask. - */ - cpumask_and(apicd->old_domain, apicd->old_domain, cpu_online_mask); - - /* - * If move_in_progress is cleared and the old_domain mask is empty, - * then there is nothing to cleanup. fixup_irqs() will take care of - * the stale vectors on the outgoing cpu. - */ - if (!apicd->move_in_progress && cpumask_empty(apicd->old_domain)) { - raw_spin_unlock(&vector_lock); - return; - } - - /* + * * 1) The interrupt is in move_in_progress state. That means that we * have not seen an interrupt since the io_apic was reprogrammed to * the new vector. @@ -778,18 +714,15 @@ void irq_force_complete_move(struct irq_desc *desc) * area arises. */ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", - irqd->irq, cfg->old_vector); + irqd->irq, vector); } - /* - * If old_domain is not empty, then other cpus still have the irq - * descriptor set in their vector array. Clean it up. 
- */ - for_each_cpu(cpu, apicd->old_domain) per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED; - + per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; /* Cleanup the left overs of the (half finished) move */ cpumask_clear(apicd->old_domain); + apicd->cfg.old_vector = 0; apicd->move_in_progress = 0; + hlist_del_init(&apicd->clist); +unlock: raw_spin_unlock(&vector_lock); } #endif From 3534be05e4adc303d41fae65901598695adea685 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:33 +0200 Subject: [PATCH 0115/1085] x86/ioapic: Mark legacy vectors at reallocation time When the legacy PIC vectors are taken over by the IO APIC, the current vector assignment code is tricked into reusing the vector by allocating the apic data in the early boot process. This can be avoided by marking the allocation as a legacy PIC takeover. Preparatory patch for further cleanups. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.700501979@linutronix.de --- arch/x86/include/asm/irqdomain.h | 1 + arch/x86/kernel/apic/io_apic.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 1d9091ffa140..73e9c42ce63b 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -8,6 +8,7 @@ enum { /* Allocate contiguous CPU vectors */ X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, + X86_IRQ_ALLOC_LEGACY = 0x2, }; extern struct irq_domain *x86_vector_domain; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81f35ae3f884..a4b0c60ab8e1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1013,6 +1013,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return -ENOMEM; } else { + info->flags |= X86_IRQ_ALLOC_LEGACY; irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL); if (irq >= 0) { From 4ef76eb6de734dc03a7f3b8f80884362364e6049 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:34 +0200 Subject: [PATCH 0116/1085] x86/apic: Get rid of the legacy irq data storage Now that the legacy PIC takeover by the IOAPIC is marked accordingly, the early boot allocation of APIC data is no longer necessary. Use the regular allocation mechanism as it is used by non-legacy interrupts and fill in the known information (vector and affinity) so the allocator reuses the vector. This is important as the timer check might move timer interrupt 0 back to the PIC in case the delivery through the IOAPIC fails. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y.
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.780521549@linutronix.de --- arch/x86/kernel/apic/vector.c | 52 +++++++++-------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 68f885913927..d6feb9ca8f52 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -36,9 +36,6 @@ EXPORT_SYMBOL_GPL(x86_vector_domain); static DEFINE_RAW_SPINLOCK(vector_lock); static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask; static struct irq_chip lapic_controller; -#ifdef CONFIG_X86_IO_APIC -static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; -#endif #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct hlist_head, cleanup_list); #endif @@ -317,10 +314,6 @@ static void x86_vector_free_irqs(struct irq_domain *domain, irq_domain_reset_irq_data(irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); free_apic_chip_data(apicd); -#ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs()) - legacy_irq_data[virq + i] = NULL; -#endif } } } @@ -344,12 +337,8 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irqd = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irqd); node = irq_data_get_node(irqd); -#ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) - apicd = legacy_irq_data[virq + i]; - else -#endif - apicd = alloc_apic_chip_data(node); + WARN_ON_ONCE(irqd->chip_data); + apicd = alloc_apic_chip_data(node); if (!apicd) { err = -ENOMEM; goto error; @@ -359,6 +348,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irqd->chip_data = apicd; irqd->hwirq = virq + i; irqd_set_single_target(irqd); + /* + * Make sure, that the legacy to IOAPIC transition stays on + * the same vector. This is required for check_timer() to + * work correctly as it might switch back to legacy mode. + */ + if (info->flags & X86_IRQ_ALLOC_LEGACY) { + apicd->cfg.vector = ISA_IRQ_VECTOR(virq + i); + apicd->cpu = 0; + cpumask_copy(apicd->domain, cpumask_of(0)); + } + err = assign_irq_vector_policy(virq + i, node, apicd, info, irqd); if (err) @@ -404,36 +404,10 @@ int __init arch_probe_nr_irqs(void) return legacy_pic->probe(); } -#ifdef CONFIG_X86_IO_APIC -static void __init init_legacy_irqs(void) -{ - int i, node = cpu_to_node(0); - struct apic_chip_data *apicd; - - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * ISA_IRQ_VECTOR(i) for all cpu's. - */ - for (i = 0; i < nr_legacy_irqs(); i++) { - apicd = legacy_irq_data[i] = alloc_apic_chip_data(node); - BUG_ON(!apicd); - - apicd->cfg.vector = ISA_IRQ_VECTOR(i); - cpumask_copy(apicd->domain, cpumask_of(0)); - apicd->cpu = 0; - irq_set_chip_data(i, apicd); - } -} -#else -static inline void init_legacy_irqs(void) { } -#endif - int __init arch_early_irq_init(void) { struct fwnode_handle *fn; - init_legacy_irqs(); - fn = irq_domain_alloc_named_fwnode("VECTOR"); BUG_ON(!fn); x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, From 258d86eef94fcaa72e088962259490866ad93489 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:35 +0200 Subject: [PATCH 0117/1085] x86/vector: Remove pointless pointer checks The info pointer checks in assign_irq_vector_policy() are pointless because the pointer cannot be NULL, otherwise the calling code would already crash. 
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.859484148@linutronix.de --- arch/x86/kernel/apic/vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index d6feb9ca8f52..22cae8888e97 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -254,7 +254,7 @@ static int assign_irq_vector_policy(int irq, int node, struct irq_alloc_info *info, struct irq_data *irqd) { - if (info && info->mask) + if (info->mask) return assign_irq_vector(irq, apicd, info->mask, irqd); if (node != NUMA_NO_NODE && assign_irq_vector(irq, apicd, cpumask_of_node(node), irqd) == 0) From 99a1482d8aa105922dc4a3360ab11600f0bc9d80 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:36 +0200 Subject: [PATCH 0118/1085] x86/vector: Move helper functions around Move the helper functions to a different place as they would end up in the middle of management functions. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213154.949581934@linutronix.de --- arch/x86/kernel/apic/vector.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 22cae8888e97..5d5c2c064a3e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -53,6 +53,21 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + static struct apic_chip_data *apic_chip_data(struct irq_data *irqd) { if (!irqd) @@ -282,21 +297,6 @@ static void clear_irq_vector(int irq, struct apic_chip_data *apicd) hlist_del_init(&apicd->clist); } -void init_irq_alloc_info(struct irq_alloc_info *info, - const struct cpumask *mask) -{ - memset(info, 0, sizeof(*info)); - info->mask = mask; -} - -void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) -{ - if (src) - *dst = *src; - else - memset(dst, 0, sizeof(*dst)); -} - static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { From 9f9e3bb1cf2ecba7697bfb5e350ad2648e69dbdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:37 +0200 Subject: [PATCH 0119/1085] x86/apic: Add replacement for cpu_mask_to_apicid() As preparation for replacing the vector allocator, provide a new function which takes a cpu number instead of a cpu mask to calculate/lookup the resulting APIC destination id. 
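To make the new callback concrete, here is a minimal sketch of the two strategies wired up in the diff below (physical and flat logical destination mode), assuming a made-up fake_cpu_to_apicid table in place of the kernel's x86_cpu_to_apicid per-CPU variable:

#include <stdio.h>

/* Hypothetical physical APIC ids; stands in for x86_cpu_to_apicid. */
static const unsigned int fake_cpu_to_apicid[] = { 0, 2, 4, 6 };

/* Physical mode: the destination id is a plain table lookup,
 * as in apic_default_calc_apicid(). */
static unsigned int calc_apicid_phys(unsigned int cpu)
{
	return fake_cpu_to_apicid[cpu];
}

/* Flat logical mode: one bit per CPU, as in apic_flat_calc_apicid();
 * this is why flat mode is limited to the first 8 CPUs. */
static unsigned int calc_apicid_flat(unsigned int cpu)
{
	return 1U << cpu;
}

int main(void)
{
	for (unsigned int cpu = 0; cpu < 4; cpu++)
		printf("cpu%u: phys=0x%x flat=0x%x\n", cpu,
		       calc_apicid_phys(cpu), calc_apicid_flat(cpu));
	return 0;
}

Taking a CPU number instead of a cpumask removes the error path entirely: with single-CPU targets there is no mask to reduce and no failure case, which is what makes the cpu_mask_to_apicid() variants removable later in the series.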
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown --- arch/x86/include/asm/apic.h | 5 +++++ arch/x86/kernel/apic/apic_common.c | 10 ++++++++++ arch/x86/kernel/apic/apic_flat_64.c | 2 ++ arch/x86/kernel/apic/apic_noop.c | 1 + arch/x86/kernel/apic/apic_numachip.c | 2 ++ arch/x86/kernel/apic/bigsmp_32.c | 1 + arch/x86/kernel/apic/probe_32.c | 1 + arch/x86/kernel/apic/x2apic_cluster.c | 6 ++++++ arch/x86/kernel/apic/x2apic_phys.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 6 ++++++ arch/x86/xen/apic.c | 1 + 11 files changed, 36 insertions(+) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ff0bddabaa04..01bcaa8b62b3 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -303,6 +303,7 @@ struct apic { int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, struct irq_data *irqdata, unsigned int *apicid); + u32 (*calc_dest_apicid)(unsigned int cpu); /* ICR related functions */ u64 (*icr_read)(void); @@ -486,6 +487,10 @@ static inline unsigned int read_apic_id(void) extern int default_apic_id_valid(int apicid); extern int default_acpi_madt_oem_check(char *, char *); extern void default_setup_apic_routing(void); + +extern u32 apic_default_calc_apicid(unsigned int cpu); +extern u32 apic_flat_calc_apicid(unsigned int cpu); + extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask, struct irq_data *irqdata, unsigned int *apicid); diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 4791654cdeb2..ddc6a4301588 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -6,6 +6,11 @@ #include #include +u32 apic_default_calc_apicid(unsigned int cpu) +{ + return per_cpu(x86_cpu_to_apicid, cpu); +} + int default_cpu_mask_to_apicid(const struct cpumask *msk, struct irq_data *irqd, unsigned int *apicid) { @@ -18,6 +23,11 @@ int default_cpu_mask_to_apicid(const struct cpumask *msk, struct irq_data *irqd, return 0; } +u32 apic_flat_calc_apicid(unsigned int cpu) +{ + return 1U << cpu; +} + int flat_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqd, unsigned int *apicid) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7ca354dee8af..697704443fda 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -172,6 +172,7 @@ static struct apic apic_flat __ro_after_init = { .set_apic_id = set_apic_id, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, @@ -267,6 +268,7 @@ static struct apic apic_physflat __ro_after_init = { .set_apic_id = set_apic_id, .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index a8a7cb1347dc..d8c24e6f1a11 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -142,6 +142,7 @@ struct apic apic_noop __ro_after_init = { .set_apic_id = NULL, .cpu_mask_to_apicid = 
flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index cc2f8843391f..4ec293b30eb8 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -267,6 +267,7 @@ static const struct apic apic_numachip1 __refconst = { .set_apic_id = numachip1_set_apic_id, .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, @@ -317,6 +318,7 @@ static const struct apic apic_numachip2 __refconst = { .set_apic_id = numachip2_set_apic_id, .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 72a1a0385549..de2e8597f2df 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -172,6 +172,7 @@ static struct apic apic_bigsmp __ro_after_init = { .set_apic_id = NULL, .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 95125bfb4e09..6a9020a3c243 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -127,6 +127,7 @@ static struct apic apic_default __ro_after_init = { .set_apic_id = NULL, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index c1684f27226e..17bf63f580d7 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -114,6 +114,11 @@ x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, return 0; } +static u32 x2apic_calc_apicid(unsigned int cpu) +{ + return per_cpu(x86_cpu_to_logical_apicid, cpu); +} + static void init_x2apic_ldr(void) { struct cluster_mask *cmsk = this_cpu_read(cluster_masks); @@ -245,6 +250,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .set_apic_id = x2apic_set_apic_id, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .calc_dest_apicid = x2apic_calc_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6903e69a7b60..ebad7ddbfdfc 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -165,6 +165,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .set_apic_id = x2apic_set_apic_id, .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 9f6d551deeb4..99c3c039646d 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -537,6 +537,11 @@ uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, return ret; } +static u32 apic_uv_calc_apicid(unsigned int cpu) +{ + return 
apic_default_calc_apicid(cpu) | uv_apicid_hibits; +} + static unsigned int x2apic_get_apic_id(unsigned long x) { unsigned int id; @@ -602,6 +607,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .set_apic_id = set_apic_id, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .calc_dest_apicid = apic_uv_calc_apicid, .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 58776bcf4251..fb8522bed08c 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -178,6 +178,7 @@ static struct apic xen_pv_apic = { .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, #ifdef CONFIG_SMP .send_IPI_mask = xen_send_IPI_mask, From 0fa115da408f645cca419a60a5af8f4426ad4188 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:38 +0200 Subject: [PATCH 0120/1085] x86/irq/vector: Initialize matrix allocator Initialize the matrix allocator and add the proper accounting points to the code. No functional change, just preparation. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.108410660@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/include/asm/apic.h | 6 ++++ arch/x86/include/asm/hw_irq.h | 3 +- arch/x86/kernel/apic/vector.c | 56 ++++++++++++++++++++++++++++++++--- arch/x86/kernel/i8259.c | 1 + arch/x86/kernel/irqinit.c | 1 + arch/x86/kernel/smpboot.c | 3 +- 7 files changed, 65 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 971feac13506..64e99d3c5169 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -92,6 +92,7 @@ config X86 select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP + select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC select GENERIC_IRQ_MIGRATION if SMP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 01bcaa8b62b3..7a8651921ed5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -169,6 +169,10 @@ static inline int apic_is_clustered_box(void) #endif extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); +extern void lapic_assign_system_vectors(void); +extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); +extern void lapic_online(void); +extern void lapic_offline(void); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } @@ -179,6 +183,8 @@ static inline void disable_local_APIC(void) { } # define setup_secondary_APIC_clock x86_init_noop static inline void lapic_update_tsc_freq(void) { } static inline void apic_intr_mode_init(void) { } +static inline void lapic_assign_system_vectors(void) { } +static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 6dfe366a8804..386368890376 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -15,6 +15,8 @@ #include +#define IRQ_MATRIX_BITS 
NR_VECTORS + #ifndef __ASSEMBLY__ #include @@ -130,7 +132,6 @@ extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); -extern void setup_vector_irq(int cpu); #ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); extern void irq_complete_move(struct irq_cfg *cfg); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 5d5c2c064a3e..078fbd08499c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -36,6 +36,7 @@ EXPORT_SYMBOL_GPL(x86_vector_domain); static DEFINE_RAW_SPINLOCK(vector_lock); static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask; static struct irq_chip lapic_controller; +static struct irq_matrix *vector_matrix; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct hlist_head, cleanup_list); #endif @@ -404,6 +405,36 @@ int __init arch_probe_nr_irqs(void) return legacy_pic->probe(); } +void lapic_assign_legacy_vector(unsigned int irq, bool replace) +{ + /* + * Use assign system here so it wont get accounted as allocated + * and moveable in the cpu hotplug check and it prevents managed + * irq reservation from touching it. + */ + irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); +} + +void __init lapic_assign_system_vectors(void) +{ + unsigned int i, vector = 0; + + for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + irq_matrix_assign_system(vector_matrix, vector, false); + + if (nr_legacy_irqs() > 1) + lapic_assign_legacy_vector(PIC_CASCADE_IR, false); + + /* System vectors are reserved, online it */ + irq_matrix_online(vector_matrix); + + /* Mark the preallocated legacy interrupts */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); + } +} + int __init arch_early_irq_init(void) { struct fwnode_handle *fn; @@ -423,6 +454,14 @@ int __init arch_early_irq_init(void) BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL)); + /* + * Allocate the vector matrix allocator data structure and limit the + * search area. + */ + vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR, + FIRST_SYSTEM_VECTOR); + BUG_ON(!vector_matrix); + return arch_early_ioapic_init(); } @@ -454,14 +493,16 @@ static struct irq_desc *__setup_vector_irq(int vector) return irq_to_desc(isairq); } -/* - * Setup the vector to irq mappings. Must be called with vector_lock held. - */ -void setup_vector_irq(int cpu) +/* Online the local APIC infrastructure and initialize the vectors */ +void lapic_online(void) { unsigned int vector; lockdep_assert_held(&vector_lock); + + /* Online the vector matrix array for this CPU */ + irq_matrix_online(vector_matrix); + /* * The interrupt affinity logic never targets interrupts to offline * CPUs. The exception are the legacy PIC interrupts. 
In general @@ -482,6 +523,13 @@ void setup_vector_irq(int cpu) vector_update_shutdown_irqs(); } +void lapic_offline(void) +{ + lock_vector_lock(); + irq_matrix_offline(vector_matrix); + unlock_vector_lock(); +} + static int apic_retrigger_irq(struct irq_data *irqd) { struct apic_chip_data *apicd = apic_chip_data(irqd); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 4e3b8a587c88..317c5b38a318 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -113,6 +113,7 @@ static void make_8259A_irq(unsigned int irq) io_apic_irqs &= ~(1 << irq); Date: Wed, 13 Sep 2017 23:29:39 +0200 Subject: [PATCH 0121/1085] x86/vector: Add vector domain debugfs support Add the debug callback for the vector domain, which gives detailed information about vector usage when invoked for the domain, using the matrix allocator debug function, and vector/target information when invoked for a particular interrupt. Extra information for the Vector domain:
    Online bitmaps:        32
    Global available:    6352
    Global reserved:        5
    Total allocated:       20
    System: 41: 0-19,32,50,128,238-255
    | CPU | avl | man | act | vectors
        0   183     4    19  33-48,51-53
        1   199     4     1  33
        2   199     4     0
Extra information for interrupts:
    Vector:    42
    Target:     4
This allows a detailed analysis of the vector usage and the association to interrupts and devices. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.188137174@linutronix.de --- arch/x86/kernel/apic/vector.c | 50 +++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 078fbd08499c..acdc74df649d 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -11,6 +11,7 @@ * published by the Free Software Foundation.
*/ #include +#include #include #include #include @@ -373,9 +374,54 @@ error: return err; } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + unsigned int cpu, vec, prev_cpu, prev_vec; + struct apic_chip_data *apicd; + unsigned long flags; + int irq; + + if (!irqd) { + irq_matrix_debug_show(m, vector_matrix, ind); + return; + } + + irq = irqd->irq; + if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) { + seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq)); + seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, ""); + return; + } + + apicd = irqd->chip_data; + if (!apicd) { + seq_printf(m, "%*sVector: Not assigned\n", ind, ""); + return; + } + + raw_spin_lock_irqsave(&vector_lock, flags); + cpu = apicd->cpu; + vec = apicd->cfg.vector; + prev_cpu = apicd->prev_cpu; + prev_vec = apicd->cfg.old_vector; + raw_spin_unlock_irqrestore(&vector_lock, flags); + seq_printf(m, "%*sVector: %5u\n", ind, "", vec); + seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu); + if (prev_vec) { + seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vec); + seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu); + } +} +#endif + static const struct irq_domain_ops x86_vector_domain_ops = { - .alloc = x86_vector_alloc_irqs, - .free = x86_vector_free_irqs, + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = x86_vector_debug_show, +#endif }; int __init arch_probe_nr_irqs(void) From 8ed4f3e66665cd186bc6b1d35f25a481e35c62ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:40 +0200 Subject: [PATCH 0122/1085] x86/smpboot: Set online before setting up vectors There is no reason to set the CPU online after establishing the vectors on the upcoming CPU. The vector space is protected by the vector lock so no changes can happen. Marking the CPU online before setting up the vector space makes tracing work in the early vector management cpu online code. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.264311994@linutronix.de --- arch/x86/kernel/smpboot.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 91c0d1cd651e..86739f04701b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -254,14 +254,14 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* - * Lock vector_lock and initialize the vectors on this cpu - * before setting the cpu online. We must set it online with - * vector_lock held to prevent a concurrent setup/teardown - * from seeing a half valid vector space. + * Lock vector_lock, set CPU online and bring the vector + * allocator online. Online must be set with vector_lock held + * to prevent a concurrent irq setup/teardown from seeing a + * half valid vector space. 
*/ lock_vector_lock(); - lapic_online(); set_cpu_online(smp_processor_id(), true); + lapic_online(); unlock_vector_lock(); cpu_set_state_online(smp_processor_id()); x86_platform.nmi_init(); From 8d1e3dca7de6e8513872799a748a1d47d8dce60d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:41 +0200 Subject: [PATCH 0123/1085] x86/vector: Add tracepoints for vector management Add tracepoints for analysing the new vector management Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.357986795@linutronix.de --- arch/x86/include/asm/trace/irq_vectors.h | 244 +++++++++++++++++++++++ arch/x86/kernel/apic/vector.c | 2 + 2 files changed, 246 insertions(+) diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 1599d394c8c1..bc09c5cf6390 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -137,6 +137,250 @@ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); DEFINE_IRQ_VECTOR_EVENT(thermal_apic); #endif +TRACE_EVENT(vector_config, + + TP_PROTO(unsigned int irq, unsigned int vector, + unsigned int cpu, unsigned int apicdest), + + TP_ARGS(irq, vector, cpu, apicdest), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( unsigned int, cpu ) + __field( unsigned int, apicdest ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->vector = vector; + __entry->cpu = cpu; + __entry->apicdest = apicdest; + ), + + TP_printk("irq=%u vector=%u cpu=%u apicdest=0x%08x", + __entry->irq, __entry->vector, __entry->cpu, + __entry->apicdest) +); + +DECLARE_EVENT_CLASS(vector_mod, + + TP_PROTO(unsigned int irq, unsigned int vector, + unsigned int cpu, unsigned int prev_vector, + unsigned int prev_cpu), + + TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( unsigned int, cpu ) + __field( unsigned int, prev_vector ) + __field( unsigned int, prev_cpu ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->vector = vector; + __entry->cpu = cpu; + __entry->prev_vector = prev_vector; + __entry->prev_cpu = prev_cpu; + + ), + + TP_printk("irq=%u vector=%u cpu=%u prev_vector=%u prev_cpu=%u", + __entry->irq, __entry->vector, __entry->cpu, + __entry->prev_vector, __entry->prev_cpu) +); + +#define DEFINE_IRQ_VECTOR_MOD_EVENT(name) \ +DEFINE_EVENT_FN(vector_mod, name, \ + TP_PROTO(unsigned int irq, unsigned int vector, \ + unsigned int cpu, unsigned int prev_vector, \ + unsigned int prev_cpu), \ + TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu), NULL, NULL); \ + +DEFINE_IRQ_VECTOR_MOD_EVENT(vector_update); +DEFINE_IRQ_VECTOR_MOD_EVENT(vector_clear); + +DECLARE_EVENT_CLASS(vector_reserve, + + TP_PROTO(unsigned int irq, int ret), + + TP_ARGS(irq, ret), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ), + + TP_printk("irq=%u ret=%d", __entry->irq, __entry->ret) +); + +#define DEFINE_IRQ_VECTOR_RESERVE_EVENT(name) \ +DEFINE_EVENT_FN(vector_reserve, name, \ + TP_PROTO(unsigned int irq, int ret), \ + 
TP_ARGS(irq, ret), NULL, NULL);	\
+
+DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve_managed);
+DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve);
+
+TRACE_EVENT(vector_alloc,
+
+	TP_PROTO(unsigned int irq, unsigned int vector, bool reserved,
+		 int ret),
+
+	TP_ARGS(irq, vector, reserved, ret),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	irq		)
+		__field(	unsigned int,	vector		)
+		__field(	bool,		reserved	)
+		__field(	int,		ret		)
+	),
+
+	TP_fast_assign(
+		__entry->irq		= irq;
+		__entry->vector		= ret < 0 ? 0 : vector;
+		__entry->reserved	= reserved;
+		__entry->ret		= ret > 0 ? 0 : ret;
+	),
+
+	TP_printk("irq=%u vector=%u reserved=%d ret=%d",
+		  __entry->irq, __entry->vector,
+		  __entry->reserved, __entry->ret)
+);
+
+TRACE_EVENT(vector_alloc_managed,
+
+	TP_PROTO(unsigned int irq, unsigned int vector,
+		 int ret),
+
+	TP_ARGS(irq, vector, ret),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	irq		)
+		__field(	unsigned int,	vector		)
+		__field(	int,		ret		)
+	),
+
+	TP_fast_assign(
+		__entry->irq		= irq;
+		__entry->vector		= ret < 0 ? 0 : vector;
+		__entry->ret		= ret > 0 ? 0 : ret;
+	),
+
+	TP_printk("irq=%u vector=%u ret=%d",
+		  __entry->irq, __entry->vector, __entry->ret)
+);
+
+DECLARE_EVENT_CLASS(vector_activate,
+
+	TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve,
+		 bool early),
+
+	TP_ARGS(irq, is_managed, can_reserve, early),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	irq		)
+		__field(	bool,		is_managed	)
+		__field(	bool,		can_reserve	)
+		__field(	bool,		early		)
+	),
+
+	TP_fast_assign(
+		__entry->irq		= irq;
+		__entry->is_managed	= is_managed;
+		__entry->can_reserve	= can_reserve;
+		__entry->early		= early;
+	),
+
+	TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d",
+		  __entry->irq, __entry->is_managed, __entry->can_reserve,
+		  __entry->early)
+);
+
+#define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name)				\
+DEFINE_EVENT_FN(vector_activate, name,					\
+	TP_PROTO(unsigned int irq, bool is_managed,			\
+		 bool can_reserve, bool early),				\
+	TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL);	\
+
+DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);
+DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate);
+
+TRACE_EVENT(vector_teardown,
+
+	TP_PROTO(unsigned int irq, bool is_managed, bool has_reserved),
+
+	TP_ARGS(irq, is_managed, has_reserved),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	irq		)
+		__field(	bool,		is_managed	)
+		__field(	bool,		has_reserved	)
+	),
+
+	TP_fast_assign(
+		__entry->irq		= irq;
+		__entry->is_managed	= is_managed;
+		__entry->has_reserved	= has_reserved;
+	),
+
+	TP_printk("irq=%u is_managed=%d has_reserved=%d",
+		  __entry->irq, __entry->is_managed, __entry->has_reserved)
+);
+
+TRACE_EVENT(vector_setup,
+
+	TP_PROTO(unsigned int irq, bool is_legacy, int ret),
+
+	TP_ARGS(irq, is_legacy, ret),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	irq		)
+		__field(	bool,		is_legacy	)
+		__field(	int,		ret		)
+	),
+
+	TP_fast_assign(
+		__entry->irq		= irq;
+		__entry->is_legacy	= is_legacy;
+		__entry->ret		= ret;
+	),
+
+	TP_printk("irq=%u is_legacy=%d ret=%d",
+		  __entry->irq, __entry->is_legacy, __entry->ret)
+);
+
+TRACE_EVENT(vector_free_moved,
+
+	TP_PROTO(unsigned int irq, unsigned int vector, bool is_managed),
+
+	TP_ARGS(irq, vector, is_managed),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	irq		)
+		__field(	unsigned int,	vector		)
+		__field(	bool,		is_managed	)
+	),
+
+	TP_fast_assign(
+		__entry->irq		= irq;
+		__entry->vector		= vector;
+		__entry->is_managed	= is_managed;
+	),
+
+	TP_printk("irq=%u vector=%u is_managed=%d",
+		  __entry->irq, __entry->vector, __entry->is_managed)
+);
+
+
#endif /* CONFIG_X86_LOCAL_APIC */

 #undef TRACE_INCLUDE_PATH

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index acdc74df649d..a2761740d345 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -22,6 +22,8 @@
 #include
 #include
+#include
+
 struct apic_chip_data {
 	struct irq_cfg		cfg;
 	unsigned int		cpu;

From 69cde0004a4b5cfc7d1cec4ef9ce4cf4e26142f0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 13 Sep 2017 23:29:42 +0200
Subject: [PATCH 0124/1085] x86/vector: Use matrix allocator for vector assignment

Replace the magic vector allocation code with a simple bitmap matrix
allocator. This avoids loops and hoops over CPUs and vector arrays, so in
case of densely used vector spaces it's way faster.

This also gets rid of the magic 'spread the vectors across priority
levels' heuristics in the current allocator:

The comment in __assign_irq_vector says:

 * NOTE! The local APIC isn't very good at handling
 * multiple interrupts at the same interrupt level.
 * As the interrupt level is determined by taking the
 * vector number and shifting that right by 4, we
 * want to spread these out a bit so that they don't
 * all fall in the same interrupt level.

After doing some palaeontological research, the following was found in the
PPro Developer Manual Volume 3:

 "7.4.2. Valid Interrupts

 The local and I/O APICs support 240 distinct vectors in the range of 16
 to 255. Interrupt priority is implied by its vector, according to the
 following relationship: priority = vector / 16

 One is the lowest priority and 15 is the highest. Vectors 16 through 31
 are reserved for exclusive use by the processor. The remaining vectors
 are for general use. The processor's local APIC includes an in-service
 entry and a holding entry for each priority level. To avoid losing
 interrupts, software should allocate no more than 2 interrupt vectors
 per priority."

The current SDM tells nothing about that, instead it states:

 "If more than one interrupt is generated with the same vector number,
 the local APIC can set the bit for the vector both in the IRR and the
 ISR. This means that for the Pentium 4 and Intel Xeon processors, the
 IRR and ISR can queue two interrupts for each interrupt vector: one in
 the IRR and one in the ISR. Any additional interrupts issued for the
 same interrupt vector are collapsed into the single bit in the IRR.

 For the P6 family and Pentium processors, the IRR and ISR registers can
 queue no more than two interrupts per interrupt vector and will reject
 other interrupts that are received within the same vector."

This means that on P6/Pentium the APIC will reject a new message and tell
the sender to retry, which increases the load on the APIC bus and nothing
more.

There is no affirmative answer from Intel on that, but it's a sane
approach to remove the spreading heuristic for the following reasons:

 1) No other (relevant Open Source) operating system bothers to
    implement this or mentions it at all.

 2) The current allocator has no enforcement for this and especially the
    legacy interrupts, which are the main source of interrupts on these
    P6 and older systems, are allocated linearly in the same priority
    level and just work.

 3) Current machines have no problem with that at all, as verified with
    some experiments.

 4) AMD at least confirmed that such an issue is unknown.

 5) P6 and older are dinosaurs almost 20 years EOL, so there is really no
    reason to worry about that too much.
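For illustration, a minimal standalone sketch (not kernel code; the helper
name is made up) of the priority relationship both manuals quote, and of
why the old allocator stepped candidate vectors by 16:

  #include <stdio.h>

  /* priority = vector / 16: the upper four bits select the class */
  static unsigned int apic_priority_class(unsigned int vector)
  {
  	return vector >> 4;
  }

  int main(void)
  {
  	printf("0x41 -> class %u\n", apic_priority_class(0x41)); /* 4 */
  	printf("0x4f -> class %u\n", apic_priority_class(0x4f)); /* 4 */
  	printf("0x51 -> class %u\n", apic_priority_class(0x51)); /* 5 */
  	return 0;
  }

Two vectors whose upper four bits match share a priority level, which is
why the old code advanced by 16: consecutive allocations landed in
different levels. The matrix allocator drops exactly this stepping.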
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.443678104@linutronix.de --- arch/x86/kernel/apic/vector.c | 290 ++++++++++++++-------------------- 1 file changed, 117 insertions(+), 173 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a2761740d345..88219b80d9ec 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -28,16 +28,15 @@ struct apic_chip_data { struct irq_cfg cfg; unsigned int cpu; unsigned int prev_cpu; + unsigned int irq; struct hlist_node clist; - cpumask_var_t domain; - cpumask_var_t old_domain; u8 move_in_progress : 1; }; struct irq_domain *x86_vector_domain; EXPORT_SYMBOL_GPL(x86_vector_domain); static DEFINE_RAW_SPINLOCK(vector_lock); -static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask; +static cpumask_var_t vector_searchmask; static struct irq_chip lapic_controller; static struct irq_matrix *vector_matrix; #ifdef CONFIG_SMP @@ -101,194 +100,124 @@ static struct apic_chip_data *alloc_apic_chip_data(int node) struct apic_chip_data *apicd; apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node); - if (!apicd) - return NULL; - if (!zalloc_cpumask_var_node(&apicd->domain, GFP_KERNEL, node)) - goto out_data; - if (!zalloc_cpumask_var_node(&apicd->old_domain, GFP_KERNEL, node)) - goto out_domain; - INIT_HLIST_NODE(&apicd->clist); + if (apicd) + INIT_HLIST_NODE(&apicd->clist); return apicd; -out_domain: - free_cpumask_var(apicd->domain); -out_data: - kfree(apicd); - return NULL; } static void free_apic_chip_data(struct apic_chip_data *apicd) { - if (apicd) { - free_cpumask_var(apicd->domain); - free_cpumask_var(apicd->old_domain); - kfree(apicd); - } + kfree(apicd); } -static int __assign_irq_vector(int irq, struct apic_chip_data *d, - const struct cpumask *mask, - struct irq_data *irqd) +static void apic_update_irq_cfg(struct irq_data *irqd) { - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; - static int current_offset = VECTOR_OFFSET_START % 16; - int cpu, vector; + struct apic_chip_data *apicd = apic_chip_data(irqd); - /* - * If there is still a move in progress or the previous move has not - * been cleaned up completely, tell the caller to come back later. 
- */ - if (d->cfg.old_vector) - return -EBUSY; + lockdep_assert_held(&vector_lock); - /* Only try and allocate irqs on cpus that are present */ - cpumask_clear(d->old_domain); - cpumask_clear(searched_cpumask); - cpu = cpumask_first_and(mask, cpu_online_mask); - while (cpu < nr_cpu_ids) { - int new_cpu, offset; + apicd->cfg.dest_apicid = apic->calc_dest_apicid(apicd->cpu); + irq_data_update_effective_affinity(irqd, cpumask_of(apicd->cpu)); + trace_vector_config(irqd->irq, apicd->cfg.vector, apicd->cpu, + apicd->cfg.dest_apicid); +} - cpumask_copy(vector_cpumask, cpumask_of(cpu)); +static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, + unsigned int newcpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + struct irq_desc *desc = irq_data_to_desc(irqd); - /* - * Clear the offline cpus from @vector_cpumask for searching - * and verify whether the result overlaps with @mask. If true, - * then the call to apic->cpu_mask_to_apicid() will - * succeed as well. If not, no point in trying to find a - * vector in this mask. - */ - cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask); - if (!cpumask_intersects(vector_searchmask, mask)) - goto next_cpu; + lockdep_assert_held(&vector_lock); - if (cpumask_subset(vector_cpumask, d->domain)) { - if (cpumask_equal(vector_cpumask, d->domain)) - goto success; - /* - * Mark the cpus which are not longer in the mask for - * cleanup. - */ - cpumask_andnot(d->old_domain, d->domain, vector_cpumask); - vector = d->cfg.vector; - goto update; - } + trace_vector_update(irqd->irq, newvec, newcpu, apicd->cfg.vector, + apicd->cpu); - vector = current_vector; - offset = current_offset; -next: - vector += 16; - if (vector >= FIRST_SYSTEM_VECTOR) { - offset = (offset + 1) % 16; - vector = FIRST_EXTERNAL_VECTOR + offset; - } - - /* If the search wrapped around, try the next cpu */ - if (unlikely(current_vector == vector)) - goto next_cpu; - - if (test_bit(vector, system_vectors)) - goto next; - - for_each_cpu(new_cpu, vector_searchmask) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector])) - goto next; - } - /* Found one! */ - current_vector = vector; - current_offset = offset; - /* Schedule the old vector for cleanup on all cpus */ - if (d->cfg.vector) - cpumask_copy(d->old_domain, d->domain); - for_each_cpu(new_cpu, vector_searchmask) - per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq); - goto update; - -next_cpu: - /* - * We exclude the current @vector_cpumask from the requested - * @mask and try again with the next online cpu in the - * result. We cannot modify @mask, so we use @vector_cpumask - * as a temporary buffer here as it will be reassigned when - * calling apic->vector_allocation_domain() above. 
- */ - cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask); - cpumask_andnot(vector_cpumask, mask, searched_cpumask); - cpu = cpumask_first_and(vector_cpumask, cpu_online_mask); - continue; + /* Setup the vector move, if required */ + if (apicd->cfg.vector && cpu_online(apicd->cpu)) { + apicd->move_in_progress = true; + apicd->cfg.old_vector = apicd->cfg.vector; + apicd->prev_cpu = apicd->cpu; + } else { + apicd->cfg.old_vector = 0; } - return -ENOSPC; -update: + apicd->cfg.vector = newvec; + apicd->cpu = newcpu; + BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec])); + per_cpu(vector_irq, newcpu)[newvec] = desc; +} + +static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int vector = apicd->cfg.vector; + unsigned int cpu = apicd->cpu; + /* - * Exclude offline cpus from the cleanup mask and set the - * move_in_progress flag when the result is not empty. + * If the current target CPU is online and in the new requested + * affinity mask, there is no point in moving the interrupt from + * one CPU to another. */ - cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); - d->move_in_progress = !cpumask_empty(d->old_domain); - d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0; - d->prev_cpu = d->cpu; - d->cfg.vector = vector; - cpumask_copy(d->domain, vector_cpumask); -success: - /* - * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail - * as we already established, that mask & d->domain & cpu_online_mask - * is not empty. - * - * vector_searchmask is a subset of d->domain and has the offline - * cpus masked out. - */ - cpumask_and(vector_searchmask, vector_searchmask, mask); - BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqd, - &d->cfg.dest_apicid)); - d->cpu = cpumask_first(vector_searchmask); + if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest)) + return 0; + + vector = irq_matrix_alloc(vector_matrix, dest, false, &cpu); + if (vector > 0) + apic_update_vector(irqd, vector, cpu); + trace_vector_alloc(irqd->irq, vector, false, vector); + return vector; +} + +static int assign_vector_locked(struct irq_data *irqd, + const struct cpumask *dest) +{ + int vector = allocate_vector(irqd, dest); + + if (vector < 0) + return vector; + + apic_update_irq_cfg(irqd); return 0; } -static int assign_irq_vector(int irq, struct apic_chip_data *apicd, - const struct cpumask *mask, - struct irq_data *irqd) +static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest) { - int err; unsigned long flags; + int ret; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, apicd, mask, irqd); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + ret = assign_vector_locked(irqd, vector_searchmask); raw_spin_unlock_irqrestore(&vector_lock, flags); - return err; + return ret; } -static int assign_irq_vector_policy(int irq, int node, - struct apic_chip_data *apicd, - struct irq_alloc_info *info, - struct irq_data *irqd) +static int assign_irq_vector_policy(struct irq_data *irqd, + struct irq_alloc_info *info, int node) { if (info->mask) - return assign_irq_vector(irq, apicd, info->mask, irqd); + return assign_irq_vector(irqd, info->mask); if (node != NUMA_NO_NODE && - assign_irq_vector(irq, apicd, cpumask_of_node(node), irqd) == 0) + !assign_irq_vector(irqd, cpumask_of_node(node))) return 0; - return assign_irq_vector(irq, apicd, cpu_online_mask, irqd); + return assign_irq_vector(irqd, cpu_online_mask); } -static void 
clear_irq_vector(int irq, struct apic_chip_data *apicd) +static void clear_irq_vector(struct irq_data *irqd) { + struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned int vector = apicd->cfg.vector; + lockdep_assert_held(&vector_lock); if (!vector) return; + trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->cfg.old_vector, + apicd->prev_cpu); + per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED; + irq_matrix_free(vector_matrix, apicd->cpu, vector, false); apicd->cfg.vector = 0; /* Clean up move in progress */ @@ -297,6 +226,8 @@ static void clear_irq_vector(int irq, struct apic_chip_data *apicd) return; per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; + irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, false); + apicd->cfg.old_vector = 0; apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); } @@ -313,7 +244,7 @@ static void x86_vector_free_irqs(struct irq_domain *domain, irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i); if (irqd && irqd->chip_data) { raw_spin_lock_irqsave(&vector_lock, flags); - clear_irq_vector(virq + i, irqd->chip_data); + clear_irq_vector(irqd); apicd = irqd->chip_data; irq_domain_reset_irq_data(irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); @@ -328,6 +259,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, struct irq_alloc_info *info = arg; struct apic_chip_data *apicd; struct irq_data *irqd; + unsigned long flags; int i, err, node; if (disable_apic) @@ -348,23 +280,30 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, goto error; } + apicd->irq = virq + i; irqd->chip = &lapic_controller; irqd->chip_data = apicd; irqd->hwirq = virq + i; irqd_set_single_target(irqd); /* - * Make sure, that the legacy to IOAPIC transition stays on - * the same vector. This is required for check_timer() to - * work correctly as it might switch back to legacy mode. + * Legacy vectors are already assigned when the IOAPIC + * takes them over. They stay on the same vector. This is + * required for check_timer() to work correctly as it might + * switch back to legacy mode. Only update the hardware + * config. 
*/ if (info->flags & X86_IRQ_ALLOC_LEGACY) { apicd->cfg.vector = ISA_IRQ_VECTOR(virq + i); apicd->cpu = 0; - cpumask_copy(apicd->domain, cpumask_of(0)); + trace_vector_setup(virq + i, true, 0); + raw_spin_lock_irqsave(&vector_lock, flags); + apic_update_irq_cfg(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + continue; } - err = assign_irq_vector_policy(virq + i, node, apicd, info, - irqd); + err = assign_irq_vector_policy(irqd, info, node); + trace_vector_setup(virq + i, false, err); if (err) goto error; } @@ -498,9 +437,7 @@ int __init arch_early_irq_init(void) arch_init_msi_domain(x86_vector_domain); arch_init_htirq_domain(x86_vector_domain); - BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL)); /* * Allocate the vector matrix allocator data structure and limit the @@ -523,8 +460,10 @@ static void vector_update_shutdown_irqs(void) struct irq_data *irqd = irq_desc_get_irq_data(desc); struct apic_chip_data *ad = apic_chip_data(irqd); - if (ad && ad->cfg.vector && ad->cpu == smp_processor_id()) - this_cpu_write(vector_irq[ad->cfg.vector], desc); + if (!ad || !ad->cfg.vector || ad->cpu != smp_processor_id()) + continue; + this_cpu_write(vector_irq[ad->cfg.vector], desc); + irq_matrix_assign(vector_matrix, ad->cfg.vector); } } @@ -600,8 +539,7 @@ void apic_ack_edge(struct irq_data *irqd) static int apic_set_affinity(struct irq_data *irqd, const struct cpumask *dest, bool force) { - struct apic_chip_data *apicd = irqd->chip_data; - int err, irq = irqd->irq; + int err; if (!IS_ENABLED(CONFIG_SMP)) return -EPERM; @@ -609,7 +547,7 @@ static int apic_set_affinity(struct irq_data *irqd, if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, apicd, dest, irqd); + err = assign_irq_vector(irqd, dest); return err ? 
err : IRQ_SET_MASK_OK; } @@ -622,6 +560,19 @@ static struct irq_chip lapic_controller = { #ifdef CONFIG_SMP +static void free_moved_vector(struct apic_chip_data *apicd) +{ + unsigned int vector = apicd->cfg.old_vector; + unsigned int cpu = apicd->prev_cpu; + + trace_vector_free_moved(apicd->irq, vector, false); + irq_matrix_free(vector_matrix, cpu, vector, false); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + hlist_del_init(&apicd->clist); + apicd->cfg.old_vector = 0; + apicd->move_in_progress = 0; +} + asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); @@ -649,9 +600,7 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); continue; } - hlist_del_init(&apicd->clist); - __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); - apicd->cfg.old_vector = 0; + free_moved_vector(apicd); } raw_spin_unlock(&vector_lock); @@ -786,12 +735,7 @@ void irq_force_complete_move(struct irq_desc *desc) pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", irqd->irq, vector); } - per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; - /* Cleanup the left overs of the (half finished) move */ - cpumask_clear(apicd->old_domain); - apicd->cfg.old_vector = 0; - apicd->move_in_progress = 0; - hlist_del_init(&apicd->clist); + free_moved_vector(apicd); unlock: raw_spin_unlock(&vector_lock); } From baab1e84b1124bfd3e40ef6c8e05b2a15136e3d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:43 +0200 Subject: [PATCH 0125/1085] x86/apic: Remove unused callbacks Now that the old allocator is gone, these apic functions are unused. Remove them. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.524662349@linutronix.de --- arch/x86/kernel/apic/apic_common.c | 48 --------------------------- arch/x86/kernel/apic/apic_flat_64.c | 4 --- arch/x86/kernel/apic/apic_noop.c | 10 ------ arch/x86/kernel/apic/apic_numachip.c | 4 --- arch/x86/kernel/apic/bigsmp_32.c | 2 -- arch/x86/kernel/apic/probe_32.c | 2 -- arch/x86/kernel/apic/x2apic_cluster.c | 48 --------------------------- arch/x86/kernel/apic/x2apic_phys.c | 2 -- arch/x86/kernel/apic/x2apic_uv_x.c | 14 -------- arch/x86/kernel/vsmp_64.c | 19 ----------- arch/x86/xen/apic.c | 2 -- 11 files changed, 155 deletions(-) diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index ddc6a4301588..a360801779ae 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -11,64 +11,16 @@ u32 apic_default_calc_apicid(unsigned int cpu) return per_cpu(x86_cpu_to_apicid, cpu); } -int default_cpu_mask_to_apicid(const struct cpumask *msk, struct irq_data *irqd, - unsigned int *apicid) -{ - unsigned int cpu = cpumask_first(msk); - - if (cpu >= nr_cpu_ids) - return -EINVAL; - *apicid = per_cpu(x86_cpu_to_apicid, cpu); - irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); - return 0; -} - u32 apic_flat_calc_apicid(unsigned int cpu) { return 1U << cpu; } -int flat_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqd, - unsigned int *apicid) - -{ - struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqd); - unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS; - - if (!cpu_mask) - return -EINVAL; - *apicid = (unsigned int)cpu_mask; - cpumask_bits(effmsk)[0] = cpu_mask; - return 0; -} - bool default_check_apicid_used(physid_mask_t *map, int apicid) { return physid_isset(apicid, *map); } -void flat_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - /* - * Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. 
- */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - -void default_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - cpumask_copy(retmask, cpumask_of(cpu)); -} - void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { *retmap = *phys_map; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 697704443fda..aa85690e9b64 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -158,7 +158,6 @@ static struct apic apic_flat __ro_after_init = { .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -171,7 +170,6 @@ static struct apic apic_flat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = default_send_IPI_single, @@ -253,7 +251,6 @@ static struct apic apic_physflat __ro_after_init = { .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, /* not needed, but shouldn't hurt: */ .init_apic_ldr = flat_init_apic_ldr, @@ -267,7 +264,6 @@ static struct apic apic_physflat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = default_send_IPI_single_phys, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index d8c24e6f1a11..0285f28d531a 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -83,14 +83,6 @@ static int noop_apic_id_registered(void) return physid_isset(0, phys_cpu_present_map); } -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - if (cpu != 0) - pr_warning("APIC: Vector allocated for non-BSP cpu\n"); - cpumask_copy(retmask, cpumask_of(cpu)); -} - static u32 noop_apic_read(u32 reg) { WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); @@ -125,7 +117,6 @@ struct apic apic_noop __ro_after_init = { .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, - .vector_allocation_domain = noop_vector_allocation_domain, .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, @@ -141,7 +132,6 @@ struct apic apic_noop __ro_after_init = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = noop_send_IPI, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 4ec293b30eb8..134e04506ab4 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -253,7 +253,6 @@ static const struct apic apic_numachip1 __refconst = { .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -266,7 +265,6 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = numachip_send_IPI_one, @@ -304,7 +302,6 @@ static const struct apic 
apic_numachip2 __refconst = { .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -317,7 +314,6 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = numachip_send_IPI_one, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index de2e8597f2df..7b754c513fa5 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -158,7 +158,6 @@ static struct apic apic_bigsmp __ro_after_init = { .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, @@ -171,7 +170,6 @@ static struct apic apic_bigsmp __ro_after_init = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = default_send_IPI_single_phys, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 6a9020a3c243..fa22017de806 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -113,7 +113,6 @@ static struct apic apic_default __ro_after_init = { .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, - .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, @@ -126,7 +125,6 @@ static struct apic apic_default __ro_after_init = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = default_send_IPI_single, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 17bf63f580d7..3da94277140f 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -91,29 +91,6 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static int -x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, - unsigned int *apicid) -{ - struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); - struct cluster_mask *cmsk; - unsigned int cpu; - u32 dest = 0; - - cpu = cpumask_first(mask); - if (cpu >= nr_cpu_ids) - return -EINVAL; - - cmsk = per_cpu(cluster_masks, cpu); - cpumask_clear(effmsk); - for_each_cpu_and(cpu, &cmsk->mask, mask) { - dest |= per_cpu(x86_cpu_to_logical_apicid, cpu); - cpumask_set_cpu(cpu, effmsk); - } - *apicid = dest; - return 0; -} - static u32 x2apic_calc_apicid(unsigned int cpu) { return per_cpu(x86_cpu_to_logical_apicid, cpu); @@ -198,29 +175,6 @@ static int x2apic_cluster_probe(void) return 1; } -/* - * Each x2apic cluster is an allocation domain. - */ -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); - - /* - * To minimize vector pressure, default case of boot, device bringup - * etc will use a single cpu for the interrupt destination. 
- * - * On explicit migration requests coming from irqbalance etc, - * interrupts will be routed to the x2apic cluster (cluster-id - * derived from the first cpu in the mask) members specified - * in the mask. - */ - if (cpumask_equal(mask, cpu_online_mask)) - cpumask_copy(retmask, cpumask_of(cpu)); - else - cpumask_and(retmask, mask, &cmsk->mask); -} - static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", @@ -236,7 +190,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -249,7 +202,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .calc_dest_apicid = x2apic_calc_apicid, .send_IPI = x2apic_send_IPI, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index ebad7ddbfdfc..17c2c5b0b7b9 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -151,7 +151,6 @@ static struct apic apic_x2apic_phys __ro_after_init = { .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -164,7 +163,6 @@ static struct apic apic_x2apic_phys __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = x2apic_send_IPI, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 99c3c039646d..5832df6d9c37 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -525,18 +525,6 @@ static void uv_init_apic_ldr(void) { } -static int -uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, - unsigned int *apicid) -{ - int ret = default_cpu_mask_to_apicid(mask, irqdata, apicid); - - if (!ret) - *apicid |= uv_apicid_hibits; - - return ret; -} - static u32 apic_uv_calc_apicid(unsigned int cpu) { return apic_default_calc_apicid(cpu) | uv_apicid_hibits; @@ -593,7 +581,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -606,7 +593,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .calc_dest_apicid = apic_uv_calc_apicid, .send_IPI = uv_send_IPI_one, diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index b034b1b14b9c..44685fb2a192 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -26,9 +26,6 @@ #define TOPOLOGY_REGISTER_OFFSET 0x10 -/* Flag below is initialized once during vSMP PCI initialization. 
*/ -static int irq_routing_comply = 1; - #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -105,9 +102,6 @@ static void __init set_vsmp_pv_ops(void) if (cap & ctl & BIT(8)) { ctl &= ~BIT(8); - /* Interrupt routing set to ignore */ - irq_routing_comply = 0; - #ifdef CONFIG_PROC_FS /* Don't let users change irq affinity via procfs */ no_irq_affinity = 1; @@ -211,23 +205,10 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) return hard_smp_processor_id() >> index_msb; } -/* - * In vSMP, all cpus should be capable of handling interrupts, regardless of - * the APIC used. - */ -static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - cpumask_setall(retmask); -} - static void vsmp_apic_post_init(void) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; - - if (!irq_routing_comply) - apic->vector_allocation_domain = fill_vector_allocation_domain; } void __init vsmp_init(void) diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index fb8522bed08c..4ba3fd7039b0 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -164,7 +164,6 @@ static struct apic xen_pv_apic = { /* .dest_logical - default_send_IPI_ use it but we use our own. */ .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ - .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ @@ -177,7 +176,6 @@ static struct apic xen_pv_apic = { .get_apic_id = xen_get_apic_id, .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .calc_dest_apicid = apic_flat_calc_apicid, #ifdef CONFIG_SMP From ba801640b10d87b1c4e26cbcbe414a001255404f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:44 +0200 Subject: [PATCH 0126/1085] x86/vector: Compile SMP only code conditionally No point in compiling this for UP. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.603191841@linutronix.de --- arch/x86/kernel/apic/vector.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 88219b80d9ec..17d7d7fd45d9 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -450,6 +450,7 @@ int __init arch_early_irq_init(void) return arch_early_ioapic_init(); } +#ifdef CONFIG_SMP /* Temporary hack to keep things working */ static void vector_update_shutdown_irqs(void) { @@ -517,6 +518,25 @@ void lapic_offline(void) unlock_vector_lock(); } +static int apic_set_affinity(struct irq_data *irqd, + const struct cpumask *dest, bool force) +{ + int err; + + if (!IS_ENABLED(CONFIG_SMP)) + return -EPERM; + + if (!cpumask_intersects(dest, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irqd, dest); + return err ? 
err : IRQ_SET_MASK_OK;
+}
+
+#else
+# define apic_set_affinity	NULL
+#endif
+
 static int apic_retrigger_irq(struct irq_data *irqd)
 {
 	struct apic_chip_data *apicd = apic_chip_data(irqd);
@@ -536,21 +556,6 @@ void apic_ack_edge(struct irq_data *irqd)
 	ack_APIC_irq();
 }
 
-static int apic_set_affinity(struct irq_data *irqd,
-			     const struct cpumask *dest, bool force)
-{
-	int err;
-
-	if (!IS_ENABLED(CONFIG_SMP))
-		return -EPERM;
-
-	if (!cpumask_intersects(dest, cpu_online_mask))
-		return -EINVAL;
-
-	err = assign_irq_vector(irqd, dest);
-	return err ? err : IRQ_SET_MASK_OK;
-}
-
 static struct irq_chip lapic_controller = {
 	.name			= "APIC",
 	.irq_ack		= apic_ack_edge,

From ba224feac8bb367edd62da33552353d4bdc3fe3a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 13 Sep 2017 23:29:45 +0200
Subject: [PATCH 0127/1085] x86/vector: Untangle internal state from irq_cfg

The vector management state is not required to live in irq_cfg. irq_cfg
is only relevant for the dependent irq domains (IOAPIC, DMAR, MSI ...).

The separation of the vector management state allows directing a shut
down interrupt to a special shutdown vector w/o confusing the internal
state of the vector management.

Preparatory change for the rework of managed interrupts and the global
vector reservation scheme.

Signed-off-by: Thomas Gleixner
Tested-by: Juergen Gross
Tested-by: Yu Chen
Acked-by: Juergen Gross
Cc: Boris Ostrovsky
Cc: Tony Luck
Cc: Marc Zyngier
Cc: Alok Kataria
Cc: Joerg Roedel
Cc: "Rafael J. Wysocki"
Cc: Steven Rostedt
Cc: Christoph Hellwig
Cc: Peter Zijlstra
Cc: Borislav Petkov
Cc: Paolo Bonzini
Cc: Rui Zhang
Cc: "K. Y. Srinivasan"
Cc: Arjan van de Ven
Cc: Dan Williams
Cc: Len Brown
Link: https://lkml.kernel.org/r/20170913213155.683712356@linutronix.de
---
 arch/x86/include/asm/hw_irq.h |  3 +-
 arch/x86/kernel/apic/vector.c | 88 +++++++++++++++++++----------------
 2 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 386368890376..661540a93072 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -124,8 +124,7 @@ struct irq_alloc_info {
 
 struct irq_cfg {
 	unsigned int	dest_apicid;
-	u8		vector;
-	u8		old_vector;
+	unsigned int	vector;
 };
 
 extern struct irq_cfg *irq_cfg(unsigned int irq);

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 17d7d7fd45d9..f08d44fabef4 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -25,7 +25,9 @@
 #include
 
 struct apic_chip_data {
-	struct irq_cfg		cfg;
+	struct irq_cfg		hw_irq_cfg;
+	unsigned int		vector;
+	unsigned int		prev_vector;
 	unsigned int		cpu;
 	unsigned int		prev_cpu;
 	unsigned int		irq;
@@ -86,7 +88,7 @@ struct irq_cfg *irqd_cfg(struct irq_data *irqd)
 {
 	struct apic_chip_data *apicd = apic_chip_data(irqd);
 
-	return apicd ? &apicd->cfg : NULL;
+	return apicd ?
&apicd->hw_irq_cfg : NULL; } EXPORT_SYMBOL_GPL(irqd_cfg); @@ -110,16 +112,18 @@ static void free_apic_chip_data(struct apic_chip_data *apicd) kfree(apicd); } -static void apic_update_irq_cfg(struct irq_data *irqd) +static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, + unsigned int cpu) { struct apic_chip_data *apicd = apic_chip_data(irqd); lockdep_assert_held(&vector_lock); - apicd->cfg.dest_apicid = apic->calc_dest_apicid(apicd->cpu); - irq_data_update_effective_affinity(irqd, cpumask_of(apicd->cpu)); - trace_vector_config(irqd->irq, apicd->cfg.vector, apicd->cpu, - apicd->cfg.dest_apicid); + apicd->hw_irq_cfg.vector = vector; + apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); + trace_vector_config(irqd->irq, vector, cpu, + apicd->hw_irq_cfg.dest_apicid); } static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, @@ -130,19 +134,19 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, lockdep_assert_held(&vector_lock); - trace_vector_update(irqd->irq, newvec, newcpu, apicd->cfg.vector, + trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, apicd->cpu); /* Setup the vector move, if required */ - if (apicd->cfg.vector && cpu_online(apicd->cpu)) { + if (apicd->vector && cpu_online(apicd->cpu)) { apicd->move_in_progress = true; - apicd->cfg.old_vector = apicd->cfg.vector; + apicd->prev_vector = apicd->vector; apicd->prev_cpu = apicd->cpu; } else { - apicd->cfg.old_vector = 0; + apicd->prev_vector = 0; } - apicd->cfg.vector = newvec; + apicd->vector = newvec; apicd->cpu = newcpu; BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec])); per_cpu(vector_irq, newcpu)[newvec] = desc; @@ -151,8 +155,10 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) { struct apic_chip_data *apicd = apic_chip_data(irqd); - int vector = apicd->cfg.vector; unsigned int cpu = apicd->cpu; + int vector = apicd->vector; + + lockdep_assert_held(&vector_lock); /* * If the current target CPU is online and in the new requested @@ -172,12 +178,13 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) static int assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) { + struct apic_chip_data *apicd = apic_chip_data(irqd); int vector = allocate_vector(irqd, dest); if (vector < 0) return vector; - apic_update_irq_cfg(irqd); + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); return 0; } @@ -207,27 +214,28 @@ static int assign_irq_vector_policy(struct irq_data *irqd, static void clear_irq_vector(struct irq_data *irqd) { struct apic_chip_data *apicd = apic_chip_data(irqd); - unsigned int vector = apicd->cfg.vector; + unsigned int vector = apicd->vector; lockdep_assert_held(&vector_lock); + if (!vector) return; - trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->cfg.old_vector, + trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector, apicd->prev_cpu); per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED; irq_matrix_free(vector_matrix, apicd->cpu, vector, false); - apicd->cfg.vector = 0; + apicd->vector = 0; /* Clean up move in progress */ - vector = apicd->cfg.old_vector; + vector = apicd->prev_vector; if (!vector) return; per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, false); - apicd->cfg.old_vector = 0; + apicd->prev_vector = 0; 
apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); } @@ -293,11 +301,11 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, * config. */ if (info->flags & X86_IRQ_ALLOC_LEGACY) { - apicd->cfg.vector = ISA_IRQ_VECTOR(virq + i); + apicd->vector = ISA_IRQ_VECTOR(virq + i); apicd->cpu = 0; trace_vector_setup(virq + i, true, 0); raw_spin_lock_irqsave(&vector_lock, flags); - apic_update_irq_cfg(irqd); + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); raw_spin_unlock_irqrestore(&vector_lock, flags); continue; } @@ -319,7 +327,7 @@ error: void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, struct irq_data *irqd, int ind) { - unsigned int cpu, vec, prev_cpu, prev_vec; + unsigned int cpu, vector, prev_cpu, prev_vector; struct apic_chip_data *apicd; unsigned long flags; int irq; @@ -344,14 +352,14 @@ void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, raw_spin_lock_irqsave(&vector_lock, flags); cpu = apicd->cpu; - vec = apicd->cfg.vector; + vector = apicd->vector; prev_cpu = apicd->prev_cpu; - prev_vec = apicd->cfg.old_vector; + prev_vector = apicd->prev_vector; raw_spin_unlock_irqrestore(&vector_lock, flags); - seq_printf(m, "%*sVector: %5u\n", ind, "", vec); + seq_printf(m, "%*sVector: %5u\n", ind, "", vector); seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu); - if (prev_vec) { - seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vec); + if (prev_vector) { + seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vector); seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu); } } @@ -461,10 +469,10 @@ static void vector_update_shutdown_irqs(void) struct irq_data *irqd = irq_desc_get_irq_data(desc); struct apic_chip_data *ad = apic_chip_data(irqd); - if (!ad || !ad->cfg.vector || ad->cpu != smp_processor_id()) + if (!ad || !ad->vector || ad->cpu != smp_processor_id()) continue; - this_cpu_write(vector_irq[ad->cfg.vector], desc); - irq_matrix_assign(vector_matrix, ad->cfg.vector); + this_cpu_write(vector_irq[ad->vector], desc); + irq_matrix_assign(vector_matrix, ad->vector); } } @@ -543,7 +551,7 @@ static int apic_retrigger_irq(struct irq_data *irqd) unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - apic->send_IPI(apicd->cpu, apicd->cfg.vector); + apic->send_IPI(apicd->cpu, apicd->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -567,14 +575,14 @@ static struct irq_chip lapic_controller = { static void free_moved_vector(struct apic_chip_data *apicd) { - unsigned int vector = apicd->cfg.old_vector; + unsigned int vector = apicd->prev_vector; unsigned int cpu = apicd->prev_cpu; trace_vector_free_moved(apicd->irq, vector, false); irq_matrix_free(vector_matrix, cpu, vector, false); __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); hlist_del_init(&apicd->clist); - apicd->cfg.old_vector = 0; + apicd->prev_vector = 0; apicd->move_in_progress = 0; } @@ -589,7 +597,7 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) raw_spin_lock(&vector_lock); hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { - unsigned int irr, vector = apicd->cfg.old_vector; + unsigned int irr, vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned @@ -623,7 +631,7 @@ static void __send_cleanup_vector(struct apic_chip_data *apicd) hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); } else { - apicd->cfg.old_vector = 0; + apicd->prev_vector = 0; } raw_spin_unlock(&vector_lock); } @@ 
-632,7 +640,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) { struct apic_chip_data *apicd; - apicd = container_of(cfg, struct apic_chip_data, cfg); + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); if (apicd->move_in_progress) __send_cleanup_vector(apicd); } @@ -641,11 +649,11 @@ static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { struct apic_chip_data *apicd; - apicd = container_of(cfg, struct apic_chip_data, cfg); + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); if (likely(!apicd->move_in_progress)) return; - if (vector == apicd->cfg.vector && apicd->cpu == smp_processor_id()) + if (vector == apicd->vector && apicd->cpu == smp_processor_id()) __send_cleanup_vector(apicd); } @@ -683,9 +691,9 @@ void irq_force_complete_move(struct irq_desc *desc) goto unlock; /* - * If old_vector is empty, no action required. + * If prev_vector is empty, no action required. */ - vector = apicd->cfg.old_vector; + vector = apicd->prev_vector; if (!vector) goto unlock; From 2a85386a73fa57b114ba66421b57d3850dbcef9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:46 +0200 Subject: [PATCH 0128/1085] x86/apic/msi: Force reactivation of interrupts at startup time MSI(X) interrupts need a valid vector configuration early at allocation time, i.e. before the PCI core enables MSI(X). With managed interrupts and the new global reservation scheme, the early configuration will not assign a real device vector, but a special shutdown vector. When the irq is started up, then the interrupt must be reconfigured. Tell the MSI irqdomain core about it. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.774066582@linutronix.de --- arch/x86/kernel/apic/msi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be764422..5b6dd1a85ec4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -129,7 +129,7 @@ static struct msi_domain_ops pci_msi_domain_ops = { static struct msi_domain_info pci_msi_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX, + MSI_FLAG_PCI_MSIX | MSI_FLAG_MUST_REACTIVATE, .ops = &pci_msi_domain_ops, .chip = &pci_msi_controller, .handler = handle_edge_irq, @@ -167,7 +167,8 @@ static struct irq_chip pci_msi_ir_controller = { static struct msi_domain_info pci_msi_ir_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX | + MSI_FLAG_MUST_REACTIVATE, .ops = &pci_msi_domain_ops, .chip = &pci_msi_ir_controller, .handler = handle_edge_irq, From d491bdff888e8a287f6017c70a8dd10f46984851 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:47 +0200 Subject: [PATCH 0129/1085] iommu/vt-d: Reevaluate vector configuration on activate() With the upcoming reservation/management scheme, early activation will assign a special vector. The final activation at request_irq() assigns a real vector, which needs to be updated in the tables. 
Split out the reconfiguration code in set_affinity and use it for reactivation. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: iommu@lists.linux-foundation.org Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.853028808@linutronix.de --- drivers/iommu/intel_irq_remapping.c | 38 ++++++++++++++++------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 762d84713b7a..e274d9d12ba4 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -1121,6 +1121,24 @@ struct irq_remap_ops intel_irq_remap_ops = { .get_irq_domain = intel_get_irq_domain, }; +static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + struct irte *irte = &ir_data->irte_entry; + struct irq_cfg *cfg = irqd_cfg(irqd); + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + irte->vector = cfg->vector; + irte->dest_id = IRTE_DEST(cfg->dest_apicid); + + /* Update the hardware only if the interrupt is in remapped mode. */ + if (!force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) + modify_irte(&ir_data->irq_2_iommu, irte); +} + /* * Migrate the IO-APIC irq in the presence of intr-remapping. * @@ -1139,27 +1157,15 @@ static int intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct intel_ir_data *ir_data = data->chip_data; - struct irte *irte = &ir_data->irte_entry; - struct irq_cfg *cfg = irqd_cfg(data); struct irq_data *parent = data->parent_data; + struct irq_cfg *cfg = irqd_cfg(data); int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) return ret; - /* - * Atomically updates the IRTE with the new destination, vector - * and flushes the interrupt entry cache. - */ - irte->vector = cfg->vector; - irte->dest_id = IRTE_DEST(cfg->dest_apicid); - - /* Update the hardware only if the interrupt is in remapped mode. */ - if (ir_data->irq_2_iommu.mode == IRQ_REMAPPING) - modify_irte(&ir_data->irq_2_iommu, irte); - + intel_ir_reconfigure_irte(data, false); /* * After this point, all the interrupts will start arriving * at the new destination. So, time to cleanup the previous @@ -1392,9 +1398,7 @@ static void intel_irq_remapping_free(struct irq_domain *domain, static int intel_irq_remapping_activate(struct irq_domain *domain, struct irq_data *irq_data, bool early) { - struct intel_ir_data *data = irq_data->chip_data; - - modify_irte(&data->irq_2_iommu, &data->irte_entry); + intel_ir_reconfigure_irte(irq_data, true); return 0; } From 5ba204a1817ba95a7b24dbe8ef2c7ddd4cea886e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:48 +0200 Subject: [PATCH 0130/1085] iommu/amd: Reevaluate vector configuration on activate() With the upcoming reservation/management scheme, early activation will assign a special vector. The final activation at request_irq() assigns a real vector, which needs to be updated in the tables. Split out the reconfiguration code in set_affinity and use it for reactivation. 
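For illustration, a minimal standalone sketch of that refactoring pattern
(all sketch_* identifiers are hypothetical stand-ins, not the driver's
real API): one helper performs the table update, and both the activation
and the affinity path call it instead of duplicating the update.

  #include <stdio.h>

  struct sketch_irte { unsigned int vector, dest_id; };

  /* single helper: update the cached entry, then flush it to hardware */
  static void sketch_update_irte(struct sketch_irte *e,
  				 unsigned int vector, unsigned int dest)
  {
  	e->vector = vector;
  	e->dest_id = dest;
  	printf("flush IRTE: vector=0x%x dest=0x%x\n", e->vector, e->dest_id);
  }

  int main(void)
  {
  	struct sketch_irte e = { 0, 0 };

  	/* final activation at request_irq(): program the real vector */
  	sketch_update_irte(&e, 0x41, 0x01);
  	/* later affinity change: same helper, different call site */
  	sketch_update_irte(&e, 0x51, 0x02);
  	return 0;
  }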
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: iommu@lists.linux-foundation.org Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213155.944883733@linutronix.de --- drivers/iommu/amd_iommu.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index ea03f4138f5f..a78fa34f113a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4170,16 +4170,25 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_common(domain, virq, nr_irqs); } +static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, + struct amd_ir_data *ir_data, + struct irq_2_irte *irte_info, + struct irq_cfg *cfg); + static int irq_remapping_activate(struct irq_domain *domain, struct irq_data *irq_data, bool early) { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid]; + struct irq_cfg *cfg = irqd_cfg(irq_data); - if (iommu) - iommu->irte_ops->activate(data->entry, irte_info->devid, - irte_info->index); + if (!iommu) + return 0; + + iommu->irte_ops->activate(data->entry, irte_info->devid, + irte_info->index); + amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg); return 0; } @@ -4267,6 +4276,22 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data); } + +static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, + struct amd_ir_data *ir_data, + struct irq_2_irte *irte_info, + struct irq_cfg *cfg) +{ + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid, + irte_info->index, cfg->vector, + cfg->dest_apicid); +} + static int amd_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -4284,13 +4309,7 @@ static int amd_ir_set_affinity(struct irq_data *data, if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) return ret; - /* - * Atomically updates the IRTE with the new destination, vector - * and flushes the interrupt entry cache. - */ - iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid, - irte_info->index, cfg->vector, cfg->dest_apicid); - + amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg); /* * After this point, all the interrupts will start arriving * at the new destination. So, time to cleanup the previous From 90ad9e2d91067983f3328e21b306323877e5f48a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:49 +0200 Subject: [PATCH 0131/1085] x86/io_apic: Reevaluate vector configuration on activate() With the upcoming reservation/management scheme, early activation will assign a special vector. The final activation at request_irq() assigns a real vector, which needs to be updated in the ioapic. Split out the reconfiguration code in set_affinity and use it for reactivation. 
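For illustration, a minimal standalone sketch of the ownership rule the
shared helper has to respect (all sketch_* identifiers are hypothetical;
the real code decides by comparing the installed irq chip against the
ioapic chip):

  #include <stdio.h>

  enum sketch_parent { SKETCH_VECTOR_DOMAIN, SKETCH_REMAP_DOMAIN };

  struct sketch_rte { unsigned int vector, dest; };

  static void sketch_configure_entry(struct sketch_rte *rte,
  				     enum sketch_parent parent,
  				     unsigned int vector, unsigned int dest)
  {
  	/* With remapping active, vector/dest belong to the remapping
  	 * domain, so only the vector-domain case rewrites them. */
  	if (parent == SKETCH_VECTOR_DOMAIN) {
  		rte->vector = vector;
  		rte->dest = dest;
  	}
  	/* The entry is written back to every pin in both cases. */
  	printf("write RTE: vector=0x%x dest=0x%x\n", rte->vector, rte->dest);
  }

  int main(void)
  {
  	struct sketch_rte rte = { 0x41, 0x01 };

  	sketch_configure_entry(&rte, SKETCH_VECTOR_DOMAIN, 0x51, 0x02);
  	sketch_configure_entry(&rte, SKETCH_REMAP_DOMAIN, 0x61, 0x03);
  	return 0;
  }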
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213156.025232175@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 37 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a4b0c60ab8e1..18c6a4861586 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1862,26 +1862,36 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +static void ioapic_configure_entry(struct irq_data *irqd) +{ + struct mp_chip_data *mpd = irqd->chip_data; + struct irq_cfg *cfg = irqd_cfg(irqd); + struct irq_pin_list *entry; + + /* + * Only update when the parent is the vector domain, don't touch it + * if the parent is the remapping domain. Check the installed + * ioapic chip to verify that. + */ + if (irqd->chip == &ioapic_chip) { + mpd->entry.dest = cfg->dest_apicid; + mpd->entry.vector = cfg->vector; + } + for_each_irq_pin(entry, mpd->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); +} + static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) { struct irq_data *parent = irq_data->parent_data; - struct mp_chip_data *data = irq_data->chip_data; - struct irq_pin_list *entry; - struct irq_cfg *cfg; unsigned long flags; int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); raw_spin_lock_irqsave(&ioapic_lock, flags); - if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - cfg = irqd_cfg(irq_data); - data->entry.dest = cfg->dest_apicid; - data->entry.vector = cfg->vector; - for_each_irq_pin(entry, data->irq_2_pin) - __ioapic_write_entry(entry->apic, entry->pin, - data->entry); - } + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) + ioapic_configure_entry(irq_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; @@ -2980,12 +2990,9 @@ int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool early) { unsigned long flags; - struct irq_pin_list *entry; - struct mp_chip_data *data = irq_data->chip_data; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, data->irq_2_pin) - __ioapic_write_entry(entry->apic, entry->pin, data->entry); + ioapic_configure_entry(irq_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return 0; } From 2db1f959d9dc16035f2eb44ed5fdb2789b754d6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:50 +0200 Subject: [PATCH 0132/1085] x86/vector: Handle managed interrupts proper Managed interrupts need to reserve interrupt vectors permanently, but as long as the interrupt is deactivated, the vector should not be active. Reserve a new system vector, which can be used to initially initialize MSI/DMAR/IOAPIC entries. In that situation the interrupts are disabled in the corresponding MSI/DMAR/IOAPIC devices. So the vector should never be sent to any CPU. When the managed interrupt is started up, a real vector is assigned from the managed vector space and configured in MSI/DMAR/IOAPIC. 
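The lifecycle can be summarized in a few lines of illustrative user-space C. The name and 0xef value of the shutdown vector come from the patch below; everything else here is a stand-in:

#include <stdio.h>

#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef	/* from the patch below */

struct vec_state { unsigned int vector; unsigned int is_managed : 1; };

/* Reservation at allocation time: account the interrupt in the managed
 * vector space, but point the (still masked) MSI/DMAR/IOAPIC entry at
 * the shutdown vector, which no device may ever raise. */
static void reserve_managed(struct vec_state *s)
{
	s->is_managed = 1;
	s->vector = MANAGED_IRQ_SHUTDOWN_VECTOR;
}

/* Startup at request_irq() time: replace the placeholder with a real
 * vector allocated from the managed vector space. */
static void startup_managed(struct vec_state *s, unsigned int real_vector)
{
	s->vector = real_vector;
}

int main(void)
{
	struct vec_state q = { 0, 0 };

	reserve_managed(&q);
	printf("inactive: vector=0x%x\n", q.vector);	/* 0xef */
	startup_managed(&q, 0x30);
	printf("active:   vector=0x%x\n", q.vector);
	return 0;
}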
This allows a clear separation of inactive and active modes and simplifies the final decisions whether the global vector space is sufficient for CPU offline operations. The vector space can be reserved even on offline CPUs and will survive CPU offline/online operations. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213156.104616625@linutronix.de --- arch/x86/include/asm/irq_vectors.h | 8 +- arch/x86/kernel/apic/vector.c | 190 ++++++++++++++++++++++++++--- 2 files changed, 174 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index aaf8d28b5d00..1e9bd28f842d 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -101,12 +101,8 @@ #define POSTED_INTR_NESTED_VECTOR 0xf0 #endif -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef +#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef +#define LOCAL_TIMER_VECTOR 0xee #define NR_VECTORS 256 diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f08d44fabef4..3f53572c89cb 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -32,7 +32,8 @@ struct apic_chip_data { unsigned int prev_cpu; unsigned int irq; struct hlist_node clist; - u8 move_in_progress : 1; + unsigned int move_in_progress : 1, + is_managed : 1; }; struct irq_domain *x86_vector_domain; @@ -152,6 +153,28 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, per_cpu(vector_irq, newcpu)[newvec] = desc; } +static void vector_assign_managed_shutdown(struct irq_data *irqd) +{ + unsigned int cpu = cpumask_first(cpu_online_mask); + + apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu); +} + +static int reserve_managed_vector(struct irq_data *irqd) +{ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&vector_lock, flags); + apicd->is_managed = true; + ret = irq_matrix_reserve_managed(vector_matrix, affmsk); + raw_spin_unlock_irqrestore(&vector_lock, flags); + trace_vector_reserve_managed(irqd->irq, ret); + return ret; +} + static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) { struct apic_chip_data *apicd = apic_chip_data(irqd); @@ -200,20 +223,65 @@ static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest) return ret; } -static int assign_irq_vector_policy(struct irq_data *irqd, - struct irq_alloc_info *info, int node) +static int assign_irq_vector_any_locked(struct irq_data *irqd) { + int node = irq_data_get_node(irqd); + + if (node != NUMA_NO_NODE) { + if (!assign_vector_locked(irqd, cpumask_of_node(node))) + return 0; + } + return assign_vector_locked(irqd, cpu_online_mask); +} + +static int assign_irq_vector_any(struct irq_data *irqd) +{ + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&vector_lock, flags); + ret = assign_irq_vector_any_locked(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); 
+ return ret; +} + +static int +assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info) +{ + if (irqd_affinity_is_managed(irqd)) + return reserve_managed_vector(irqd); if (info->mask) return assign_irq_vector(irqd, info->mask); - if (node != NUMA_NO_NODE && - !assign_irq_vector(irqd, cpumask_of_node(node))) + return assign_irq_vector_any(irqd); +} + +static int +assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) +{ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + int vector, cpu; + + cpumask_and(vector_searchmask, vector_searchmask, affmsk); + cpu = cpumask_first(vector_searchmask); + if (cpu >= nr_cpu_ids) + return -EINVAL; + /* set_affinity might call here for nothing */ + if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask)) return 0; - return assign_irq_vector(irqd, cpu_online_mask); + vector = irq_matrix_alloc_managed(vector_matrix, cpu); + trace_vector_alloc_managed(irqd->irq, vector, vector); + if (vector < 0) + return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); + return 0; } static void clear_irq_vector(struct irq_data *irqd) { struct apic_chip_data *apicd = apic_chip_data(irqd); + bool managed = irqd_affinity_is_managed(irqd); unsigned int vector = apicd->vector; lockdep_assert_held(&vector_lock); @@ -225,7 +293,7 @@ static void clear_irq_vector(struct irq_data *irqd) apicd->prev_cpu); per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED; - irq_matrix_free(vector_matrix, apicd->cpu, vector, false); + irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); apicd->vector = 0; /* Clean up move in progress */ @@ -234,12 +302,86 @@ static void clear_irq_vector(struct irq_data *irqd) return; per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; - irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, false); + irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); } +static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + + trace_vector_deactivate(irqd->irq, apicd->is_managed, + false, false); + + if (apicd->is_managed) + return; + + raw_spin_lock_irqsave(&vector_lock, flags); + clear_irq_vector(irqd); + vector_assign_managed_shutdown(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); +} + +static int activate_managed(struct irq_data *irqd) +{ + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + int ret; + + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) { + /* Something in the core code broke! Survive gracefully */ + pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq); + return EINVAL; + } + + ret = assign_managed_vector(irqd, vector_searchmask); + /* + * This should not happen. The vector reservation got buggered. Handle + * it gracefully. 
+ */ + if (WARN_ON_ONCE(ret < 0)) { + pr_err("Managed startup irq %u, no vector available\n", + irqd->irq); + } + return ret; +} + +static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, + bool early) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret = 0; + + trace_vector_activate(irqd->irq, apicd->is_managed, + false, early); + + if (!apicd->is_managed) + return 0; + + raw_spin_lock_irqsave(&vector_lock, flags); + if (early || irqd_is_managed_and_shutdown(irqd)) + vector_assign_managed_shutdown(irqd); + else + ret = activate_managed(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static void vector_free_reserved_and_managed(struct irq_data *irqd) +{ + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + + trace_vector_teardown(irqd->irq, apicd->is_managed, false); + + if (apicd->is_managed) + irq_matrix_remove_managed(vector_matrix, dest); +} + static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { @@ -253,6 +395,7 @@ static void x86_vector_free_irqs(struct irq_domain *domain, if (irqd && irqd->chip_data) { raw_spin_lock_irqsave(&vector_lock, flags); clear_irq_vector(irqd); + vector_free_reserved_and_managed(irqd); apicd = irqd->chip_data; irq_domain_reset_irq_data(irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); @@ -310,7 +453,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, continue; } - err = assign_irq_vector_policy(irqd, info, node); + err = assign_irq_vector_policy(irqd, info); trace_vector_setup(virq + i, false, err); if (err) goto error; @@ -368,6 +511,8 @@ void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, static const struct irq_domain_ops x86_vector_domain_ops = { .alloc = x86_vector_alloc_irqs, .free = x86_vector_free_irqs, + .activate = x86_vector_activate, + .deactivate = x86_vector_deactivate, #ifdef CONFIG_GENERIC_IRQ_DEBUGFS .debug_show = x86_vector_debug_show, #endif @@ -531,13 +676,13 @@ static int apic_set_affinity(struct irq_data *irqd, { int err; - if (!IS_ENABLED(CONFIG_SMP)) - return -EPERM; - - if (!cpumask_intersects(dest, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irqd, dest); + raw_spin_lock(&vector_lock); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (irqd_affinity_is_managed(irqd)) + err = assign_managed_vector(irqd, vector_searchmask); + else + err = assign_vector_locked(irqd, vector_searchmask); + raw_spin_unlock(&vector_lock); return err ? err : IRQ_SET_MASK_OK; } @@ -577,9 +722,18 @@ static void free_moved_vector(struct apic_chip_data *apicd) { unsigned int vector = apicd->prev_vector; unsigned int cpu = apicd->prev_cpu; + bool managed = apicd->is_managed; - trace_vector_free_moved(apicd->irq, vector, false); - irq_matrix_free(vector_matrix, cpu, vector, false); + /* + * This should never happen. Managed interrupts are not + * migrated except on CPU down, which does not involve the + * cleanup vector. But try to keep the accounting correct + * nevertheless. 
+ */ + WARN_ON_ONCE(managed); + + trace_vector_free_moved(apicd->irq, vector, managed); + irq_matrix_free(vector_matrix, cpu, vector, managed); __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); hlist_del_init(&apicd->clist); apicd->prev_vector = 0; From 4900be83602b6be07366d3e69f756c1959f4169a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:51 +0200 Subject: [PATCH 0133/1085] x86/vector/msi: Switch to global reservation mode Devices with many queues allocate a huge number of interrupts and get assigned a vector for each of them, even if the queues are not active and the interrupts never requested. This causes problems with the decision whether the global vector space is sufficient for CPU hot unplug operations. Change it to a reservation scheme, which allows overcommitment. When the interrupt is allocated and initialized the vector assignment merily updates the reservation request counter in the matrix allocator. This counter is used to emit warnings when the reservation exceeds the available vector space, but does not affect CPU offline operations. Like the managed interrupts the corresponding MSI/DMAR/IOAPIC entries are directed to the special shutdown vector. When the interrupt is requested, then the activation code tries to assign a real vector. If that succeeds the interrupt is started up and functional. If that fails, then subsequently request_irq() fails with -ENOSPC. This allows a clear separation of inactive and active modes and simplifies the final decisions whether the global vector space is sufficient for CPU offline operations. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213156.184211133@linutronix.de --- arch/x86/kernel/apic/vector.c | 97 +++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3f53572c89cb..46a9ae921819 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -33,7 +33,9 @@ struct apic_chip_data { unsigned int irq; struct hlist_node clist; unsigned int move_in_progress : 1, - is_managed : 1; + is_managed : 1, + can_reserve : 1, + has_reserved : 1; }; struct irq_domain *x86_vector_domain; @@ -175,9 +177,31 @@ static int reserve_managed_vector(struct irq_data *irqd) return ret; } +static void reserve_irq_vector_locked(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + irq_matrix_reserve(vector_matrix); + apicd->can_reserve = true; + apicd->has_reserved = true; + trace_vector_reserve(irqd->irq, 0); + vector_assign_managed_shutdown(irqd); +} + +static int reserve_irq_vector(struct irq_data *irqd) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + reserve_irq_vector_locked(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return 0; +} + static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) { struct apic_chip_data *apicd = apic_chip_data(irqd); + bool resvd = apicd->has_reserved; unsigned int cpu = apicd->cpu; int vector = apicd->vector; @@ -191,10 +215,10 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest)) return 0; - vector = irq_matrix_alloc(vector_matrix, dest, false, &cpu); + vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); if (vector > 0) apic_update_vector(irqd, vector, cpu); - trace_vector_alloc(irqd->irq, vector, false, vector); + trace_vector_alloc(irqd->irq, vector, resvd, vector); return vector; } @@ -252,7 +276,11 @@ assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info) return reserve_managed_vector(irqd); if (info->mask) return assign_irq_vector(irqd, info->mask); - return assign_irq_vector_any(irqd); + if (info->type != X86_IRQ_ALLOC_TYPE_MSI && + info->type != X86_IRQ_ALLOC_TYPE_MSIX) + return assign_irq_vector_any(irqd); + /* For MSI(X) make only a global reservation with no guarantee */ + return reserve_irq_vector(irqd); } static int @@ -314,17 +342,35 @@ static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd) unsigned long flags; trace_vector_deactivate(irqd->irq, apicd->is_managed, - false, false); + apicd->can_reserve, false); - if (apicd->is_managed) + /* Regular fixed assigned interrupt */ + if (!apicd->is_managed && !apicd->can_reserve) + return; + /* If the interrupt has a global reservation, nothing to do */ + if (apicd->has_reserved) return; raw_spin_lock_irqsave(&vector_lock, flags); clear_irq_vector(irqd); - vector_assign_managed_shutdown(irqd); + if (apicd->can_reserve) + reserve_irq_vector_locked(irqd); + else + vector_assign_managed_shutdown(irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); } +static int activate_reserved(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int ret; + + ret = assign_irq_vector_any_locked(irqd); + if (!ret) + apicd->has_reserved = false; + return ret; +} + static int activate_managed(struct irq_data *irqd) { const struct cpumask *dest = irq_data_get_affinity_mask(irqd); 
@@ -357,16 +403,19 @@ static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, int ret = 0; trace_vector_activate(irqd->irq, apicd->is_managed, - false, early); + apicd->can_reserve, early); - if (!apicd->is_managed) + /* Nothing to do for fixed assigned vectors */ + if (!apicd->can_reserve && !apicd->is_managed) return 0; raw_spin_lock_irqsave(&vector_lock, flags); if (early || irqd_is_managed_and_shutdown(irqd)) vector_assign_managed_shutdown(irqd); - else + else if (apicd->is_managed) ret = activate_managed(irqd); + else if (apicd->has_reserved) + ret = activate_reserved(irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); return ret; } @@ -376,8 +425,11 @@ static void vector_free_reserved_and_managed(struct irq_data *irqd) const struct cpumask *dest = irq_data_get_affinity_mask(irqd); struct apic_chip_data *apicd = apic_chip_data(irqd); - trace_vector_teardown(irqd->irq, apicd->is_managed, false); + trace_vector_teardown(irqd->irq, apicd->is_managed, + apicd->has_reserved); + if (apicd->has_reserved) + irq_matrix_remove_reserved(vector_matrix); if (apicd->is_managed) irq_matrix_remove_managed(vector_matrix, dest); } @@ -604,22 +656,6 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SMP -/* Temporary hack to keep things working */ -static void vector_update_shutdown_irqs(void) -{ - struct irq_desc *desc; - int irq; - - for_each_irq_desc(irq, desc) { - struct irq_data *irqd = irq_desc_get_irq_data(desc); - struct apic_chip_data *ad = apic_chip_data(irqd); - - if (!ad || !ad->vector || ad->cpu != smp_processor_id()) - continue; - this_cpu_write(vector_irq[ad->vector], desc); - irq_matrix_assign(vector_matrix, ad->vector); - } -} static struct irq_desc *__setup_vector_irq(int vector) { @@ -655,13 +691,6 @@ void lapic_online(void) */ for (vector = 0; vector < NR_VECTORS; vector++) this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); - - /* - * Until the rewrite of the managed interrupt management is in - * place it's necessary to walk the irq descriptors and check for - * interrupts which are targeted at this CPU. - */ - vector_update_shutdown_irqs(); } void lapic_offline(void) From 464d12309e1b5829597793db551ae8ecaecf4036 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:52 +0200 Subject: [PATCH 0134/1085] x86/vector: Switch IOAPIC to global reservation mode IOAPICs install and allocate vectors for inactive interrupts. This results in problems on CPU offline and wastes vector resources for nothing. Handle inactive IOAPIC interrupts in the same way as inactive MSI interrupts and switch them to the global reservation mode. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213156.273454591@linutronix.de --- arch/x86/kernel/apic/vector.c | 56 +++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 46a9ae921819..5e58da8efe77 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -258,17 +258,6 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd) return assign_vector_locked(irqd, cpu_online_mask); } -static int assign_irq_vector_any(struct irq_data *irqd) -{ - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&vector_lock, flags); - ret = assign_irq_vector_any_locked(irqd); - raw_spin_unlock_irqrestore(&vector_lock, flags); - return ret; -} - static int assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info) { @@ -276,10 +265,10 @@ assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info) return reserve_managed_vector(irqd); if (info->mask) return assign_irq_vector(irqd, info->mask); - if (info->type != X86_IRQ_ALLOC_TYPE_MSI && - info->type != X86_IRQ_ALLOC_TYPE_MSIX) - return assign_irq_vector_any(irqd); - /* For MSI(X) make only a global reservation with no guarantee */ + /* + * Make only a global reservation with no guarantee. A real vector + * is associated at activation time. + */ return reserve_irq_vector(irqd); } @@ -456,13 +445,39 @@ static void x86_vector_free_irqs(struct irq_domain *domain, } } +static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd, + struct apic_chip_data *apicd) +{ + unsigned long flags; + bool realloc = false; + + apicd->vector = ISA_IRQ_VECTOR(virq); + apicd->cpu = 0; + + raw_spin_lock_irqsave(&vector_lock, flags); + /* + * If the interrupt is activated, then it must stay at this vector + * position. That's usually the timer interrupt (0). + */ + if (irqd_is_activated(irqd)) { + trace_vector_setup(virq, true, 0); + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); + } else { + /* Release the vector */ + apicd->can_reserve = true; + clear_irq_vector(irqd); + realloc = true; + } + raw_spin_unlock_irqrestore(&vector_lock, flags); + return realloc; +} + static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_alloc_info *info = arg; struct apic_chip_data *apicd; struct irq_data *irqd; - unsigned long flags; int i, err, node; if (disable_apic) @@ -496,13 +511,8 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, * config. */ if (info->flags & X86_IRQ_ALLOC_LEGACY) { - apicd->vector = ISA_IRQ_VECTOR(virq + i); - apicd->cpu = 0; - trace_vector_setup(virq + i, true, 0); - raw_spin_lock_irqsave(&vector_lock, flags); - apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); - raw_spin_unlock_irqrestore(&vector_lock, flags); - continue; + if (!vector_configure_legacy(virq + i, irqd, apicd)) + continue; } err = assign_irq_vector_policy(irqd, info); From 2cffad7bad83157f89332872015f4305d2ac09ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:53 +0200 Subject: [PATCH 0135/1085] x86/irq: Simplify hotplug vector accounting Before a CPU is taken offline the number of active interrupt vectors on the outgoing CPU and the number of vectors which are available on the other online CPUs are counted and compared. 
If the active vectors are more than the available vectors on the other CPUs then the CPU hot-unplug operation is aborted. This again uses loop based search and is inaccurate. The bitmap matrix allocator has accurate accounting information and can tell exactly whether the vector space is sufficient or not. Emit a message when the number of globaly reserved (unallocated) vectors is larger than the number of available vectors after offlining a CPU because after that point request_irq() might fail. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213156.351193962@linutronix.de --- arch/x86/include/asm/apic.h | 1 + arch/x86/include/asm/irq.h | 4 -- arch/x86/kernel/apic/vector.c | 32 ++++++++++- arch/x86/kernel/irq.c | 99 ----------------------------------- arch/x86/kernel/smpboot.c | 2 +- 5 files changed, 33 insertions(+), 105 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 7a8651921ed5..a9e57f08bfa6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -386,6 +386,7 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[]; */ #ifdef CONFIG_SMP extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); +extern int lapic_can_unplug_cpu(void); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 9958ceea2fa3..1002a3e8fccc 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -25,11 +25,7 @@ extern void irq_ctx_init(int cpu); struct irq_desc; -#ifdef CONFIG_HOTPLUG_CPU -#include -extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); -#endif #ifdef CONFIG_HAVE_KVM extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 5e58da8efe77..14b21ca4483c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -945,7 +945,37 @@ void irq_force_complete_move(struct irq_desc *desc) unlock: raw_spin_unlock(&vector_lock); } -#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Note, this is not accurate accounting, but at least good enough to + * prevent that the actual interrupt move will run out of vectors. + */ +int lapic_can_unplug_cpu(void) +{ + unsigned int rsvd, avl, tomove, cpu = smp_processor_id(); + int ret = 0; + + raw_spin_lock(&vector_lock); + tomove = irq_matrix_allocated(vector_matrix); + avl = irq_matrix_available(vector_matrix, true); + if (avl < tomove) { + pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n", + cpu, tomove, avl); + ret = -ENOSPC; + goto out; + } + rsvd = irq_matrix_reserved(vector_matrix); + if (avl < rsvd) { + pr_warn("Reserved vectors %u > available %u. 
IRQ request may fail\n", + rsvd, avl); + } +out: + raw_spin_unlock(&vector_lock); + return ret; +} +#endif /* HOTPLUG_CPU */ +#endif /* SMP */ static void __init print_APIC_field(int base) { diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 188990c3a514..49cfd9fe7589 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -333,105 +333,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) #ifdef CONFIG_HOTPLUG_CPU - -/* These two declarations are only used in check_irq_vectors_for_cpu_disable() - * below, which is protected by stop_machine(). Putting them on the stack - * results in a stack frame overflow. Dynamically allocating could result in a - * failure so declare these two cpumasks as global. - */ -static struct cpumask affinity_new, online_new; - -/* - * This cpu is going to be removed and its vectors migrated to the remaining - * online cpus. Check to see if there are enough vectors in the remaining cpus. - * This function is protected by stop_machine(). - */ -int check_irq_vectors_for_cpu_disable(void) -{ - unsigned int this_cpu, vector, this_count, count; - struct irq_desc *desc; - struct irq_data *data; - int cpu; - - this_cpu = smp_processor_id(); - cpumask_copy(&online_new, cpu_online_mask); - cpumask_clear_cpu(this_cpu, &online_new); - - this_count = 0; - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - desc = __this_cpu_read(vector_irq[vector]); - if (IS_ERR_OR_NULL(desc)) - continue; - /* - * Protect against concurrent action removal, affinity - * changes etc. - */ - raw_spin_lock(&desc->lock); - data = irq_desc_get_irq_data(desc); - cpumask_copy(&affinity_new, - irq_data_get_affinity_mask(data)); - cpumask_clear_cpu(this_cpu, &affinity_new); - - /* Do not count inactive or per-cpu irqs. */ - if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) { - raw_spin_unlock(&desc->lock); - continue; - } - - raw_spin_unlock(&desc->lock); - /* - * A single irq may be mapped to multiple cpu's - * vector_irq[] (for example IOAPIC cluster mode). In - * this case we have two possibilities: - * - * 1) the resulting affinity mask is empty; that is - * this the down'd cpu is the last cpu in the irq's - * affinity mask, or - * - * 2) the resulting affinity mask is no longer a - * subset of the online cpus but the affinity mask is - * not zero; that is the down'd cpu is the last online - * cpu in a user set affinity mask. - */ - if (cpumask_empty(&affinity_new) || - !cpumask_subset(&affinity_new, &online_new)) - this_count++; - } - /* No need to check any further. */ - if (!this_count) - return 0; - - count = 0; - for_each_online_cpu(cpu) { - if (cpu == this_cpu) - continue; - /* - * We scan from FIRST_EXTERNAL_VECTOR to first system - * vector. If the vector is marked in the used vectors - * bitmap or an irq is assigned to it, we don't count - * it as available. - * - * As this is an inaccurate snapshot anyway, we can do - * this w/o holding vector_lock. - */ - for (vector = FIRST_EXTERNAL_VECTOR; - vector < FIRST_SYSTEM_VECTOR; vector++) { - if (!test_bit(vector, system_vectors) && - IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { - if (++count == this_count) - return 0; - } - } - } - - if (count < this_count) { - pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n", - this_cpu, this_count, count); - return -ERANGE; - } - return 0; -} - /* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
*/ void fixup_irqs(void) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 86739f04701b..92aadfa30d61 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1525,7 +1525,7 @@ int native_cpu_disable(void) { int ret; - ret = check_irq_vectors_for_cpu_disable(); + ret = lapic_can_unplug_cpu(); if (ret) return ret; From d6ffc6ac83b1f9f12652d89b9cb5bcbfbea7796c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:54 +0200 Subject: [PATCH 0136/1085] x86/vector: Respect affinity mask in irq descriptor The interrupt descriptor has a preset affinity mask at allocation time, which is usually the default affinity mask. The current code does not respect that mask and places the vector at some random CPU, which gets corrected later by a set_affinity() call. That's silly because the vector allocation can respect the mask upfront and place the interrupt on a CPU which is in the mask. If that fails, then the affinity is broken and a interrupt assigned on any online CPU. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213156.431670325@linutronix.de --- arch/x86/kernel/apic/vector.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 14b21ca4483c..6789e286def9 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -249,12 +249,25 @@ static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest) static int assign_irq_vector_any_locked(struct irq_data *irqd) { + /* Get the affinity mask - either irq_default_affinity or (user) set */ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); int node = irq_data_get_node(irqd); - if (node != NUMA_NO_NODE) { - if (!assign_vector_locked(irqd, cpumask_of_node(node))) - return 0; - } + if (node == NUMA_NO_NODE) + goto all; + /* Try the intersection of @affmsk and node mask */ + cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + /* Try the node mask */ + if (!assign_vector_locked(irqd, cpumask_of_node(node))) + return 0; +all: + /* Try the full affinity mask */ + cpumask_and(vector_searchmask, affmsk, cpu_online_mask); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + /* Try the full online mask */ return assign_vector_locked(irqd, cpu_online_mask); } From 5df32107f609c1f621bcdac0a685c23677ef671e Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 28 Aug 2017 08:21:53 -0400 Subject: [PATCH 0137/1085] timekeeping: Make fast accessors return 0 before timekeeping is initialized printk timestamps will be extended to include mono and boot time by using the fast timekeeping accessors ktime_get_mono|boot_fast_ns(). The functions can return garbage before timekeeping is initialized resulting in garbage timestamps. Initialize the fast timekeepers with dummy clocks which guarantee a 0 readout up to timekeeping_init(). 
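The trick is plain C: point both latch slots of the fast timekeeper at a clocksource whose read() returns a zero-initialized variable. A stripped-down user-space sketch follows; the types are simplified stand-ins and the latch slot selection is elided, so this is not the kernel implementation:

#include <stdint.h>
#include <stdio.h>

struct clocksource { uint64_t (*read)(void); };

static uint64_t cycles_at_suspend;	/* 0 until suspend stores a snapshot */

static uint64_t dummy_clock_read(void)
{
	return cycles_at_suspend;
}

static struct clocksource dummy_clock = { .read = dummy_clock_read };

struct tk_read_base { struct clocksource *clock; uint64_t base; };
struct tk_fast { struct tk_read_base base[2]; };

/* Both latch slots start out on the dummy clock, so a fast reader that
 * runs before timekeeping_init() computes 0 + 0 instead of garbage. */
static struct tk_fast tk_fast_mono = {
	.base[0] = { .clock = &dummy_clock },
	.base[1] = { .clock = &dummy_clock },
};

static uint64_t fast_ns(struct tk_fast *tkf)
{
	struct tk_read_base *tkr = &tkf->base[0];	/* latch selection elided */

	return tkr->base + tkr->clock->read();
}

int main(void)
{
	printf("pre-init timestamp: %llu\n",
	       (unsigned long long)fast_ns(&tk_fast_mono));
	return 0;
}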
Suggested-by: Peter Zijlstra Signed-off-by: Prarit Bhargava Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: John Stultz Link: http://lkml.kernel.org/r/1503922914-10660-2-git-send-email-prarit@redhat.com --- kernel/time/timekeeping.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2cafb49aa65e..6a92794427c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -60,8 +60,27 @@ struct tk_fast { struct tk_read_base base[2]; }; -static struct tk_fast tk_fast_mono ____cacheline_aligned; -static struct tk_fast tk_fast_raw ____cacheline_aligned; +/* Suspend-time cycles value for halted fast timekeeper. */ +static u64 cycles_at_suspend; + +static u64 dummy_clock_read(struct clocksource *cs) +{ + return cycles_at_suspend; +} + +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; + +static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -477,18 +496,6 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); -/* Suspend-time cycles value for halted fast timekeeper. */ -static u64 cycles_at_suspend; - -static u64 dummy_clock_read(struct clocksource *cs) -{ - return cycles_at_suspend; -} - -static struct clocksource dummy_clock = { - .read = dummy_clock_read, -}; - /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. From 4c3711d7fb4763c63b2654f2d07cbe21ca5aadd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Aug 2017 17:12:48 +0200 Subject: [PATCH 0138/1085] timekeeping: Provide NMI safe access to clock realtime The configurable printk timestamping wants access to clock realtime. Right now there is no ktime_get_real_fast_ns() accessor because reading the monotonic base and the realtime offset cannot be done atomically. Contrary to boot time this offset can change during runtime and cause half updated readouts. struct tk_read_base was fully packed when the fast timekeeper access was implemented. commit ceea5e3771ed ("time: Fix clock->read(clock) race around clocksource changes") removed the 'read' function pointer from the structure, but of course left the comment stale. So now the structure can fit a new 64bit member w/o violating the cache line constraints. Add real_base to tk_read_base and update it in the fast timekeeper update sequence. Implement an accessor which follows the same scheme as the accessor to clock monotonic, but uses the new real_base to access clock real time. The runtime overhead for updating real_base is minimal as it just adds two cache hot values and stores them into an already dirtied cache line along with the other fast timekeeper updates. 
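The seqcount-latch read pattern behind this accessor is worth spelling out. The sketch below is a single-threaded user-space model, not the kernel code: it omits the memory barriers that raw_read_seqcount_latch() and the write side provide, and struct snap is a stand-in for struct tk_read_base.

#include <stdint.h>
#include <stdio.h>

struct snap { uint64_t base_real; uint64_t cycle_last; };

static unsigned int seq;	/* models tkf->seq */
static struct snap base[2];	/* models tkf->base[] */

/* Writer, models update_fast_timekeeper(): each slot is rewritten while
 * the sequence count steers readers to the other slot. */
static void update(struct snap s)
{
	seq++;		/* odd: readers pick base[1] */
	base[0] = s;
	seq++;		/* even: readers pick base[0] */
	base[1] = s;
}

/* Reader, models __ktime_get_real_fast_ns(): lock-free, which is why
 * the kernel version is NMI safe; it retries instead of blocking when
 * an update races with it. */
static uint64_t read_real(uint64_t now_cycles)
{
	const struct snap *t;
	unsigned int s;
	uint64_t ns;

	do {
		s = seq;
		t = &base[s & 1];
		ns = t->base_real + (now_cycles - t->cycle_last);
	} while (s != seq);

	return ns;
}

int main(void)
{
	update((struct snap){ .base_real = 1000, .cycle_last = 50 });
	printf("realtime ns: %llu\n",
	       (unsigned long long)read_real(60));	/* prints 1010 */
	return 0;
}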
Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: John Stultz Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/1505757060-2004-3-git-send-email-prarit@redhat.com --- include/linux/timekeeper_internal.h | 6 ++++- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 35 +++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 0a0a53daf2a2..98f645ee8409 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -13,19 +13,22 @@ /** * struct tk_read_base - base structure for timekeeping readout * @clock: Current clocksource used for timekeeping. - * @read: Read function of @clock * @mask: Bitmask for two's complement subtraction of non 64bit clocks * @cycle_last: @clock cycle value at last update * @mult: (NTP adjusted) multiplier for scaled math conversion * @shift: Shift value for scaled math conversion * @xtime_nsec: Shifted (fractional) nano seconds offset for readout * @base: ktime_t (nanoseconds) base time for readout + * @base_real: Nanoseconds base value for clock REALTIME readout * * This struct has size 56 byte on 64 bit. Together with a seqcount it * occupies a single 64byte cache line. * * The struct is separate from struct timekeeper as it is also used * for a fast NMI safe accessors. + * + * @base_real is for the fast NMI safe accessor to allow reading clock + * realtime from any context. */ struct tk_read_base { struct clocksource *clock; @@ -35,6 +38,7 @@ struct tk_read_base { u32 shift; u64 xtime_nsec; ktime_t base; + u64 base_real; }; /** diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ddc229ff6d1e..eb98cbdbb323 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -239,6 +239,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); +extern u64 ktime_get_real_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a92794427c9..8af77006e937 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -496,6 +496,39 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + +/* + * See comment for __ktime_get_fast_ns() vs. timestamp ordering + */ +static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base_real); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; +} + +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + */ +u64 ktime_get_real_fast_ns(void) +{ + return __ktime_get_real_fast_ns(&tk_fast_mono); +} + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. 
@@ -514,6 +547,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; + tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; @@ -661,6 +695,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); From 98bfa34462fb6af70b845d28561072f80bacdb9b Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sat, 16 Sep 2017 13:48:37 +0200 Subject: [PATCH 0139/1085] Documentation: fix little inconsistencies Fix little inconsistencies in Documentation: make case and spacing match surrounding text. Signed-off-by: Pavel Machek Reviewed-by: Darrick J. Wong Signed-off-by: Jonathan Corbet --- Documentation/filesystems/ext4.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 5a8f7f4d2bca..75236c0c2ac2 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -94,10 +94,10 @@ Note: More extensive information for getting started with ext4 can be * ability to pack bitmaps and inode tables into larger virtual groups via the flex_bg feature * large file support -* Inode allocation using large virtual block groups via flex_bg +* inode allocation using large virtual block groups via flex_bg * delayed allocation * large block (up to pagesize) support -* efficient new ordered mode in JBD2 and ext4(avoid using buffer head to force +* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force the ordering) [1] Filesystems with a block size of 1k may see a limit imposed by the @@ -105,7 +105,7 @@ directory hash tree having a maximum depth of two. 2.2 Candidate features for future inclusion -* Online defrag (patches available but not well tested) +* online defrag (patches available but not well tested) * reduced mke2fs time via lazy itable initialization in conjunction with the uninit_bg feature (capability to do this is available in e2fsprogs but a kernel thread to do lazy zeroing of unused inode table blocks @@ -602,7 +602,7 @@ Table of Ext4 specific ioctls bitmaps and inode table, the userspace tool thus just passes the new number of blocks. -EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes + EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes (like i_blocks, i_size, i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO (#5). This is typically From 416c7517359b8812c8bcddc31348934b901c3cdc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 17 Sep 2017 15:19:10 -0700 Subject: [PATCH 0140/1085] Documentation: kernel-api: drop "Data Types" section In the kernel-api chapter, the section for Data Types only contains "Doubly Linked Lists" and all of the function interfaces for list management. There are no other data types in this section, so collapse this section into "List Management Functions". 
Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 8282099e0cbf..e557509574a0 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -2,11 +2,9 @@ The Linux Kernel API ==================== -Data Types -========== -Doubly Linked Lists -------------------- +List Management Functions +========================= .. kernel-doc:: include/linux/list.h :internal: From 404376af788a76cca760efdc05f26fd73bd94b17 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 17 Sep 2017 19:07:10 -0700 Subject: [PATCH 0141/1085] Documentation: kernel-api: add bitmap operations from linux/bitmap.h Add to kernel-api Bitmap Operations section. Fix kernel-doc nitpicks in . Signed-off-by: Randy Dunlap Acked-by: Yury Norov Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 3 +++ include/linux/bitmap.h | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index e557509574a0..f37c73faa5b7 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -59,6 +59,9 @@ Bitmap Operations .. kernel-doc:: lib/bitmap.c :internal: +.. kernel-doc:: include/linux/bitmap.h + :internal: + Command-line Parsing -------------------- diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 700cf5f67118..5c4178016b1e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -360,8 +360,9 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); } -/* +/** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. + * @n: u64 value * * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit * integers in 32-bit environment, and 64-bit integers in 64-bit one. @@ -392,14 +393,14 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, ((unsigned long) ((u64)(n) >> 32)) #endif -/* +/** * bitmap_from_u64 - Check and swap words within u64. * @mask: source bitmap * @dst: destination bitmap * - * In 32-bit Big Endian kernel, when using (u32 *)(&val)[*] + * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]`` * to read u64 mask, we will get the wrong word. - * That is "(u32 *)(&val)[0]" gets the upper 32 bits, + * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) From ac0a314caed1827f4ef5d7145eb609147b48b45d Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 18 Sep 2017 22:21:25 -0700 Subject: [PATCH 0142/1085] console: Update to reflect new default value The default value was changed from 10 minutes to disabled in commit a4199f5eb8096d6. Signed-off-by: Daniel Xu Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05496622b4ef..fcaa8558ed7e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -641,8 +641,8 @@ For now, only VisioBraille is supported. consoleblank= [KNL] The console blank (screen saver) timeout in - seconds. 
Defaults to 10*60 = 10mins. A value of 0 - disables the blank timer. + seconds. A value of 0 disables the blank timer. + Defaults to 0. coredump_filter= [KNL] Change the default value for From d19b3e32375bd21b5d89cc484dfc56cbd9b7fee4 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 25 Sep 2017 18:36:19 +0900 Subject: [PATCH 0143/1085] Documentation/process: fix the canonical patch format description There shouldn't be a blank line at the beginning, if there is no optional in-body "From" line. There must be a blank line between the body of the explanation and the beginning of the S-o-b lines. Signed-off-by: Junio C Hamano Signed-off-by: Jonathan Corbet --- Documentation/process/submitting-patches.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index 733478ade91b..1ef19d3a3eee 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -621,14 +621,14 @@ The canonical patch subject line is:: The canonical patch message body contains the following: - - A ``from`` line specifying the patch author (only needed if the person - sending the patch is not the author). - - - An empty line. + - A ``from`` line specifying the patch author, followed by an empty + line (only needed if the person sending the patch is not the author). - The body of the explanation, line wrapped at 75 columns, which will be copied to the permanent changelog to describe this patch. + - An empty line. + - The ``Signed-off-by:`` lines, described above, which will also go in the changelog. From d4306db189d04886e07751f682495aeae61a4c4f Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 25 Sep 2017 18:36:20 +0900 Subject: [PATCH 0144/1085] Documentation/process: phrasofix Devils in the details are found only when the high level design is refined and gets more detailed, and the appropriate phrase to use to describe this is "problems are revealed", not "problems are reviewed". Reviews may reveal these problems, though ;-) Signed-off-by: Junio C Hamano Signed-off-by: Jonathan Corbet --- Documentation/process/3.Early-stage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/3.Early-stage.rst b/Documentation/process/3.Early-stage.rst index af2c0af931d6..be00716071d4 100644 --- a/Documentation/process/3.Early-stage.rst +++ b/Documentation/process/3.Early-stage.rst @@ -178,7 +178,7 @@ matter is (1) kernel developers tend to be busy, (2) there is no shortage of people with grand plans and little code (or even prospect of code) to back them up, and (3) nobody is obligated to review or comment on ideas posted by others. Beyond that, high-level designs often hide problems -which are only reviewed when somebody actually tries to implement those +which are only revealed when somebody actually tries to implement those designs; for that reason, kernel developers would rather see the code. If a request-for-comments posting yields little in the way of comments, do From 44e9f099367716dde33618b7d5d25c3123437ba6 Mon Sep 17 00:00:00 2001 From: stephen lu Date: Tue, 29 Aug 2017 10:42:24 +0800 Subject: [PATCH 0145/1085] docs: highres: fix broken urls Some urls is invalid. I find alternative urls. 
Signed-off-by: stephen lu Signed-off-by: Jonathan Corbet --- Documentation/timers/highres.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/timers/highres.txt b/Documentation/timers/highres.txt index e8789976e77c..9d88f67781c2 100644 --- a/Documentation/timers/highres.txt +++ b/Documentation/timers/highres.txt @@ -4,10 +4,10 @@ High resolution timers and dynamic ticks design notes Further information can be found in the paper of the OLS 2006 talk "hrtimers and beyond". The paper is part of the OLS 2006 Proceedings Volume 1, which can be found on the OLS website: -http://www.linuxsymposium.org/2006/linuxsymposium_procv1.pdf +https://www.kernel.org/doc/ols/2006/ols2006v1-pages-333-346.pdf The slides to this talk are available from: -http://tglx.de/projects/hrtimers/ols2006-hrtimers.pdf +http://www.cs.columbia.edu/~nahum/w6998/papers/ols2006-hrtimers-slides.pdf The slides contain five figures (pages 2, 15, 18, 20, 22), which illustrate the changes in the time(r) related Linux subsystems. Figure #1 (p. 2) shows the From 8a29896a6e31c7aa2ca3b50d8aefe05f280b0b7e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Sep 2017 16:35:55 -0700 Subject: [PATCH 0146/1085] docs: clean up and add rest of CRC functions to kernel-api.rst Add the rest of the CRC library functions to kernel-api. - try to clarify crc32() by adding '@' to a function parameter - reorder kernel-api CRC functions to be less random - add more CRC functions to kernel-api - correct the function parameter names in several places Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 12 +++++++++--- lib/crc32.c | 2 +- lib/crc4.c | 2 +- lib/crc8.c | 22 +++++++++++----------- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index f37c73faa5b7..657a89a219b2 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -71,20 +71,26 @@ Command-line Parsing CRC Functions ------------- +.. kernel-doc:: lib/crc4.c + :export: + .. kernel-doc:: lib/crc7.c :export: +.. kernel-doc:: lib/crc8.c + :export: + .. kernel-doc:: lib/crc16.c :export: -.. kernel-doc:: lib/crc-itu-t.c - :export: - .. kernel-doc:: lib/crc32.c .. kernel-doc:: lib/crc-ccitt.c :export: +.. kernel-doc:: lib/crc-itu-t.c + :export: + idr/ida Functions ----------------- diff --git a/lib/crc32.c b/lib/crc32.c index 6ddc92bc1460..2ef20fe84b69 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -225,7 +225,7 @@ static u32 __attribute_const__ gf2_multiply(u32 x, u32 y, u32 modulus) } /** - * crc32_generic_shift - Append len 0 bytes to crc, in logarithmic time + * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient) * @len: The number of bytes. @crc is multiplied by x^(8*@len) * @polynomial: The modulus used to reduce the result to 32 bits. diff --git a/lib/crc4.c b/lib/crc4.c index cf6db46661be..164ed9444cd3 100644 --- a/lib/crc4.c +++ b/lib/crc4.c @@ -15,7 +15,7 @@ static const uint8_t crc4_tab[] = { /** * crc4 - calculate the 4-bit crc of a value. 
- * @crc: starting crc4 + * @c: starting crc4 * @x: value to checksum * @bits: number of bits in @x to checksum * diff --git a/lib/crc8.c b/lib/crc8.c index 87b59cafdb83..595a5a75e3cd 100644 --- a/lib/crc8.c +++ b/lib/crc8.c @@ -20,11 +20,11 @@ #include #include -/* +/** * crc8_populate_msb - fill crc table for given polynomial in reverse bit order. * - * table: table to be filled. - * polynomial: polynomial for which table is to be filled. + * @table: table to be filled. + * @polynomial: polynomial for which table is to be filled. */ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial) { @@ -42,11 +42,11 @@ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial) } EXPORT_SYMBOL(crc8_populate_msb); -/* +/** * crc8_populate_lsb - fill crc table for given polynomial in regular bit order. * - * table: table to be filled. - * polynomial: polynomial for which table is to be filled. + * @table: table to be filled. + * @polynomial: polynomial for which table is to be filled. */ void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial) { @@ -63,13 +63,13 @@ void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial) } EXPORT_SYMBOL(crc8_populate_lsb); -/* +/** * crc8 - calculate a crc8 over the given input data. * - * table: crc table used for calculation. - * pdata: pointer to data buffer. - * nbytes: number of bytes in data buffer. - * crc: previous returned crc8 value. + * @table: crc table used for calculation. + * @pdata: pointer to data buffer. + * @nbytes: number of bytes in data buffer. + * @crc: previous returned crc8 value. */ u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc) { From 5cb5c31cdf246099f7d48a57f448b05b7941cd6a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Sep 2017 13:08:13 +0200 Subject: [PATCH 0147/1085] scripts/kernel-doc: warn on excess enum value descriptions The existing message "Excess struct/union/enum/typedef member [...]" made it sound like this would already be done, but the code is never invoked for enums or typedefs (and really can't be). Add some code to the enum dumper to handle this there instead. While at it, also make the above message more accurate by simply dumping the type that was passed in, and pass the struct/union differentiation in. Signed-off-by: Johannes Berg Signed-off-by: Jonathan Corbet --- scripts/kernel-doc | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 9d3eafea58f0..67d051edd615 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -2168,7 +2168,7 @@ sub dump_struct($$) { my $nested; if ($x =~ /(struct|union)\s+(\w+)\s*{(.*)}/) { - #my $decl_type = $1; + my $decl_type = $1; $declaration_name = $2; my $members = $3; @@ -2194,7 +2194,7 @@ sub dump_struct($$) { $members =~ s/DECLARE_HASHTABLE\s*\(([^,)]+), ([^,)]+)\)/unsigned long $1\[1 << (($2) - 1)\]/gos; create_parameterlist($members, ';', $file); - check_sections($file, $declaration_name, "struct", $sectcheck, $struct_actual, $nested); + check_sections($file, $declaration_name, $decl_type, $sectcheck, $struct_actual, $nested); output_declaration($declaration_name, 'struct', @@ -2226,6 +2226,8 @@ sub dump_enum($$) { if ($x =~ /enum\s+(\w+)\s*{(.*)}/) { $declaration_name = $1; my $members = $2; + my %_members; + $members =~ s/\s+$//; foreach my $arg (split ',', $members) { @@ -2236,9 +2238,16 @@ sub dump_enum($$) { print STDERR "${file}:$.: warning: Enum value '$arg' ". 
"not described in enum '$declaration_name'\n"; } - + $_members{$arg} = 1; } + while (my ($k, $v) = each %parameterdescs) { + if (!exists($_members{$k})) { + print STDERR "${file}:$.: warning: Excess enum value " . + "'$k' description in '$declaration_name'\n"; + } + } + output_declaration($declaration_name, 'enum', {'enum' => $declaration_name, @@ -2506,7 +2515,7 @@ sub check_sections($$$$$$) { } else { if ($nested !~ m/\Q$sects[$sx]\E/) { print STDERR "${file}:$.: warning: " . - "Excess struct/union/enum/typedef member " . + "Excess $decl_type member " . "'$sects[$sx]' " . "description in '$decl_name'\n"; ++$warnings; From 9c71206d060d4e84896f3bd680319b29fe88b8e8 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 13 Sep 2017 17:17:54 +0800 Subject: [PATCH 0148/1085] ACPI/init: Invoke early ACPI initialization earlier acpi_early_init() unmaps the temporary ACPI Table mappings which are used in the early startup code and prepares for permanent table mappings. Before the consolidation of the x86 APIC setup code the invocation of acpi_early_init() happened before the interrupt remapping unit was initialized. With the rework the remapping unit initialization moved in front of acpi_early_init() which causes an ACPI warning when the ACPI root tables get reallocated afterwards. Invoke acpi_early_init() before late_time_init() which is before the access to the DMAR tables happens. Fixes: 935356cecda8 ("x86/apic: Initialize interrupt mode after timer init") Reported-by: Xiaolong Ye Signed-off-by: Dou Liyang Cc: Tony Luck Cc: linux-ia64@vger.kernel.org Cc: bhe@redhat.com Cc: Fenghua Yu Cc: Michael Ellerman Cc: "Rafael J. Wysocki" Cc: Will Deacon Cc: linux-acpi@vger.kernel.org Cc: bp@alien8.de Cc: Lv" Cc: yinghai@kernel.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r/1505294274-441-1-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 0ee9c6866ada..2fb98a48b6ae 100644 --- a/init/main.c +++ b/init/main.c @@ -664,12 +664,12 @@ asmlinkage __visible void __init start_kernel(void) debug_objects_mem_init(); setup_per_cpu_pageset(); numa_policy_init(); + acpi_early_init(); if (late_time_init) late_time_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); - acpi_early_init(); #ifdef CONFIG_X86 if (efi_enabled(EFI_RUNTIME_SERVICES)) efi_enter_virtual_mode(); From 1e66e2b86293ff1ded32104ac0ad26a7f08ec439 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Sep 2017 19:08:45 +0200 Subject: [PATCH 0149/1085] x86/apic: Use dead_cpu instead of current CPU when cleaning up x2apic_dead_cpu() cleans up the leftovers of a CPU which got unplugged, but instead of clearing the dead cpu bit in the cluster mask it clears the current (alive) cpu bit. Noticed because smp_processor_id() is called in preemptible code and triggers a debug warning. 
[ tglx: Rewrote changelog ] Fixes: 023a611748fd ("x86/apic/x2apic: Simplify cluster management") Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170926170845.13955-1-bp@alien8.de --- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 3da94277140f..6050c5364bdc 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -156,7 +156,7 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) { struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); - cpumask_clear_cpu(smp_processor_id(), &cmsk->mask); + cpumask_clear_cpu(dead_cpu, &cmsk->mask); free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } From 9b3a7fd0f5fb583a8fdda678e8a87dff1717f7f3 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 25 Sep 2017 16:39:33 -0700 Subject: [PATCH 0150/1085] x86/intel_rdt: Add framework for better RDT UI diagnostics Commands are given to the resctrl file system by making/removing directories, or by writing to files. When something goes wrong the user is generally left wondering why they got: bash: echo: write error: Invalid argument Add a new file "last_cmd_status" to the "info" directory that will give the user some better clues on what went wrong. Provide functions to clear and update last_cmd_status which check that we hold the rdtgroup_mutex. [ tglx: Made last_cmd_status static and folded back the hunk from patch 3 which replaces the open coded access to last_cmd_status with the accessor function ] Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Steven Rostedt Cc: Vikas Shivappa Cc: Boris Petkov Cc: Reinette Chatre Link: https://lkml.kernel.org/r/edc4e0e9741eee89bba569f0021b1b2662fd9508.1506382469.git.tony.luck@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 7 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 56 ++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index ebaddaeef023..9d3148d88ec8 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -126,12 +126,15 @@ struct rdtgroup { #define RFTYPE_BASE BIT(1) #define RF_CTRLSHIFT 4 #define RF_MONSHIFT 5 +#define RF_TOPSHIFT 6 #define RFTYPE_CTRL BIT(RF_CTRLSHIFT) #define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_TOP BIT(RF_TOPSHIFT) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) #define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) #define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) /* List of all resource groups */ @@ -408,6 +411,10 @@ union cpuid_0x10_x_edx { unsigned int full; }; +void rdt_last_cmd_clear(void); +void rdt_last_cmd_puts(const char *s); +void rdt_last_cmd_printf(const char *fmt, ...); + void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index a869d4a073c5..68103513130b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,31 @@ static struct kernfs_node *kn_mongrp; /* Kernel fs node for "mon_data" directory under root */ static struct 
kernfs_node *kn_mondata; +static struct seq_buf last_cmd_status; +static char last_cmd_status_buf[512]; + +void rdt_last_cmd_clear(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_clear(&last_cmd_status); +} + +void rdt_last_cmd_puts(const char *s) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_puts(&last_cmd_status, s); +} + +void rdt_last_cmd_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); +} + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -569,6 +595,21 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + int len; + + mutex_lock(&rdtgroup_mutex); + len = seq_buf_used(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else + seq_puts(seq, "ok\n"); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -685,6 +726,13 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { + { + .name = "last_cmd_status", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_last_cmd_status_show, + .fflags = RF_TOP_INFO, + }, { .name = "num_closids", .mode = 0444, @@ -855,6 +903,10 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); + ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + if (ret) + goto out_destroy; + for_each_alloc_enabled_rdt_resource(r) { fflags = r->fflags | RF_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); @@ -1156,6 +1208,7 @@ out_info: out_cdp: cdp_disable(); out: + rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); return dentry; @@ -1902,6 +1955,9 @@ int __init rdtgroup_init(void) { int ret = 0; + seq_buf_init(&last_cmd_status, last_cmd_status_buf, + sizeof(last_cmd_status_buf)); + ret = rdtgroup_setup_root(); if (ret) return ret; From c377dcfbee808efdb66cf1bb6b9f06fa26b2ad0a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 25 Sep 2017 16:39:34 -0700 Subject: [PATCH 0151/1085] x86/intel_rdt: Add diagnostics when writing the schemata file Save helpful descriptions of what went wrong when writing a schemata file. Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Steven Rostedt Cc: Vikas Shivappa Cc: Boris Petkov Cc: Reinette Chatre Link: https://lkml.kernel.org/r/9d6cef757dc88639c8ab47f1e7bc1b081a84bb88.1506382469.git.tony.luck@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 49 ++++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index f6ea94f8954a..f29b4c21e7d4 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -42,15 +42,22 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. 
*/ - if (!r->membw.delay_linear) + if (!r->membw.delay_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; + } ret = kstrtoul(buf, 10, &bw); - if (ret) + if (ret) { + rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); return false; + } - if (bw < r->membw.min_bw || bw > r->default_ctrl) + if (bw < r->membw.min_bw || bw > r->default_ctrl) { + rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, + r->membw.min_bw, r->default_ctrl); return false; + } *data = roundup(bw, (unsigned long)r->membw.bw_gran); return true; @@ -60,8 +67,10 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - if (d->have_new_ctrl) + if (d->have_new_ctrl) { + rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; + } if (!bw_validate(buf, &data, r)) return -EINVAL; @@ -84,20 +93,29 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) int ret; ret = kstrtoul(buf, 16, &val); - if (ret) + if (ret) { + rdt_last_cmd_printf("non-hex character in mask %s\n", buf); return false; + } - if (val == 0 || val > r->default_ctrl) + if (val == 0 || val > r->default_ctrl) { + rdt_last_cmd_puts("mask out of range\n"); return false; + } first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + rdt_last_cmd_printf("mask %lx has non-consecutive 1-bits\n", val); return false; + } - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("Need at least %d bits in mask\n", + r->cache.min_cbm_bits); return false; + } *data = val; return true; @@ -111,8 +129,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - if (d->have_new_ctrl) + if (d->have_new_ctrl) { + rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; + } if(!cbm_validate(buf, &data, r)) return -EINVAL; @@ -139,8 +159,10 @@ next: return 0; dom = strsep(&line, ";"); id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); return -EINVAL; + } dom = strim(dom); list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { @@ -196,6 +218,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } + rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname); return -EINVAL; } @@ -209,8 +232,10 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, int closid, ret = 0; /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') + if (nbytes == 0 || buf[nbytes - 1] != '\n') { + seq_buf_puts(&last_cmd_status, "no trailing newline\n"); return -EINVAL; + } buf[nbytes - 1] = '\0'; rdtgrp = rdtgroup_kn_lock_live(of->kn); @@ -218,6 +243,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return -ENOENT; } + rdt_last_cmd_clear(); closid = rdtgrp->closid; @@ -229,6 +255,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, while ((tok = strsep(&buf, "\n")) != NULL) { resname = strim(strsep(&tok, ":")); if (!tok) { + rdt_last_cmd_puts("Missing ':'\n"); ret = -EINVAL; goto out; } From 29e74f35b2fed0ca3e8b31db157e1d183e9d0819 Mon Sep 17 00:00:00 
2001 From: Tony Luck Date: Mon, 25 Sep 2017 16:39:35 -0700 Subject: [PATCH 0152/1085] x86/intel_rdt: Add diagnostics when writing the tasks file About the only tricky case is trying to move a task into a monitor group that is a subdirectory of a different control group. But cover the simple cases too. Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Steven Rostedt Cc: Vikas Shivappa Cc: Boris Petkov Cc: Reinette Chatre Link: https://lkml.kernel.org/r/f1841cce6a242aed37cb926dee8942727331bf78.1506382469.git.tony.luck@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 4 +--- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 13 ++++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index f29b4c21e7d4..30aeb267cbd2 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -232,10 +232,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, int closid, ret = 0; /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') { - seq_buf_puts(&last_cmd_status, "no trailing newline\n"); + if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; - } buf[nbytes - 1] = '\0'; rdtgrp = rdtgroup_kn_lock_live(of->kn); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 68103513130b..d39092eb63bb 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -478,6 +478,7 @@ static int __rdtgroup_move_task(struct task_struct *tsk, */ atomic_dec(&rdtgrp->waitcount); kfree(callback); + rdt_last_cmd_puts("task exited\n"); } else { /* * For ctrl_mon groups move both closid and rmid. @@ -488,10 +489,12 @@ static int __rdtgroup_move_task(struct task_struct *tsk, tsk->closid = rdtgrp->closid; tsk->rmid = rdtgrp->mon.rmid; } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) + if (rdtgrp->mon.parent->closid == tsk->closid) { tsk->rmid = rdtgrp->mon.rmid; - else + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); ret = -EINVAL; + } } } return ret; @@ -510,8 +513,10 @@ static int rdtgroup_task_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid)) { + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); ret = -EPERM; + } put_cred(tcred); return ret; @@ -528,6 +533,7 @@ static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); + rdt_last_cmd_printf("No task %d\n", pid); return -ESRCH; } } else { @@ -555,6 +561,7 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); + rdt_last_cmd_clear(); if (rdtgrp) ret = rdtgroup_move_task(pid, rdtgrp, of); From 94457b36e8a5026443707b48dcf54b204e098fd7 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 25 Sep 2017 16:39:36 -0700 Subject: [PATCH 0153/1085] x86/intel_rdt: Add diagnostics when writing the cpus file Can't add a cpu to a monitor group unless it belongs to parent group. Can't delete cpus from the default group. 
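All of these diagnostics follow one convention: clear the status buffer when a file-system command starts, then record a human-readable reason next to each error return while the terse errno still goes back to the caller. A compressed, illustrative sketch (example_check() is a made-up helper, not from the series; the message string matches one added below):

	static int example_check(int cpu)
	{
		lockdep_assert_held(&rdtgroup_mutex);

		rdt_last_cmd_clear();		/* each command starts clean */
		if (!cpu_online(cpu)) {
			rdt_last_cmd_puts("can only assign online cpus\n");
			return -EINVAL;		/* errno alone stays cryptic */
		}
		return 0;
	}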
Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Steven Rostedt Cc: Vikas Shivappa Cc: Boris Petkov Cc: Reinette Chatre Link: https://lkml.kernel.org/r/757a869a25e9fc1b7a2e9bc43e1159455c1964a0.1506382469.git.tony.luck@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index d39092eb63bb..6e0ee7ca1490 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -264,8 +264,10 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) + if (cpumask_weight(tmpmask)) { + rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; + } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); @@ -317,8 +319,10 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); if (cpumask_weight(tmpmask)) { /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); return -EINVAL; + } /* Give any dropped cpus to rdtgroup_default */ cpumask_or(&rdtgroup_default.cpu_mask, @@ -383,8 +387,10 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); + rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; + rdt_last_cmd_puts("directory was removed\n"); goto unlock; } @@ -393,13 +399,16 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, else ret = cpumask_parse(buf, newmask); - if (ret) + if (ret) { + rdt_last_cmd_puts("bad cpu list/mask\n"); goto unlock; + } /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); if (cpumask_weight(tmpmask)) { ret = -EINVAL; + rdt_last_cmd_puts("can only assign online cpus\n"); goto unlock; } From cfd0f34e4cd5f1a5ad7000a3104c37886a70bca9 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 25 Sep 2017 16:39:37 -0700 Subject: [PATCH 0154/1085] x86/intel_rdt: Add diagnostics when making directories Mostly this is about running out of RMIDs or CLOSIDs. Other errors are various internal errors. 
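Reading the result back is just a file read. A small user-space sketch of how a tool might fetch the explanation after a failed command (hypothetical helper; the path follows the documentation patch later in this series):

	#include <stdio.h>

	static void print_last_cmd_status(void)
	{
		char line[512];
		FILE *f = fopen("/sys/fs/resctrl/info/last_cmd_status", "r");

		if (!f)
			return;
		while (fgets(line, sizeof(line), f))
			fputs(line, stderr);	/* e.g. "out of CLOSIDs" */
		fclose(f);
	}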
Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Steven Rostedt Cc: Vikas Shivappa Cc: Boris Petkov Cc: Reinette Chatre Link: https://lkml.kernel.org/r/027cf1ffb3a3695f2d54525813a1d644887353cf.1506382469.git.tony.luck@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 6e0ee7ca1490..abd220bf6cd7 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1593,8 +1593,10 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, int ret; prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + rdt_last_cmd_clear(); if (!prdtgrp) { ret = -ENODEV; + rdt_last_cmd_puts("directory was removed\n"); goto out_unlock; } @@ -1602,6 +1604,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; + rdt_last_cmd_puts("kernel out of memory\n"); goto out_unlock; } *r = rdtgrp; @@ -1613,6 +1616,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); + rdt_last_cmd_puts("kernfs create error\n"); goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1626,24 +1630,32 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, kernfs_get(kn); ret = rdtgroup_kn_set_ugid(kn); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs perm error\n"); goto out_destroy; + } files = RFTYPE_BASE | RFTYPE_CTRL; files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); ret = rdtgroup_add_files(kn, files); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs fill error\n"); goto out_destroy; + } if (rdt_mon_capable) { ret = alloc_rmid(); - if (ret < 0) + if (ret < 0) { + rdt_last_cmd_puts("out of RMIDs\n"); goto out_destroy; + } rdtgrp->mon.rmid = ret; ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); goto out_idfree; + } } kernfs_activate(kn); @@ -1721,8 +1733,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, kn = rdtgrp->kn; ret = closid_alloc(); - if (ret < 0) + if (ret < 0) { + rdt_last_cmd_puts("out of CLOSIDs\n"); goto out_common_fail; + } closid = ret; rdtgrp->closid = closid; @@ -1734,8 +1748,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, * of tasks and cpus to monitor. 
*/ ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); goto out_id_free; + } } goto out_unlock; From 165d3ad884df4b30c3564a478b457b499345886f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 25 Sep 2017 16:39:38 -0700 Subject: [PATCH 0155/1085] x86/intel_rdt: Add documentation for "info/last_cmd_status" A new file in the "info" directory helps diagnose what went wrong when using the /sys/fs/resctrl file system. Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Steven Rostedt Cc: Vikas Shivappa Cc: Boris Petkov Cc: Reinette Chatre Link: https://lkml.kernel.org/r/387e78e444582403c2454479e576caf5721a363f.1506382469.git.tony.luck@intel.com --- Documentation/x86/intel_rdt_ui.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index 4d8848e4e224..6851854cf69d 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt @@ -87,6 +87,17 @@ with the following files: bytes) at which a previously used LLC_occupancy counter can be considered for re-use. +Finally, in the top level of the "info" directory there is a file +named "last_cmd_status". This is reset with every "command" issued +via the file system (making new directories or writing to any of the +control files). If the command was successful, it will read as "ok". +If the command failed, it will provide more information than can be +conveyed in the error returns from file operations. E.g. + + # echo L3:0=f7 > schemata + bash: echo: write error: Invalid argument + # cat info/last_cmd_status + mask f7 has non-consecutive 1-bits Resource alloc and monitor groups --------------------------------- From 15cc3ae001873845b5d842e212478a6570c7d938 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Wed, 13 Sep 2017 18:42:14 +0800 Subject: [PATCH 0156/1085] EDAC, sb_edac: Don't create a second memory controller if HA1 is not present Yi Zhang reported the following failure on a 2-socket Haswell (E5-2603v3) server (DELL PowerEdge 730xd): EDAC sbridge: Some needed devices are missing EDAC MC: Removed device 0 for sb_edac.c Haswell SrcID#0_Ha#0: DEV 0000:7f:12.0 EDAC MC: Removed device 1 for sb_edac.c Haswell SrcID#1_Ha#0: DEV 0000:ff:12.0 EDAC sbridge: Couldn't find mci handler EDAC sbridge: Couldn't find mci handler EDAC sbridge: Failed to register device with error -19. The refactored sb_edac driver creates the IMC1 (the 2nd memory controller) if any IMC1 device is present. In this case only HA1_TA of IMC1 was present, but the driver expected to find HA1/HA1_TM/HA1_TAD[0-3] devices too, leading to the above failure. The document [1] says the 'E5-2603 v3' CPU has 4 memory channels max. Yi Zhang inserted one DIMM per channel for each CPU, and ran a random error address injection test with this patch: 4024 addresses fell in TOLM hole area 12715 addresses fell in CPU_SrcID#0_Ha#0_Chan#0_DIMM#0 12774 addresses fell in CPU_SrcID#0_Ha#0_Chan#1_DIMM#0 12798 addresses fell in CPU_SrcID#0_Ha#0_Chan#2_DIMM#0 12913 addresses fell in CPU_SrcID#0_Ha#0_Chan#3_DIMM#0 12674 addresses fell in CPU_SrcID#1_Ha#0_Chan#0_DIMM#0 12686 addresses fell in CPU_SrcID#1_Ha#0_Chan#1_DIMM#0 12882 addresses fell in CPU_SrcID#1_Ha#0_Chan#2_DIMM#0 12934 addresses fell in CPU_SrcID#1_Ha#0_Chan#3_DIMM#0 106400 addresses were injected in total. The test result shows that all 4 channels belong to IMC0 per CPU, so the server really only has one IMC per CPU.
In the 1st page of chapter 2 in datasheet [2], it also says 'E5-2600 v3' implements either one or two IMCs. For CPUs with one IMC, IMC1 is not used and should be ignored. Thus, do not create a second memory controller if the key HA1 is absent. [1] http://ark.intel.com/products/83349/Intel-Xeon-Processor-E5-2603-v3-15M-Cache-1_60-GHz [2] https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf Reported-and-tested-by: Yi Zhang Signed-off-by: Qiuxu Zhuo Cc: Tony Luck Cc: linux-edac Fixes: e2f747b1f42a ("EDAC, sb_edac: Assign EDAC memory controller per h/w controller") Link: http://lkml.kernel.org/r/20170913104214.7325-1-qiuxu.zhuo@intel.com [ Massage commit message. ] Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 3ea90fc5978b..54d9d179cb77 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -462,6 +462,7 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = { static const struct pci_id_descr pci_dev_descr_ibridge[] = { /* Processor Home Agent */ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0, 0, IMC0) }, + { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) }, /* Memory controller */ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA, 0, IMC0) }, @@ -472,7 +473,6 @@ static const struct pci_id_descr pci_dev_descr_ibridge[] = { { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3, 0, IMC0) }, /* Optional, mode 2HA */ - { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) }, { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA, 1, IMC1) }, { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS, 1, IMC1) }, { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1, IMC1) }, @@ -2291,6 +2291,13 @@ static int sbridge_get_onedevice(struct pci_dev **prev, next_imc: sbridge_dev = get_sbridge_dev(bus, dev_descr->dom, multi_bus, sbridge_dev); if (!sbridge_dev) { + /* If the HA1 wasn't found, don't create EDAC second memory controller */ + if (dev_descr->dom == IMC1 && devno != 1) { + edac_dbg(0, "Skip IMC1: %04x:%04x (since HA1 was absent)\n", + PCI_VENDOR_ID_INTEL, dev_descr->dev_id); + pci_dev_put(pdev); + return 0; + } if (dev_descr->dom == SOCK) goto out_imc; From a9c0a10888614603cc38b6f60250e92932f24a37 Mon Sep 17 00:00:00 2001 From: Charles Rose Date: Mon, 25 Sep 2017 10:10:43 -0700 Subject: [PATCH 0157/1085] EDAC, skx_edac: Fix detection of single-rank DIMMs Single-rank DIMMs did not get detected on Skylake/Kabylake systems due to wrong limit check. The single rank DIMM check is a simple typo. "0" is a legal value in this field meaning single rank. Signed-off-by: Charles Rose Cc: Aristeu Rozanski Cc: Mauro Carvalho Chehab Cc: Qiuxu Zhuo Cc: linux-edac Link: http://lkml.kernel.org/r/66df72d327c265fbf92fe25df96daa228a35f076.1506358467.git.tony.luck@intel.com [ Also fix debug message to print number of ranks. ] Signed-off-by: Tony Luck [ Expand commit message. 
] Signed-off-by: Borislav Petkov --- drivers/edac/skx_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index e8e570e73796..9ca40a3fe03f 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -301,7 +301,7 @@ static int get_dimm_attr(u32 reg, int lobit, int hibit, int add, int minval, #define IS_DIMM_PRESENT(mtr) GET_BITFIELD((mtr), 15, 15) -#define numrank(reg) get_dimm_attr((reg), 12, 13, 0, 1, 2, "ranks") +#define numrank(reg) get_dimm_attr((reg), 12, 13, 0, 0, 2, "ranks") #define numrow(reg) get_dimm_attr((reg), 2, 4, 12, 1, 6, "rows") #define numcol(reg) get_dimm_attr((reg), 0, 1, 10, 0, 2, "cols") @@ -362,7 +362,7 @@ static int get_dimm_info(u32 mtr, u32 amap, struct dimm_info *dimm, edac_dbg(0, "mc#%d: channel %d, dimm %d, %lld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n", imc->mc, chan, dimmno, size, npages, - banks, ranks, rows, cols); + banks, 1 << ranks, rows, cols); imc->chan[chan].dimms[dimmno].close_pg = GET_BITFIELD(mtr, 0, 0); imc->chan[chan].dimms[dimmno].bank_xor_enable = GET_BITFIELD(mtr, 9, 9); From f821fe8cc7e51b238585e71b688e7093bfec355c Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 25 Sep 2017 14:34:56 +0200 Subject: [PATCH 0158/1085] EDAC, thunderx: Remove suspend/resume support The memory controller on ThunderX/OcteonTX systems does not support power management. Therefore remove the suspend/resume callbacks. Signed-off-by: Jan Glauber Cc: David Daney Cc: Jan Glauber Cc: Mark Rutland Cc: Mauro Carvalho Chehab Cc: Ralf Baechle Cc: Suzuki K Poulose Cc: Will Deacon Cc: Zhangshaokun Cc: linux-arm-kernel@lists.infradead.org Cc: linux-edac Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/20170925123502.17289-2-jglauber@cavium.com Signed-off-by: Borislav Petkov --- drivers/edac/thunderx_edac.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c index f35d87519a3e..4803c6468bab 100644 --- a/drivers/edac/thunderx_edac.c +++ b/drivers/edac/thunderx_edac.c @@ -639,27 +639,6 @@ err_free: return ret; } -#ifdef CONFIG_PM -static int thunderx_lmc_suspend(struct pci_dev *pdev, pm_message_t state) -{ - pci_save_state(pdev); - pci_disable_device(pdev); - - pci_set_power_state(pdev, pci_choose_state(pdev, state)); - - return 0; -} - -static int thunderx_lmc_resume(struct pci_dev *pdev) -{ - pci_set_power_state(pdev, PCI_D0); - pci_enable_wake(pdev, PCI_D0, 0); - pci_restore_state(pdev); - - return 0; -} -#endif - static const struct pci_device_id thunderx_lmc_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_LMC) }, { 0, }, @@ -834,10 +813,6 @@ static struct pci_driver thunderx_lmc_driver = { .name = "thunderx_lmc_edac", .probe = thunderx_lmc_probe, .remove = thunderx_lmc_remove, -#ifdef CONFIG_PM - .suspend = thunderx_lmc_suspend, - .resume = thunderx_lmc_resume, -#endif .id_table = thunderx_lmc_pci_tbl, }; From b7969caf41a1d1be72bb1dd02542a26c0fb3d688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 27 Sep 2017 17:39:22 +0200 Subject: [PATCH 0159/1085] spi: mxs: implement runtime pm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a straight forward addition of runtime and system sleep pm operations that handle clk and pinctrl (for runtime pm) and spi_master_{suspend,resume} (for system sleep). 
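For context on how these callbacks are exercised: with master->auto_runtime_pm set, the SPI core takes a runtime PM reference on the controller before pumping messages and drops it when the queue goes idle, so the clk/pinctrl handling below runs automatically around transfers. The equivalent manual pattern, as an illustrative sketch (not part of the patch):

	static int example_do_io(struct device *dev)
	{
		int ret;

		ret = pm_runtime_get_sync(dev);	/* resume: pinctrl default, clk on */
		if (ret < 0) {
			pm_runtime_put_noidle(dev);
			return ret;
		}
		/* ... talk to the powered-up controller here ... */
		pm_runtime_put(dev);		/* may suspend: clk off, pinctrl idle */
		return 0;
	}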
Signed-off-by: Uwe Kleine-König Signed-off-by: Mark Brown --- drivers/spi/spi-mxs.c | 120 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 110 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi-mxs.c b/drivers/spi/spi-mxs.c index 5b0e9a3e83f6..3d216b950b41 100644 --- a/drivers/spi/spi-mxs.c +++ b/drivers/spi/spi-mxs.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -442,6 +443,85 @@ static int mxs_spi_transfer_one(struct spi_master *master, return status; } +static int mxs_spi_runtime_suspend(struct device *dev) +{ + struct spi_master *master = dev_get_drvdata(dev); + struct mxs_spi *spi = spi_master_get_devdata(master); + struct mxs_ssp *ssp = &spi->ssp; + int ret; + + clk_disable_unprepare(ssp->clk); + + ret = pinctrl_pm_select_idle_state(dev); + if (ret) { + int ret2 = clk_prepare_enable(ssp->clk); + + if (ret2) + dev_warn(dev, "Failed to reenable clock after failing pinctrl request (pinctrl: %d, clk: %d)\n", + ret, ret2); + } + + return ret; +} + +static int mxs_spi_runtime_resume(struct device *dev) +{ + struct spi_master *master = dev_get_drvdata(dev); + struct mxs_spi *spi = spi_master_get_devdata(master); + struct mxs_ssp *ssp = &spi->ssp; + int ret; + + ret = pinctrl_pm_select_default_state(dev); + if (ret) + return ret; + + ret = clk_prepare_enable(ssp->clk); + if (ret) + pinctrl_pm_select_idle_state(dev); + + return ret; +} + +static int __maybe_unused mxs_spi_suspend(struct device *dev) +{ + struct spi_master *master = dev_get_drvdata(dev); + int ret; + + ret = spi_master_suspend(master); + if (ret) + return ret; + + if (!pm_runtime_suspended(dev)) + return mxs_spi_runtime_suspend(dev); + else + return 0; +} + +static int __maybe_unused mxs_spi_resume(struct device *dev) +{ + struct spi_master *master = dev_get_drvdata(dev); + int ret; + + if (!pm_runtime_suspended(dev)) + ret = mxs_spi_runtime_resume(dev); + else + ret = 0; + if (ret) + return ret; + + ret = spi_master_resume(master); + if (ret < 0 && !pm_runtime_suspended(dev)) + mxs_spi_runtime_suspend(dev); + + return ret; +} + +static const struct dev_pm_ops mxs_spi_pm = { + SET_RUNTIME_PM_OPS(mxs_spi_runtime_suspend, + mxs_spi_runtime_resume, NULL) + SET_SYSTEM_SLEEP_PM_OPS(mxs_spi_suspend, mxs_spi_resume) +}; + static const struct of_device_id mxs_spi_dt_ids[] = { { .compatible = "fsl,imx23-spi", .data = (void *) IMX23_SSP, }, { .compatible = "fsl,imx28-spi", .data = (void *) IMX28_SSP, }, @@ -493,12 +573,15 @@ static int mxs_spi_probe(struct platform_device *pdev) if (!master) return -ENOMEM; + platform_set_drvdata(pdev, master); + master->transfer_one_message = mxs_spi_transfer_one; master->bits_per_word_mask = SPI_BPW_MASK(8); master->mode_bits = SPI_CPOL | SPI_CPHA; master->num_chipselect = 3; master->dev.of_node = np; master->flags = SPI_MASTER_HALF_DUPLEX; + master->auto_runtime_pm = true; spi = spi_master_get_devdata(master); ssp = &spi->ssp; @@ -521,28 +604,41 @@ static int mxs_spi_probe(struct platform_device *pdev) goto out_master_free; } - ret = clk_prepare_enable(ssp->clk); - if (ret) - goto out_dma_release; + pm_runtime_enable(ssp->dev); + if (!pm_runtime_enabled(ssp->dev)) { + ret = mxs_spi_runtime_resume(ssp->dev); + if (ret < 0) { + dev_err(ssp->dev, "runtime resume failed\n"); + goto out_dma_release; + } + } + + ret = pm_runtime_get_sync(ssp->dev); + if (ret < 0) { + dev_err(ssp->dev, "runtime_get_sync failed\n"); + goto out_pm_runtime_disable; + } clk_set_rate(ssp->clk, clk_freq); ret = stmp_reset_block(ssp->base); if (ret) - goto out_disable_clk; 
- - platform_set_drvdata(pdev, master); + goto out_pm_runtime_put; ret = devm_spi_register_master(&pdev->dev, master); if (ret) { dev_err(&pdev->dev, "Cannot register SPI master, %d\n", ret); - goto out_disable_clk; + goto out_pm_runtime_put; } + pm_runtime_put(ssp->dev); + return 0; -out_disable_clk: - clk_disable_unprepare(ssp->clk); +out_pm_runtime_put: + pm_runtime_put(ssp->dev); +out_pm_runtime_disable: + pm_runtime_disable(ssp->dev); out_dma_release: dma_release_channel(ssp->dmach); out_master_free: @@ -560,7 +656,10 @@ static int mxs_spi_remove(struct platform_device *pdev) spi = spi_master_get_devdata(master); ssp = &spi->ssp; - clk_disable_unprepare(ssp->clk); + pm_runtime_disable(&pdev->dev); + if (!pm_runtime_status_suspended(&pdev->dev)) + mxs_spi_runtime_suspend(&pdev->dev); + dma_release_channel(ssp->dmach); return 0; @@ -572,6 +671,7 @@ static struct platform_driver mxs_spi_driver = { .driver = { .name = DRIVER_NAME, .of_match_table = mxs_spi_dt_ids, + .pm = &mxs_spi_pm, }, }; From 8076428f0c9f24d90270204fd39ccb11b83db71d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Sep 2017 11:24:22 +0200 Subject: [PATCH 0160/1085] s390: convert release_thread() into a static inline function release_thread() is an empty function that gets called on every task exit. Move the function to a header file and force inlining of it, so that the compiler can optimize it away instead of generating a pointless function call. Acked-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/processor.h | 2 +- arch/s390/kernel/process.c | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index c25d57e0aad3..4cc9bf53074b 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -218,7 +218,7 @@ void show_registers(struct pt_regs *regs); void show_cacheinfo(struct seq_file *m); /* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); +static inline void release_thread(struct task_struct *tsk) { } /* Free guarded storage control block for current */ void exit_thread_gs(void); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index bb32b8618bf6..8cd311fc432a 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -58,10 +58,6 @@ void flush_thread(void) { } -void release_thread(struct task_struct *dead_task) -{ -} - void arch_release_task_struct(struct task_struct *tsk) { } From d6e646ad7cfa7034d280459b2b2546288f247144 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Sep 2017 11:24:22 +0200 Subject: [PATCH 0161/1085] s390/runtime instrumentation: fix possible memory corruption For PREEMPT enabled kernels the runtime instrumentation (RI) code contains a possible use-after-free bug. If a task that makes use of RI exits, it will execute do_exit() while still enabled for preemption. That function will call exit_thread_runtime_instr() via exit_thread(). If exit_thread_runtime_instr() gets preempted after the RI control block of the task has been freed but before the pointer to it is set to NULL, then save_ri_cb(), called from switch_to(), will write to already freed memory. Avoid this and simply disable preemption while freeing the control block and setting the pointer to NULL.
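The window is easiest to see in a sketch of the pre-patch teardown (the _sketch suffix marks it as illustrative, with the preemption point called out):

	void exit_thread_runtime_instr_sketch(struct task_struct *task)
	{
		if (!task->thread.ri_cb)
			return;
		disable_runtime_instr();
		kfree(task->thread.ri_cb);
		/*
		 * Preemption here is fatal: switch_to() calls save_ri_cb(),
		 * which writes the RI control block through the not yet
		 * cleared task->thread.ri_cb pointer into freed memory.
		 */
		task->thread.ri_cb = NULL;
	}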
Fixes: e4b8b3f33fca ("s390: add support for runtime instrumentation") Cc: # v3.7+ Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/runtime_instr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index 429d3a782f1c..b9738ae2e1de 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -49,11 +49,13 @@ void exit_thread_runtime_instr(void) { struct task_struct *task = current; + preempt_disable(); if (!task->thread.ri_cb) return; disable_runtime_instr(); kfree(task->thread.ri_cb); task->thread.ri_cb = NULL; + preempt_enable(); } SYSCALL_DEFINE1(s390_runtime_instr, int, command) @@ -64,9 +66,7 @@ SYSCALL_DEFINE1(s390_runtime_instr, int, command) return -EOPNOTSUPP; if (command == S390_RUNTIME_INSTR_STOP) { - preempt_disable(); exit_thread_runtime_instr(); - preempt_enable(); return 0; } From 8d9047f8b967ce6181fd824ae922978e1b055cc0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Sep 2017 11:24:22 +0200 Subject: [PATCH 0162/1085] s390/runtime instrumentation: simplify task exit handling Free data structures required for runtime instrumentation from arch_release_task_struct(). This allows to simplify the code a bit, and also makes the semantics a bit easier: arch_release_task_struct() is never called from the task that is being removed. In addition this allows to get rid of exit_thread() in a later patch. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/runtime_instr.h | 4 +++- arch/s390/kernel/process.c | 5 ++--- arch/s390/kernel/runtime_instr.c | 30 +++++++++++++-------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/arch/s390/include/asm/runtime_instr.h b/arch/s390/include/asm/runtime_instr.h index 402ad6df4897..c54a9310d814 100644 --- a/arch/s390/include/asm/runtime_instr.h +++ b/arch/s390/include/asm/runtime_instr.h @@ -85,6 +85,8 @@ static inline void restore_ri_cb(struct runtime_instr_cb *cb_next, load_runtime_instr_cb(&runtime_instr_empty_cb); } -void exit_thread_runtime_instr(void); +struct task_struct; + +void runtime_instr_release(struct task_struct *tsk); #endif /* _RUNTIME_INSTR_H */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 8cd311fc432a..a7c6c5e944c7 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -48,10 +48,8 @@ extern void kernel_thread_starter(void); */ void exit_thread(struct task_struct *tsk) { - if (tsk == current) { - exit_thread_runtime_instr(); + if (tsk == current) exit_thread_gs(); - } } void flush_thread(void) @@ -60,6 +58,7 @@ void flush_thread(void) void arch_release_task_struct(struct task_struct *tsk) { + runtime_instr_release(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index b9738ae2e1de..e2c541db829f 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -20,11 +20,24 @@ /* empty control block to disable RI by loading it */ struct runtime_instr_cb runtime_instr_empty_cb; +void runtime_instr_release(struct task_struct *tsk) +{ + kfree(tsk->thread.ri_cb); +} + static void disable_runtime_instr(void) { - struct pt_regs *regs = task_pt_regs(current); + struct task_struct *task = current; + struct pt_regs *regs; + if (!task->thread.ri_cb) + return; + regs = task_pt_regs(task); + 
preempt_disable(); load_runtime_instr_cb(&runtime_instr_empty_cb); + kfree(task->thread.ri_cb); + task->thread.ri_cb = NULL; + preempt_enable(); /* * Make sure the RI bit is deleted from the PSW. If the user did not @@ -45,19 +58,6 @@ static void init_runtime_instr_cb(struct runtime_instr_cb *cb) cb->valid = 1; } -void exit_thread_runtime_instr(void) -{ - struct task_struct *task = current; - - preempt_disable(); - if (!task->thread.ri_cb) - return; - disable_runtime_instr(); - kfree(task->thread.ri_cb); - task->thread.ri_cb = NULL; - preempt_enable(); -} - SYSCALL_DEFINE1(s390_runtime_instr, int, command) { struct runtime_instr_cb *cb; @@ -66,7 +66,7 @@ SYSCALL_DEFINE1(s390_runtime_instr, int, command) return -EOPNOTSUPP; if (command == S390_RUNTIME_INSTR_STOP) { - exit_thread_runtime_instr(); + disable_runtime_instr(); return 0; } From fa1edf3f63c05ca8eacafcd7048ed91e5360f1a8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Sep 2017 11:24:22 +0200 Subject: [PATCH 0163/1085] s390/guarded storage: fix possible memory corruption For PREEMPT enabled kernels the guarded storage (GS) code contains a possible use-after-free bug. If a task that makes use of GS exits, it will execute do_exit() while still enabled for preemption. That function will call exit_thread_gs() via exit_thread(). If exit_thread_gs() gets preempted after the GS control block of the task has been freed but before the pointer to it is set to NULL, then save_gs_cb(), called from switch_to(), will write to already freed memory. Avoid this and simply disable preemption while freeing the control block and setting the pointer to NULL. Fixes: 916cda1aa1b4 ("s390: add a system call for guarded storage") Cc: # v4.12+ Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/guarded_storage.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c index 6f064745c3b1..4920cf6ffa00 100644 --- a/arch/s390/kernel/guarded_storage.c +++ b/arch/s390/kernel/guarded_storage.c @@ -13,9 +13,11 @@ void exit_thread_gs(void) { + preempt_disable(); kfree(current->thread.gs_cb); kfree(current->thread.gs_bc_cb); current->thread.gs_cb = current->thread.gs_bc_cb = NULL; + preempt_enable(); } static int gs_enable(void) From 5ef2d5231d547c672c67bdf84c13a4adaf477964 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Sep 2017 11:24:23 +0200 Subject: [PATCH 0164/1085] s390/ptrace: fix guarded storage regset handling If the guarded storage regset for current is supposed to be changed, the regset from user space is copied directly into the guarded storage control block. If then the process gets scheduled away while the control block is being copied and before the new control block has been loaded, the result is random: the process can be scheduled away due to a page fault or preemption. If that happens the already copied parts will be overwritten by save_gs_cb(), called from switch_to(). Avoid this by copying the data to a temporary buffer on the stack and doing the actual update with preemption disabled.
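Reduced to its essentials, the fix follows the classic copy-then-commit shape (illustrative sketch with simplified parameters; the real code below also handles first-time allocation of the control block):

	static int example_gs_cb_set(struct task_struct *target,
				     unsigned int pos, unsigned int count,
				     const void *kbuf, const void __user *ubuf)
	{
		struct gs_cb tmp = { };
		int rc;

		/* Stage the new content where switch_to() cannot touch it. */
		rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
					&tmp, 0, sizeof(tmp));
		if (rc)
			return rc;

		preempt_disable();	/* no save_gs_cb() can run in between */
		*target->thread.gs_cb = tmp;	/* commit in one piece */
		if (target == current)
			restore_gs_cb(target->thread.gs_cb);
		preempt_enable();
		return 0;
	}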
Fixes: f5bbd7219891 ("s390/ptrace: guarded storage regset for the current task") Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ptrace.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 252ed61a128b..a5f8f0c8ccf0 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -1171,26 +1171,37 @@ static int s390_gs_cb_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct gs_cb *data = target->thread.gs_cb; + struct gs_cb gs_cb = { }, *data = NULL; int rc; if (!MACHINE_HAS_GS) return -ENODEV; - if (!data) { + if (!target->thread.gs_cb) { data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; - data->gsd = 25; - target->thread.gs_cb = data; - if (target == current) - __ctl_set_bit(2, 4); - } else if (target == current) { - save_gs_cb(data); } + if (!target->thread.gs_cb) + gs_cb.gsd = 25; + else if (target == current) + save_gs_cb(&gs_cb); + else + gs_cb = *target->thread.gs_cb; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(struct gs_cb)); - if (target == current) - restore_gs_cb(data); + &gs_cb, 0, sizeof(gs_cb)); + if (rc) { + kfree(data); + return -EFAULT; + } + preempt_disable(); + if (!target->thread.gs_cb) + target->thread.gs_cb = data; + *target->thread.gs_cb = gs_cb; + if (target == current) { + __ctl_set_bit(2, 4); + restore_gs_cb(target->thread.gs_cb); + } + preempt_enable(); return rc; } From 7b83c6297d2fc7350997e86188df84c27fd59530 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Sep 2017 11:24:23 +0200 Subject: [PATCH 0165/1085] s390/guarded storage: simplify task exit handling Free data structures required for guarded storage from arch_release_task_struct(). This allows to simplify the code a bit, and also makes the semantics a bit easier: arch_release_task_struct() is never called from the task that is being removed. In addition this allows to get rid of exit_thread() in a later patch. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/processor.h | 4 ++-- arch/s390/kernel/guarded_storage.c | 9 +++------ arch/s390/kernel/process.c | 3 +-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 4cc9bf53074b..58eaaccd3cf0 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -220,8 +220,8 @@ void show_cacheinfo(struct seq_file *m); /* Free all resources held by a thread. 
*/ static inline void release_thread(struct task_struct *tsk) { } -/* Free guarded storage control block for current */ -void exit_thread_gs(void); +/* Free guarded storage control block */ +void guarded_storage_release(struct task_struct *tsk); unsigned long get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) \ diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c index 4920cf6ffa00..65b0eca2128c 100644 --- a/arch/s390/kernel/guarded_storage.c +++ b/arch/s390/kernel/guarded_storage.c @@ -11,13 +11,10 @@ #include #include "entry.h" -void exit_thread_gs(void) +void guarded_storage_release(struct task_struct *tsk) { - preempt_disable(); - kfree(current->thread.gs_cb); - kfree(current->thread.gs_bc_cb); - current->thread.gs_cb = current->thread.gs_bc_cb = NULL; - preempt_enable(); + kfree(tsk->thread.gs_cb); + kfree(tsk->thread.gs_bc_cb); } static int gs_enable(void) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index a7c6c5e944c7..0ecadd349106 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -48,8 +48,6 @@ extern void kernel_thread_starter(void); */ void exit_thread(struct task_struct *tsk) { - if (tsk == current) - exit_thread_gs(); } void flush_thread(void) @@ -59,6 +57,7 @@ void flush_thread(void) void arch_release_task_struct(struct task_struct *tsk) { runtime_instr_release(tsk); + guarded_storage_release(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) From 59a19ea9a0b3b0ee69887b6a5015aee3a3c7e527 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Sep 2017 11:24:23 +0200 Subject: [PATCH 0166/1085] s390: get rid of exit_thread() exit_thread() is empty now. Therefore remove it and get rid of a pointless branch. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 1 - arch/s390/kernel/process.c | 7 ------- 2 files changed, 8 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 48af970320cb..2784ca96f418 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -142,7 +142,6 @@ config S390 select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 0ecadd349106..080c851dd9a5 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -43,13 +43,6 @@ asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); extern void kernel_thread_starter(void); -/* - * Free current thread data structures etc.. - */ -void exit_thread(struct task_struct *tsk) -{ -} - void flush_thread(void) { } From 79962038dffab28094782406d8e27b095a26ce51 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Sep 2017 13:49:57 +0200 Subject: [PATCH 0167/1085] s390: add support for FORTIFY_SOURCE This is the quite trivial backend for s390 which is required to enable FORTIFY_SOURCE support. See commit 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions") for more details. 
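As a reminder of what the option buys: when the compiler can prove the destination size, the fortified string helpers turn an overflow into a build failure instead of silent corruption. A tiny illustrative example (not from the patch):

	#include <linux/string.h>

	static char dst[8];

	void fortify_demo(const char *src)
	{
		/*
		 * With CONFIG_FORTIFY_SOURCE the fortified memcpy() sees
		 * sizeof(dst) == 8 at compile time; copying 16 bytes is
		 * rejected at build time, while sizes known only at
		 * runtime are caught with a runtime check instead.
		 */
		memcpy(dst, src, 16);
	}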
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 1 + arch/s390/boot/compressed/Makefile | 2 +- arch/s390/include/asm/string.h | 2 +- arch/s390/kernel/Makefile | 2 ++ 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2784ca96f418..4edc91e6c3ff 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -67,6 +67,7 @@ config S390 select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile index f7e4c834ea24..ec63b580d41f 100644 --- a/arch/s390/boot/compressed/Makefile +++ b/arch/s390/boot/compressed/Makefile @@ -11,7 +11,7 @@ targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 targets += misc.o piggy.o sizes.h head.o KBUILD_CFLAGS := -m64 -D__KERNEL__ -O2 -KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks -msoft-float KBUILD_CFLAGS += $(call cc-option,-mpacked-stack) KBUILD_CFLAGS += $(call cc-option,-ffreestanding) diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index e5f5c7074f2c..8fb43319693d 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -49,7 +49,7 @@ extern char *strstr(const char *, const char *); #undef __HAVE_ARCH_STRSEP #undef __HAVE_ARCH_STRSPN -#if !defined(IN_ARCH_STRING_C) +#if !defined(IN_ARCH_STRING_C) && (!defined(CONFIG_FORTIFY_SOURCE) || defined(__NO_FORTIFY)) static inline void *memchr(const void * s, int c, size_t n) { diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index adb3fe2e3d42..9118e62f9fca 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -33,6 +33,8 @@ AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) AFLAGS_head.o += -march=z900 endif +CFLAGS_als.o += -D__NO_FORTIFY + # # Passing null pointers is ok for smp code, since we access the lowcore here. # From bb59c2da3fb4ecc83258ca6b2ecde70d80c33465 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Thu, 14 Sep 2017 12:35:45 +0200 Subject: [PATCH 0168/1085] s390/runtime_instrumentation: clean up struct runtime_instr_cb Update runtime_instr_cb structure to be consistent with the runtime instrumentation documentation. 
Signed-off-by: Alice Frosi Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/runtime_instr.h | 82 +++++++++++++-------------- arch/s390/kernel/runtime_instr.c | 12 ++-- arch/s390/kvm/kvm-s390.c | 2 +- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/arch/s390/include/asm/runtime_instr.h b/arch/s390/include/asm/runtime_instr.h index c54a9310d814..f0ce4972d87c 100644 --- a/arch/s390/include/asm/runtime_instr.h +++ b/arch/s390/include/asm/runtime_instr.h @@ -5,55 +5,55 @@ #define S390_RUNTIME_INSTR_STOP 0x2 struct runtime_instr_cb { - __u64 buf_current; - __u64 buf_origin; - __u64 buf_limit; + __u64 rca; + __u64 roa; + __u64 rla; - __u32 valid : 1; - __u32 pstate : 1; - __u32 pstate_set_buf : 1; - __u32 home_space : 1; - __u32 altered : 1; - __u32 : 3; - __u32 pstate_sample : 1; - __u32 sstate_sample : 1; - __u32 pstate_collect : 1; - __u32 sstate_collect : 1; - __u32 : 1; - __u32 halted_int : 1; - __u32 int_requested : 1; - __u32 buffer_full_int : 1; + __u32 v : 1; + __u32 s : 1; + __u32 k : 1; + __u32 h : 1; + __u32 a : 1; + __u32 reserved1 : 3; + __u32 ps : 1; + __u32 qs : 1; + __u32 pc : 1; + __u32 qc : 1; + __u32 reserved2 : 1; + __u32 g : 1; + __u32 u : 1; + __u32 l : 1; __u32 key : 4; - __u32 : 9; + __u32 reserved3 : 8; + __u32 t : 1; __u32 rgs : 3; - __u32 mode : 4; - __u32 next : 1; + __u32 m : 4; + __u32 n : 1; __u32 mae : 1; - __u32 : 2; - __u32 call_type_br : 1; - __u32 return_type_br : 1; - __u32 other_type_br : 1; - __u32 bc_other_type : 1; - __u32 emit : 1; - __u32 tx_abort : 1; - __u32 : 2; - __u32 bp_xn : 1; - __u32 bp_xt : 1; - __u32 bp_ti : 1; - __u32 bp_ni : 1; - __u32 suppr_y : 1; - __u32 suppr_z : 1; + __u32 reserved4 : 2; + __u32 c : 1; + __u32 r : 1; + __u32 b : 1; + __u32 j : 1; + __u32 e : 1; + __u32 x : 1; + __u32 reserved5 : 2; + __u32 bpxn : 1; + __u32 bpxt : 1; + __u32 bpti : 1; + __u32 bpni : 1; + __u32 reserved6 : 2; - __u32 dc_miss_extra : 1; - __u32 lat_lev_ignore : 1; - __u32 ic_lat_lev : 4; - __u32 dc_lat_lev : 4; + __u32 d : 1; + __u32 f : 1; + __u32 ic : 4; + __u32 dc : 4; - __u64 reserved1; - __u64 scaling_factor; + __u64 reserved7; + __u64 sf; __u64 rsic; - __u64 reserved2; + __u64 reserved8; } __packed __aligned(8); extern struct runtime_instr_cb runtime_instr_empty_cb; diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index e2c541db829f..8ae851cfea6c 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -49,13 +49,13 @@ static void disable_runtime_instr(void) static void init_runtime_instr_cb(struct runtime_instr_cb *cb) { - cb->buf_limit = 0xfff; - cb->pstate = 1; - cb->pstate_set_buf = 1; - cb->pstate_sample = 1; - cb->pstate_collect = 1; + cb->rla = 0xfff; + cb->s = 1; + cb->k = 1; + cb->ps = 1; + cb->pc = 1; cb->key = PAGE_DEFAULT_KEY; - cb->valid = 1; + cb->v = 1; } SYSCALL_DEFINE1(s390_runtime_instr, int, command) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 40d0a1a97889..3f8316c2cb2a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3283,7 +3283,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) && test_kvm_facility(vcpu->kvm, 64) && - riccb->valid && + riccb->v && !(vcpu->arch.sie_block->ecb3 & ECB3_RI)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)"); vcpu->arch.sie_block->ecb3 |= ECB3_RI; From 262832bc5acda76fd8f901d39f4da1121d951222 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Thu, 14 Sep 2017 12:36:03 +0200 Subject: 
[PATCH 0169/1085] s390/ptrace: add runtime instrumentation register get/set Add runtime instrumentation register get and set, which allows reading and modifying the runtime instrumentation control block. Signed-off-by: Alice Frosi Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ptrace.c | 109 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 2 files changed, 110 insertions(+) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index a5f8f0c8ccf0..ea711f141bb8 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -30,6 +30,9 @@ #include #include #include +#include +#include + #include "entry.h" #ifdef CONFIG_COMPAT @@ -1239,6 +1242,96 @@ static int s390_gs_bc_set(struct task_struct *target, data, 0, sizeof(struct gs_cb)); } +static bool is_ri_cb_valid(struct runtime_instr_cb *cb) +{ + return (cb->rca & 0x1f) == 0 && + (cb->roa & 0xfff) == 0 && + (cb->rla & 0xfff) == 0xfff && + cb->s == 1 && + cb->k == 1 && + cb->h == 0 && + cb->reserved1 == 0 && + cb->ps == 1 && + cb->qs == 0 && + cb->pc == 1 && + cb->qc == 0 && + cb->reserved2 == 0 && + cb->key == PAGE_DEFAULT_KEY && + cb->reserved3 == 0 && + cb->reserved4 == 0 && + cb->reserved5 == 0 && + cb->reserved6 == 0 && + cb->reserved7 == 0 && + cb->reserved8 == 0 && + cb->rla >= cb->roa && + cb->rca >= cb->roa && + cb->rca <= cb->rla+1 && + cb->m < 3; +} + +static int s390_runtime_instr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct runtime_instr_cb *data = target->thread.ri_cb; + + if (!test_facility(64)) + return -ENODEV; + if (!data) + return -ENODATA; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct runtime_instr_cb)); +} + +static int s390_runtime_instr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct runtime_instr_cb ri_cb = { }, *data = NULL; + int rc; + + if (!test_facility(64)) + return -ENODEV; + + if (!target->thread.ri_cb) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + } + + if (target->thread.ri_cb) { + if (target == current) + store_runtime_instr_cb(&ri_cb); + else + ri_cb = *target->thread.ri_cb; + } + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &ri_cb, 0, sizeof(struct runtime_instr_cb)); + if (rc) { + kfree(data); + return -EFAULT; + } + + if (!is_ri_cb_valid(&ri_cb)) { + kfree(data); + return -EINVAL; + } + + preempt_disable(); + if (!target->thread.ri_cb) + target->thread.ri_cb = data; + *target->thread.ri_cb = ri_cb; + if (target == current) + load_runtime_instr_cb(target->thread.ri_cb); + preempt_enable(); + + return 0; +} + static const struct user_regset s390_regsets[] = { { .core_note_type = NT_PRSTATUS, @@ -1312,6 +1405,14 @@ static const struct user_regset s390_regsets[] = { .get = s390_gs_bc_get, .set = s390_gs_bc_set, }, + { + .core_note_type = NT_S390_RI_CB, + .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_runtime_instr_get, + .set = s390_runtime_instr_set, + }, }; static const struct user_regset_view user_s390_view = { @@ -1548,6 +1649,14 @@ static const struct user_regset s390_compat_regsets[] = { .get = s390_gs_cb_get, .set = s390_gs_cb_set, }, + { + .core_note_type = NT_S390_RI_CB, + .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = 
s390_runtime_instr_get, + .set = s390_runtime_instr_set, + }, }; static const struct user_regset_view user_s390_compat_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b5280db9ef6a..e3739c330c15 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -411,6 +411,7 @@ typedef struct elf64_shdr { #define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ #define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ #define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ +#define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ #define NT_ARM_TLS 0x401 /* ARM TLS register */ #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ From e16c5dd5157efeddaad7492b920192fea0e7e4ec Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 14 Sep 2017 14:11:15 +0200 Subject: [PATCH 0170/1085] samples/kprobes: Add s390 case in kprobe example module Add info prints to the sample kprobe handlers for S/390. Signed-off-by: Johannes Thumshirn Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- samples/kprobes/kprobe_example.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index 88b3e2d227ae..67de3b774bc9 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -47,6 +47,10 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs) " pstate = 0x%lx\n", p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate); #endif +#ifdef CONFIG_S390 + pr_info("<%s> pre_handler: p->addr = 0x%p, ip = 0x%lx, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->psw.addr, regs->flags); +#endif /* A dump_stack() here will give a stack backtrace */ return 0; @@ -76,6 +80,10 @@ static void handler_post(struct kprobe *p, struct pt_regs *regs, pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n", p->symbol_name, p->addr, (long)regs->pstate); #endif +#ifdef CONFIG_S390 + pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->flags); +#endif } /* From b08e19defc10befc50af09771ff0b086f2a72138 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 18 Sep 2017 11:13:13 +0200 Subject: [PATCH 0171/1085] s390/char: fix cdev_add usage Function cdev_add does set cdev->dev, so there is no point in setting it prior to calling this function. 
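For reference, the resulting call pattern looks like this (a minimal sketch with hypothetical names; error handling elided):

    struct cdev *cd = cdev_alloc();
    cd->owner = THIS_MODULE;
    cd->ops = &my_fops;                        /* my_fops: hypothetical file_operations */
    /* cdev_add() records the dev_t in cd->dev itself */
    rc = cdev_add(cd, MKDEV(major, minor), 1);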
Signed-off-by: Jean Delvare Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/char/tape_class.c | 3 +-- drivers/s390/char/vmlogrdr.c | 3 +-- drivers/s390/char/vmur.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/s390/char/tape_class.c b/drivers/s390/char/tape_class.c index 91c3c642c76e..e7d23048d3f0 100644 --- a/drivers/s390/char/tape_class.c +++ b/drivers/s390/char/tape_class.c @@ -68,9 +68,8 @@ struct tape_class_device *register_tape_dev( tcd->char_device->owner = fops->owner; tcd->char_device->ops = fops; - tcd->char_device->dev = dev; - rc = cdev_add(tcd->char_device, tcd->char_device->dev, 1); + rc = cdev_add(tcd->char_device, dev, 1); if (rc) goto fail_with_cdev; diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index b19020b9efff..62559dc0169f 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -812,8 +812,7 @@ static int vmlogrdr_register_cdev(dev_t dev) } vmlogrdr_cdev->owner = THIS_MODULE; vmlogrdr_cdev->ops = &vmlogrdr_fops; - vmlogrdr_cdev->dev = dev; - rc = cdev_add(vmlogrdr_cdev, vmlogrdr_cdev->dev, MAXMINOR); + rc = cdev_add(vmlogrdr_cdev, dev, MAXMINOR); if (!rc) return 0; diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index 04aceb694d51..a50cf930a623 100644 --- a/drivers/s390/char/vmur.c +++ b/drivers/s390/char/vmur.c @@ -892,10 +892,9 @@ static int ur_set_online(struct ccw_device *cdev) } urd->char_device->ops = &ur_fops; - urd->char_device->dev = MKDEV(major, minor); urd->char_device->owner = ur_fops.owner; - rc = cdev_add(urd->char_device, urd->char_device->dev, 1); + rc = cdev_add(urd->char_device, MKDEV(major, minor), 1); if (rc) goto fail_free_cdev; if (urd->cdev->id.cu_type == READER_PUNCH_DEVTYPE) { From adc69b4d76ee8a7065c7c637584b39ace265ffab Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 5 Sep 2017 14:22:48 +0200 Subject: [PATCH 0172/1085] s390/cmf: set_schib_wait add timeout When enabling channel measurement fails with a busy condition, we wait for the next interrupt to arrive before we retry the operation. For devices which usually don't create interrupts, we wait forever. Although the waiting is done interruptibly, that behavior is unexpected and has confused some users. Abort the operation after a 10s timeout. Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cmf.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 220491d27ef4..be0b010ff136 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -227,6 +227,7 @@ static void cmf_set_schib_release(struct kref *kref) } #define CMF_PENDING 1 +#define SET_SCHIB_TIMEOUT (10 * HZ) static int set_schib_wait(struct ccw_device *cdev, u32 mme, int mbfc, unsigned long address) @@ -263,19 +264,19 @@ static int set_schib_wait(struct ccw_device *cdev, u32 mme, cdev->private->state = DEV_STATE_CMFCHANGE; set_data->ret = CMF_PENDING; cdev->private->cmb_wait = set_data; - spin_unlock_irq(cdev->ccwlock); - if (wait_event_interruptible(set_data->wait, - set_data->ret != CMF_PENDING)) { - spin_lock_irq(cdev->ccwlock); + + ret = wait_event_interruptible_timeout(set_data->wait, + set_data->ret != CMF_PENDING, + SET_SCHIB_TIMEOUT); + spin_lock_irq(cdev->ccwlock); + if (ret <= 0) { if (set_data->ret == CMF_PENDING) { - set_data->ret = -ERESTARTSYS; + set_data->ret = (ret == 0) ? 
-ETIME : ret; if (cdev->private->state == DEV_STATE_CMFCHANGE) cdev->private->state = DEV_STATE_ONLINE; } - spin_unlock_irq(cdev->ccwlock); } - spin_lock_irq(cdev->ccwlock); cdev->private->cmb_wait = NULL; ret = set_data->ret; out_put: From eeec1e435fd0195287ee3224c3837b3d71dedc74 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 6 Sep 2017 13:43:20 +0200 Subject: [PATCH 0173/1085] s390/cmf: simplify set_schib_wait No need for refcounting - the data can be on stack. Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cmf.c | 70 ++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index be0b010ff136..a8e4c5e13ba8 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -215,72 +215,52 @@ struct set_schib_struct { unsigned long address; wait_queue_head_t wait; int ret; - struct kref kref; }; -static void cmf_set_schib_release(struct kref *kref) -{ - struct set_schib_struct *set_data; - - set_data = container_of(kref, struct set_schib_struct, kref); - kfree(set_data); -} - #define CMF_PENDING 1 #define SET_SCHIB_TIMEOUT (10 * HZ) static int set_schib_wait(struct ccw_device *cdev, u32 mme, - int mbfc, unsigned long address) + int mbfc, unsigned long address) { - struct set_schib_struct *set_data; - int ret; + struct set_schib_struct set_data; + int ret = -ENODEV; spin_lock_irq(cdev->ccwlock); - if (!cdev->private->cmb) { - ret = -ENODEV; + if (!cdev->private->cmb) goto out; - } - set_data = kzalloc(sizeof(struct set_schib_struct), GFP_ATOMIC); - if (!set_data) { - ret = -ENOMEM; - goto out; - } - init_waitqueue_head(&set_data->wait); - kref_init(&set_data->kref); - set_data->mme = mme; - set_data->mbfc = mbfc; - set_data->address = address; ret = set_schib(cdev, mme, mbfc, address); if (ret != -EBUSY) - goto out_put; + goto out; - if (cdev->private->state != DEV_STATE_ONLINE) { - /* if the device is not online, don't even try again */ - ret = -EBUSY; - goto out_put; - } + /* if the device is not online, don't even try again */ + if (cdev->private->state != DEV_STATE_ONLINE) + goto out; + + init_waitqueue_head(&set_data.wait); + set_data.mme = mme; + set_data.mbfc = mbfc; + set_data.address = address; + set_data.ret = CMF_PENDING; cdev->private->state = DEV_STATE_CMFCHANGE; - set_data->ret = CMF_PENDING; - cdev->private->cmb_wait = set_data; + cdev->private->cmb_wait = &set_data; spin_unlock_irq(cdev->ccwlock); - ret = wait_event_interruptible_timeout(set_data->wait, - set_data->ret != CMF_PENDING, + ret = wait_event_interruptible_timeout(set_data.wait, + set_data.ret != CMF_PENDING, SET_SCHIB_TIMEOUT); spin_lock_irq(cdev->ccwlock); if (ret <= 0) { - if (set_data->ret == CMF_PENDING) { - set_data->ret = (ret == 0) ? -ETIME : ret; + if (set_data.ret == CMF_PENDING) { + set_data.ret = (ret == 0) ? 
-ETIME : ret; if (cdev->private->state == DEV_STATE_CMFCHANGE) cdev->private->state = DEV_STATE_ONLINE; } } cdev->private->cmb_wait = NULL; - ret = set_data->ret; -out_put: - kref_put(&set_data->kref, cmf_set_schib_release); + ret = set_data.ret; out: spin_unlock_irq(cdev->ccwlock); return ret; @@ -288,18 +268,14 @@ out: void retry_set_schib(struct ccw_device *cdev) { - struct set_schib_struct *set_data; + struct set_schib_struct *set_data = cdev->private->cmb_wait; - set_data = cdev->private->cmb_wait; - if (!set_data) { - WARN_ON(1); + if (!set_data) return; - } - kref_get(&set_data->kref); + set_data->ret = set_schib(cdev, set_data->mme, set_data->mbfc, set_data->address); wake_up(&set_data->wait); - kref_put(&set_data->kref, cmf_set_schib_release); } static int cmf_copy_block(struct ccw_device *cdev) From 60f3eac3a1dc127541817af8042b6711e63dc6e1 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 6 Sep 2017 19:05:29 +0200 Subject: [PATCH 0174/1085] s390/cmf: simplify cmb_copy_wait No need for refcounting - the data can be on stack. Also change the locking in this function to only use spin_lock_irq (the function waits, thus it's never called from IRQ context). Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cmf.c | 72 ++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 49 deletions(-) diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index a8e4c5e13ba8..7ed6c865501b 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -319,83 +319,57 @@ static int cmf_copy_block(struct ccw_device *cdev) struct copy_block_struct { wait_queue_head_t wait; int ret; - struct kref kref; }; -static void cmf_copy_block_release(struct kref *kref) -{ - struct copy_block_struct *copy_block; - - copy_block = container_of(kref, struct copy_block_struct, kref); - kfree(copy_block); -} - static int cmf_cmb_copy_wait(struct ccw_device *cdev) { - struct copy_block_struct *copy_block; - int ret; - unsigned long flags; + struct copy_block_struct copy_block; + int ret = -ENODEV; - spin_lock_irqsave(cdev->ccwlock, flags); - if (!cdev->private->cmb) { - ret = -ENODEV; + spin_lock_irq(cdev->ccwlock); + if (!cdev->private->cmb) goto out; - } - copy_block = kzalloc(sizeof(struct copy_block_struct), GFP_ATOMIC); - if (!copy_block) { - ret = -ENOMEM; - goto out; - } - init_waitqueue_head(©_block->wait); - kref_init(©_block->kref); ret = cmf_copy_block(cdev); if (ret != -EBUSY) - goto out_put; + goto out; - if (cdev->private->state != DEV_STATE_ONLINE) { - ret = -EBUSY; - goto out_put; - } + if (cdev->private->state != DEV_STATE_ONLINE) + goto out; + + init_waitqueue_head(©_block.wait); + copy_block.ret = CMF_PENDING; cdev->private->state = DEV_STATE_CMFUPDATE; - copy_block->ret = CMF_PENDING; - cdev->private->cmb_wait = copy_block; + cdev->private->cmb_wait = ©_block; + spin_unlock_irq(cdev->ccwlock); - spin_unlock_irqrestore(cdev->ccwlock, flags); - if (wait_event_interruptible(copy_block->wait, - copy_block->ret != CMF_PENDING)) { - spin_lock_irqsave(cdev->ccwlock, flags); - if (copy_block->ret == CMF_PENDING) { - copy_block->ret = -ERESTARTSYS; + ret = wait_event_interruptible(copy_block.wait, + copy_block.ret != CMF_PENDING); + spin_lock_irq(cdev->ccwlock); + if (ret) { + if (copy_block.ret == CMF_PENDING) { + copy_block.ret = -ERESTARTSYS; if (cdev->private->state == DEV_STATE_CMFUPDATE) cdev->private->state = DEV_STATE_ONLINE; } - spin_unlock_irqrestore(cdev->ccwlock, flags); } - 
spin_lock_irqsave(cdev->ccwlock, flags); cdev->private->cmb_wait = NULL; - ret = copy_block->ret; -out_put: - kref_put(©_block->kref, cmf_copy_block_release); + ret = copy_block.ret; out: - spin_unlock_irqrestore(cdev->ccwlock, flags); + spin_unlock_irq(cdev->ccwlock); return ret; } void cmf_retry_copy_block(struct ccw_device *cdev) { - struct copy_block_struct *copy_block; + struct copy_block_struct *copy_block = cdev->private->cmb_wait; - copy_block = cdev->private->cmb_wait; - if (!copy_block) { - WARN_ON(1); + if (!copy_block) return; - } - kref_get(©_block->kref); + copy_block->ret = cmf_copy_block(cdev); wake_up(©_block->wait); - kref_put(©_block->kref, cmf_copy_block_release); } static void cmf_generic_reset(struct ccw_device *cdev) From 81b050b564b814585e1515b218783c8ad56913f1 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 6 Sep 2017 10:44:23 +0200 Subject: [PATCH 0175/1085] s390/cmf: simplify copy_block cmf_copy_block tries to ensure data consistency by copying the channel measurement block twice and comparing the data. This was needed on very old machines only. Nowadays we guarantee consistency by copying the data when the channel is idle. Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cmf.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 7ed6c865501b..531861d005fd 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -280,12 +280,9 @@ void retry_set_schib(struct ccw_device *cdev) static int cmf_copy_block(struct ccw_device *cdev) { - struct subchannel *sch; - void *reference_buf; - void *hw_block; + struct subchannel *sch = to_subchannel(cdev->dev.parent); struct cmb_data *cmb_data; - - sch = to_subchannel(cdev->dev.parent); + void *hw_block; if (cio_update_schib(sch)) return -ENODEV; @@ -300,19 +297,8 @@ static int cmf_copy_block(struct ccw_device *cdev) } cmb_data = cdev->private->cmb; hw_block = cmb_data->hw_block; - if (!memcmp(cmb_data->last_block, hw_block, cmb_data->size)) - /* No need to copy. */ - return 0; - reference_buf = kzalloc(cmb_data->size, GFP_ATOMIC); - if (!reference_buf) - return -ENOMEM; - /* Ensure consistency of block copied from hardware. */ - do { - memcpy(cmb_data->last_block, hw_block, cmb_data->size); - memcpy(reference_buf, hw_block, cmb_data->size); - } while (memcmp(cmb_data->last_block, reference_buf, cmb_data->size)); + memcpy(cmb_data->last_block, hw_block, cmb_data->size); cmb_data->last_update = get_tod_clock(); - kfree(reference_buf); return 0; } From d4d287e81ff1924f0791fad0638551daec7742b5 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 7 Sep 2017 13:18:40 +0200 Subject: [PATCH 0176/1085] s390/cmf: read from hw buffer To ensure data consistency when reading from the channel measurement block we wait for the subchannel to become idle and copy the whole block. This is unnecessary when all we do is export the individual values via sysfs. Read the values for sysfs export directly from the measurement block. 
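The access pattern the hunks below converge on is: take the ccwlock, then read the counters straight out of the hardware block (a minimal sketch of the pattern, not a literal excerpt):

    spin_lock_irqsave(cdev->ccwlock, flags);
    cmb_data = cdev->private->cmb;
    if (cmb_data) {
        struct cmb *cmb = cmb_data->hw_block;
        val = cmb->sample_count;    /* or any other counter */
    }
    spin_unlock_irqrestore(cdev->ccwlock, flags);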
Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cmf.c | 47 ++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 531861d005fd..78fb492d9881 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -589,22 +589,18 @@ static int set_cmb(struct ccw_device *cdev, u32 mme) static u64 read_cmb(struct ccw_device *cdev, int index) { - struct cmb *cmb; - u32 val; - int ret; + struct cmb_data *cmb_data; unsigned long flags; - - ret = cmf_cmb_copy_wait(cdev); - if (ret < 0) - return 0; + struct cmb *cmb; + int ret = 0; + u32 val; spin_lock_irqsave(cdev->ccwlock, flags); - if (!cdev->private->cmb) { - ret = 0; + cmb_data = cdev->private->cmb; + if (!cmb_data) goto out; - } - cmb = ((struct cmb_data *)cdev->private->cmb)->last_block; + cmb = cmb_data->hw_block; switch (index) { case cmb_ssch_rsch_count: ret = cmb->ssch_rsch_count; @@ -628,7 +624,6 @@ static u64 read_cmb(struct ccw_device *cdev, int index) val = cmb->device_active_only_time; break; default: - ret = 0; goto out; } ret = time_to_avg_nsec(val, cmb->sample_count); @@ -841,27 +836,20 @@ static int set_cmbe(struct ccw_device *cdev, u32 mme) return set_schib_wait(cdev, mme, 1, mba); } - static u64 read_cmbe(struct ccw_device *cdev, int index) { - struct cmbe *cmb; struct cmb_data *cmb_data; - u32 val; - int ret; unsigned long flags; - - ret = cmf_cmb_copy_wait(cdev); - if (ret < 0) - return 0; + struct cmbe *cmb; + int ret = 0; + u32 val; spin_lock_irqsave(cdev->ccwlock, flags); cmb_data = cdev->private->cmb; - if (!cmb_data) { - ret = 0; + if (!cmb_data) goto out; - } - cmb = cmb_data->last_block; + cmb = cmb_data->hw_block; switch (index) { case cmb_ssch_rsch_count: ret = cmb->ssch_rsch_count; @@ -891,7 +879,6 @@ static u64 read_cmbe(struct ccw_device *cdev, int index) val = cmb->initial_command_response_time; break; default: - ret = 0; goto out; } ret = time_to_avg_nsec(val, cmb->sample_count); @@ -982,18 +969,14 @@ static ssize_t cmb_show_avg_sample_interval(struct device *dev, struct device_attribute *attr, char *buf) { - struct ccw_device *cdev; - long interval; + struct ccw_device *cdev = to_ccwdev(dev); unsigned long count; - struct cmb_data *cmb_data; + long interval; - cdev = to_ccwdev(dev); count = cmf_read(cdev, cmb_sample_count); spin_lock_irq(cdev->ccwlock); - cmb_data = cdev->private->cmb; if (count) { - interval = cmb_data->last_update - - cdev->private->cmb_start_time; + interval = get_tod_clock() - cdev->private->cmb_start_time; interval = (interval * 1000) >> 12; interval /= count; } else From cb09b356cd43fb079ea85128dcd17e5b65585fe8 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Fri, 8 Sep 2017 21:01:38 +0200 Subject: [PATCH 0177/1085] s390/cmf: avg_utilization All (but one) cmf related sysfs attributes have been converted to read directly from the measurement block using cmf_read. This is not possible for the avg_utilization attribute since this is an aggregation of several values for which cmf_read only returns average values. Move the computation of the utilization value to the cmf_read interface such that it can use the raw data. 
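Since cmf_read() then returns the utilization already scaled to 0.1 percent units, the sysfs show routine only needs to split the value into integer and fractional digits. For example, a raw value of 237 prints as "23.7%":

    unsigned long u = 237;    /* example return of cmf_read(cdev, avg_utilization) */
    sprintf(buf, "%02lu.%01lu%%\n", u / 10, u % 10);    /* yields "23.7%" */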
Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cmf.c | 58 +++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 78fb492d9881..0b10221c4661 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -58,8 +58,9 @@ /* indices for READCMB */ enum cmb_index { + avg_utilization = -1, /* basic and exended format: */ - cmb_ssch_rsch_count, + cmb_ssch_rsch_count = 0, cmb_sample_count, cmb_device_connect_time, cmb_function_pending_time, @@ -587,12 +588,29 @@ static int set_cmb(struct ccw_device *cdev, u32 mme) return set_schib_wait(cdev, mme, 0, offset); } +/* calculate utilization in 0.1 percent units */ +static u64 __cmb_utilization(u64 device_connect_time, u64 function_pending_time, + u64 device_disconnect_time, u64 start_time) +{ + u64 utilization, elapsed_time; + + utilization = time_to_nsec(device_connect_time + + function_pending_time + + device_disconnect_time); + + elapsed_time = get_tod_clock() - start_time; + elapsed_time = tod_to_ns(elapsed_time); + elapsed_time /= 1000; + + return elapsed_time ? (utilization / elapsed_time) : 0; +} + static u64 read_cmb(struct ccw_device *cdev, int index) { struct cmb_data *cmb_data; unsigned long flags; struct cmb *cmb; - int ret = 0; + u64 ret = 0; u32 val; spin_lock_irqsave(cdev->ccwlock, flags); @@ -602,6 +620,12 @@ static u64 read_cmb(struct ccw_device *cdev, int index) cmb = cmb_data->hw_block; switch (index) { + case avg_utilization: + ret = __cmb_utilization(cmb->device_connect_time, + cmb->function_pending_time, + cmb->device_disconnect_time, + cdev->private->cmb_start_time); + goto out; case cmb_ssch_rsch_count: ret = cmb->ssch_rsch_count; goto out; @@ -841,7 +865,7 @@ static u64 read_cmbe(struct ccw_device *cdev, int index) struct cmb_data *cmb_data; unsigned long flags; struct cmbe *cmb; - int ret = 0; + u64 ret = 0; u32 val; spin_lock_irqsave(cdev->ccwlock, flags); @@ -851,6 +875,12 @@ static u64 read_cmbe(struct ccw_device *cdev, int index) cmb = cmb_data->hw_block; switch (index) { + case avg_utilization: + ret = __cmb_utilization(cmb->device_connect_time, + cmb->function_pending_time, + cmb->device_disconnect_time, + cdev->private->cmb_start_time); + goto out; case cmb_ssch_rsch_count: ret = cmb->ssch_rsch_count; goto out; @@ -989,27 +1019,9 @@ static ssize_t cmb_show_avg_utilization(struct device *dev, struct device_attribute *attr, char *buf) { - struct cmbdata data; - u64 utilization; - unsigned long t, u; - int ret; + unsigned long u = cmf_read(to_ccwdev(dev), avg_utilization); - ret = cmf_readall(to_ccwdev(dev), &data); - if (ret == -EAGAIN || ret == -ENODEV) - /* No data (yet/currently) available to use for calculation. */ - return sprintf(buf, "n/a\n"); - else if (ret) - return ret; - - utilization = data.device_connect_time + - data.function_pending_time + - data.device_disconnect_time; - - /* calculate value in 0.1 percent units */ - t = data.elapsed_time / 1000; - u = utilization / t; - - return sprintf(buf, "%02ld.%01ld%%\n", u/ 10, u - (u/ 10) * 10); + return sprintf(buf, "%02lu.%01lu%%\n", u / 10, u % 10); } #define cmf_attr(name) \ From 08c6df97d65a1cb24393d2051bf12ab05f8d7fb6 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 12 Sep 2017 11:21:00 +0200 Subject: [PATCH 0178/1085] s390/cmf: use tod_to_ns() Instead of open coding tod clock to ns conversions use the timex helper. 
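Background for the conversion: bit 51 of the s390 TOD clock corresponds to one microsecond, which is where the open-coded (x * 1000) >> 12 comes from; tod_to_ns() encapsulates exactly that calculation. A minimal sketch ('start' is a hypothetical earlier TOD sample):

    u64 delta = get_tod_clock() - start;
    u64 ns = tod_to_ns(delta);    /* same result as (delta * 1000) >> 12 */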
Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cmf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 0b10221c4661..7d59230e88bb 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -685,8 +685,7 @@ static int readall_cmb(struct ccw_device *cdev, struct cmbdata *data) /* we only know values before device_busy_time */ data->size = offsetof(struct cmbdata, device_busy_time); - /* convert to nanoseconds */ - data->elapsed_time = (time * 1000) >> 12; + data->elapsed_time = tod_to_ns(time); /* copy data to new structure */ data->ssch_rsch_count = cmb->ssch_rsch_count; @@ -945,8 +944,7 @@ static int readall_cmbe(struct ccw_device *cdev, struct cmbdata *data) /* we only know values before device_busy_time */ data->size = offsetof(struct cmbdata, device_busy_time); - /* conver to nanoseconds */ - data->elapsed_time = (time * 1000) >> 12; + data->elapsed_time = tod_to_ns(time); cmb = cmb_data->last_block; /* copy data to new structure */ @@ -1007,7 +1005,7 @@ static ssize_t cmb_show_avg_sample_interval(struct device *dev, spin_lock_irq(cdev->ccwlock); if (count) { interval = get_tod_clock() - cdev->private->cmb_start_time; - interval = (interval * 1000) >> 12; + interval = tod_to_ns(interval); interval /= count; } else interval = -1; From 76b3138192e6ffec334269b1426fd0d100ba962b Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Tue, 12 Sep 2017 07:04:26 +0200 Subject: [PATCH 0179/1085] s390/zcrypt: Explicitly check input data length. The function to prepare MEX type 50 ap messages did not explicitly check the data length in the case of data > 512 bytes. Instead, the function assumed that the boundary check done in the ioctl function would always reject requests with invalid data length values. However, screening just the function code may give the illusion that there is a gap which could be exploited by userspace for buffer overwrite attacks. 
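The fix turns the open-ended else branch into an explicit size ladder, so any modulus length above 512 bytes is rejected locally instead of relying on the ioctl boundary check; schematically (thresholds as in the existing meb1/meb2 cases):

    if (mod_len <= 128) {
        /* build type50_meb1_msg */
    } else if (mod_len <= 256) {
        /* build type50_meb2_msg */
    } else if (mod_len <= 512) {
        /* build type50_meb3_msg, 4096-bit RSA */
    } else {
        return -EINVAL;    /* explicit upper bound, no reliance on the ioctl check */
    }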
Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_msgtype50.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c index 6dd5d7c58dd0..db5bde47dfb0 100644 --- a/drivers/s390/crypto/zcrypt_msgtype50.c +++ b/drivers/s390/crypto/zcrypt_msgtype50.c @@ -240,8 +240,7 @@ static int ICAMEX_msg_to_type50MEX_msg(struct zcrypt_queue *zq, mod = meb2->modulus + sizeof(meb2->modulus) - mod_len; exp = meb2->exponent + sizeof(meb2->exponent) - mod_len; inp = meb2->message + sizeof(meb2->message) - mod_len; - } else { - /* mod_len > 256 = 4096 bit RSA Key */ + } else if (mod_len <= 512) { struct type50_meb3_msg *meb3 = ap_msg->message; memset(meb3, 0, sizeof(*meb3)); ap_msg->length = sizeof(*meb3); @@ -251,7 +250,8 @@ static int ICAMEX_msg_to_type50MEX_msg(struct zcrypt_queue *zq, mod = meb3->modulus + sizeof(meb3->modulus) - mod_len; exp = meb3->exponent + sizeof(meb3->exponent) - mod_len; inp = meb3->message + sizeof(meb3->message) - mod_len; - } + } else + return -EINVAL; if (copy_from_user(mod, mex->n_modulus, mod_len) || copy_from_user(exp, mex->b_key, mod_len) || From 19220999790a5a67bf954f19e9115c68e889cd11 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 21 Sep 2017 14:43:10 +0200 Subject: [PATCH 0180/1085] s390/cpumf: remove superfluous nr_cpumask_bits check Paul Burton reported that the nr_cpumask_bits check within cpumsf_pmu_event_init() is not necessary. Actually there is already a prior check within perf_event_alloc(). Therefore remove the check. Reported-by: Paul Burton Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_sf.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 7e1e40323b78..bd4bbf61aaf3 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -823,12 +823,8 @@ static int cpumsf_pmu_event_init(struct perf_event *event) } /* Check online status of the CPU to which the event is pinned */ - if (event->cpu >= 0) { - if ((unsigned int)event->cpu >= nr_cpumask_bits) + if (event->cpu >= 0 && !cpu_online(event->cpu)) return -ENODEV; - if (!cpu_online(event->cpu)) - return -ENODEV; - } /* Force reset of idle/hv excludes regardless of what the * user requested. From 8179c7ba109a8eca5fdc214d695ef009ee2fa9cb Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Sun, 24 Sep 2017 17:30:14 +0530 Subject: [PATCH 0181/1085] s390/sclp: Use setup_timer and mod_timer Use setup_timer and mod_timer API instead of structure assignments. 
This is done using Coccinelle; the semantic patch used for this is as follows: @@ expression x,y,z,a,b; @@ -init_timer (&x); +setup_timer (&x, y, z); +mod_timer (&a, b); -x.function = y; -x.data = z; -x.expires = b; -add_timer(&a); Signed-off-by: Himanshu Jha Signed-off-by: Martin Schwidefsky --- drivers/s390/char/sclp_con.c | 7 ++----- drivers/s390/char/sclp_tty.c | 7 ++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/s390/char/sclp_con.c b/drivers/s390/char/sclp_con.c index 6037bc87e767..9b5ac8f8c206 100644 --- a/drivers/s390/char/sclp_con.c +++ b/drivers/s390/char/sclp_con.c @@ -210,11 +210,8 @@ sclp_console_write(struct console *console, const char *message, /* Setup timer to output current console buffer after 1/10 second */ if (sclp_conbuf != NULL && sclp_chars_in_buffer(sclp_conbuf) != 0 && !timer_pending(&sclp_con_timer)) { - init_timer(&sclp_con_timer); - sclp_con_timer.function = sclp_console_timeout; - sclp_con_timer.data = 0UL; - sclp_con_timer.expires = jiffies + HZ/10; - add_timer(&sclp_con_timer); + setup_timer(&sclp_con_timer, sclp_console_timeout, 0UL); + mod_timer(&sclp_con_timer, jiffies + HZ / 10); } out: spin_unlock_irqrestore(&sclp_con_lock, flags); diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c index 236b736ae136..469487e09671 100644 --- a/drivers/s390/char/sclp_tty.c +++ b/drivers/s390/char/sclp_tty.c @@ -217,11 +217,8 @@ static int sclp_tty_write_string(const unsigned char *str, int count, int may_fa /* Setup timer to output current console buffer after 1/10 second */ if (sclp_ttybuf && sclp_chars_in_buffer(sclp_ttybuf) && !timer_pending(&sclp_tty_timer)) { - init_timer(&sclp_tty_timer); - sclp_tty_timer.function = sclp_tty_timeout; - sclp_tty_timer.data = 0UL; - sclp_tty_timer.expires = jiffies + HZ/10; - add_timer(&sclp_tty_timer); + setup_timer(&sclp_tty_timer, sclp_tty_timeout, 0UL); + mod_timer(&sclp_tty_timer, jiffies + HZ / 10); } spin_unlock_irqrestore(&sclp_tty_lock, flags); out: From 1887aa07b6765d345dd79f26017aa2d15d49d7af Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 22 Sep 2017 14:17:41 +0200 Subject: [PATCH 0182/1085] s390/topology: add detection of dedicated vs shared CPUs The topology information returned by STSI 15.x.x contains a flag indicating whether the CPUs of a topology list are dedicated or shared. Make this information available if the machine provides topology information. 
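With the sysfs attribute added below, the flag can be inspected per CPU from userspace; it surfaces as /sys/devices/system/cpu/cpuN/dedicated and reads 1 for a dedicated CPU and 0 for a shared one (hypothetical session):

    $ cat /sys/devices/system/cpu/cpu0/dedicated
    1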
Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/processor.h | 2 ++ arch/s390/include/asm/sysinfo.h | 3 ++- arch/s390/include/asm/topology.h | 2 ++ arch/s390/kernel/smp.c | 10 +++++-- arch/s390/kernel/topology.c | 43 ++++++++++++++++++++++++++++++- 5 files changed, 56 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 58eaaccd3cf0..4c5fde30d741 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -21,6 +21,7 @@ #define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */ #define CIF_ENABLED_WAIT 6 /* in enabled wait state */ #define CIF_MCCK_GUEST 7 /* machine check happening in guest */ +#define CIF_DEDICATED_CPU 8 /* this CPU is dedicated */ #define _CIF_MCCK_PENDING _BITUL(CIF_MCCK_PENDING) #define _CIF_ASCE_PRIMARY _BITUL(CIF_ASCE_PRIMARY) @@ -30,6 +31,7 @@ #define _CIF_IGNORE_IRQ _BITUL(CIF_IGNORE_IRQ) #define _CIF_ENABLED_WAIT _BITUL(CIF_ENABLED_WAIT) #define _CIF_MCCK_GUEST _BITUL(CIF_MCCK_GUEST) +#define _CIF_DEDICATED_CPU _BITUL(CIF_DEDICATED_CPU) #ifndef __ASSEMBLY__ diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 2b498e58b914..0f09135f0ae4 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -156,7 +156,8 @@ static inline unsigned char topology_mnest_limit(void) struct topology_core { unsigned char nl; unsigned char reserved0[3]; - unsigned char :6; + unsigned char :5; + unsigned char d:1; unsigned char pp:2; unsigned char reserved1; unsigned short origin; diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 5222da162b69..5108176435c1 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -16,6 +16,7 @@ struct cpu_topology_s390 { unsigned short book_id; unsigned short drawer_id; unsigned short node_id; + unsigned short dedicated : 1; cpumask_t thread_mask; cpumask_t core_mask; cpumask_t book_mask; @@ -34,6 +35,7 @@ extern cpumask_t cpus_with_topology; #define topology_book_cpumask(cpu) (&cpu_topology[cpu].book_mask) #define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id) #define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask) +#define topology_cpu_dedicated(cpu) (cpu_topology[cpu].dedicated) #define mc_capable() 1 diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1cee6753d47a..b9fc9b00d845 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -800,6 +800,8 @@ void __init smp_detect_cpus(void) */ static void smp_start_secondary(void *cpuvoid) { + int cpu = smp_processor_id(); + S390_lowcore.last_update_clock = get_tod_clock(); S390_lowcore.restart_stack = (unsigned long) restart_stack; S390_lowcore.restart_fn = (unsigned long) do_restart; @@ -813,8 +815,12 @@ static void smp_start_secondary(void *cpuvoid) init_cpu_timer(); vtime_init(); pfault_init(); - notify_cpu_starting(smp_processor_id()); - set_cpu_online(smp_processor_id(), true); + notify_cpu_starting(cpu); + if (topology_cpu_dedicated(cpu)) + set_cpu_flag(CIF_DEDICATED_CPU); + else + clear_cpu_flag(CIF_DEDICATED_CPU); + set_cpu_online(cpu, true); inc_irq_stat(CPU_RST); local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index ed0bdd220e1a..d49940fb43b1 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -133,6 +133,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core, topo->socket_id = socket->id; 
topo->core_id = rcore; topo->thread_id = lcpu + i; + topo->dedicated = tl_core->d; cpumask_set_cpu(lcpu + i, &drawer->mask); cpumask_set_cpu(lcpu + i, &book->mask); cpumask_set_cpu(lcpu + i, &socket->mask); @@ -273,6 +274,14 @@ void store_topology(struct sysinfo_15_1_x *info) stsi(info, 15, 1, topology_mnest_limit()); } +static void __arch_update_dedicated_flag(void *arg) +{ + if (topology_cpu_dedicated(smp_processor_id())) + set_cpu_flag(CIF_DEDICATED_CPU); + else + clear_cpu_flag(CIF_DEDICATED_CPU); +} + static int __arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; @@ -298,6 +307,7 @@ int arch_update_cpu_topology(void) int cpu, rc; rc = __arch_update_cpu_topology(); + on_each_cpu(__arch_update_dedicated_flag, NULL, 0); for_each_online_cpu(cpu) { dev = get_cpu_device(cpu); kobject_uevent(&dev->kobj, KOBJ_CHANGE); @@ -435,9 +445,39 @@ static struct attribute_group topology_cpu_attr_group = { .attrs = topology_cpu_attrs, }; +static ssize_t cpu_dedicated_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cpu = dev->id; + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + count = sprintf(buf, "%d\n", topology_cpu_dedicated(cpu)); + mutex_unlock(&smp_cpu_state_mutex); + return count; +} +static DEVICE_ATTR(dedicated, 0444, cpu_dedicated_show, NULL); + +static struct attribute *topology_extra_cpu_attrs[] = { + &dev_attr_dedicated.attr, + NULL, +}; + +static struct attribute_group topology_extra_cpu_attr_group = { + .attrs = topology_extra_cpu_attrs, +}; + int topology_cpu_init(struct cpu *cpu) { - return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); + int rc; + + rc = sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); + if (rc || !MACHINE_HAS_TOPOLOGY) + return rc; + rc = sysfs_create_group(&cpu->dev.kobj, &topology_extra_cpu_attr_group); + if (rc) + sysfs_remove_group(&cpu->dev.kobj, &topology_cpu_attr_group); + return rc; } static const struct cpumask *cpu_thread_mask(int cpu) @@ -509,6 +549,7 @@ void __init topology_init_early(void) alloc_masks(info, &drawer_info, 3); out: __arch_update_cpu_topology(); + __arch_update_dedicated_flag(NULL); } static inline int topology_get_mode(int enabled) From 8153380379ecc8381f6d55f6497de31a36c75aa5 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sun, 4 Dec 2016 14:36:04 +0100 Subject: [PATCH 0183/1085] s390/spinlock: use the cpu number +1 as spinlock value The queued spinlock code will come out simpler if the encoding of the CPU that holds the spinlock is (cpu+1) instead of (~cpu). 
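Concretely, 0 now always means "unlocked" and CPU n stores n + 1, so decoding the owner becomes a subtraction instead of a bitwise NOT (sketch):

    lockval = cpu + 1;        /* encode: arch_spin_lockval(cpu), 0 reserved for unlocked */
    owner_cpu = lockval - 1;  /* decode: previously ~lockval */

which is why every arch_vcpu_is_preempted(~owner) in the hunks below becomes arch_vcpu_is_preempted(owner - 1).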
Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/spinlock.h | 2 +- arch/s390/lib/spinlock.c | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 8182b521c42f..6727cc30d59b 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -48,7 +48,7 @@ static inline void arch_spin_relax(arch_spinlock_t *lock) static inline u32 arch_spin_lockval(int cpu) { - return ~cpu; + return cpu + 1; } static inline int arch_spin_value_unlocked(arch_spinlock_t lock) diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index b12663d653d8..ee73bcca7e6f 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -67,8 +67,8 @@ void arch_spin_lock_wait(arch_spinlock_t *lp) /* Pass the virtual CPU to the lock holder if it is not running */ owner = arch_load_niai4(&lp->lock); - if (owner && arch_vcpu_is_preempted(~owner)) - smp_yield_cpu(~owner); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); count = spin_retry; while (1) { @@ -87,8 +87,8 @@ void arch_spin_lock_wait(arch_spinlock_t *lp) * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) - smp_yield_cpu(~owner); + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); } } EXPORT_SYMBOL(arch_spin_lock_wait); @@ -102,8 +102,8 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) /* Pass the virtual CPU to the lock holder if it is not running */ owner = arch_load_niai4(&lp->lock); - if (owner && arch_vcpu_is_preempted(~owner)) - smp_yield_cpu(~owner); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); count = spin_retry; while (1) { @@ -124,8 +124,8 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) * yield the CPU unconditionally. For LPAR rely on the * sense running status. 
*/ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) - smp_yield_cpu(~owner); + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); } } EXPORT_SYMBOL(arch_spin_lock_wait_flags); @@ -158,8 +158,8 @@ void _raw_read_lock_wait(arch_rwlock_t *rw) owner = 0; while (1) { if (count-- <= 0) { - if (owner && arch_vcpu_is_preempted(~owner)) - smp_yield_cpu(~owner); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); count = spin_retry; } old = ACCESS_ONCE(rw->lock); @@ -198,8 +198,8 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, int prev) owner = 0; while (1) { if (count-- <= 0) { - if (owner && arch_vcpu_is_preempted(~owner)) - smp_yield_cpu(~owner); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); count = spin_retry; } old = ACCESS_ONCE(rw->lock); @@ -226,8 +226,8 @@ void _raw_write_lock_wait(arch_rwlock_t *rw) owner = 0; while (1) { if (count-- <= 0) { - if (owner && arch_vcpu_is_preempted(~owner)) - smp_yield_cpu(~owner); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); count = spin_retry; } old = ACCESS_ONCE(rw->lock); @@ -265,8 +265,8 @@ void arch_lock_relax(int cpu) { if (!cpu) return; - if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(~cpu)) + if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(cpu - 1)) return; - smp_yield_cpu(~cpu); + smp_yield_cpu(cpu - 1); } EXPORT_SYMBOL(arch_lock_relax); From b96f7d881ad94203e997cd2aa7112d4a06d121ef Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 24 Mar 2017 17:25:02 +0100 Subject: [PATCH 0184/1085] s390/spinlock: introduce spinlock wait queueing The queued spinlock code for s390 follows the principles of the common code qspinlock implementation but with a few notable differences. The format of the spinlock_t locking word differs: s390 needs to store the logical CPU number of the lock holder in the spinlock_t to be able to use the diagnose 9c directed yield hypervisor call. The inline code sequences for spin_lock and spin_unlock are nice and short. The inline portion of a spin_lock now typically looks like this: lhi %r0,0 # 0 indicates an empty lock l %r1,0x3a0 # CPU number + 1 from lowcore cs %r0,%r1, # lock operation jnz call_wait # on failure call wait function locked: ... call_wait: la %r2, brasl %r14,arch_spin_lock_wait j locked A spin_unlock is as simple as before: lhi %r0,0 sth %r0,2(%r2) # unlock operation After a CPU has queued itself it may not enable interrupts again for the arch_spin_lock_flags() variant. The arch_spin_lock_wait_flags wait function is removed. To improve performance the code implements opportunistic lock stealing. If the wait function finds a spinlock_t that indicates that the lock is free but there are queued waiters, the CPU may steal the lock up to three times without queueing itself. Lock stealing updates the steal counter in the lock word to prevent more than 3 steals. The counter is reset at the time the CPU next in the queue successfully takes the lock. While the queued spinlocks improve performance in a system with dedicated CPUs, in a virtualized environment with continuously overcommitted CPUs the queued spinlocks can have a negative effect on performance. This is due to the fact that a queued CPU that is preempted by the hypervisor will block the queue at some point even without holding the lock. With the classic spinlock it does not matter if a CPU that waits for the lock is preempted. 
Therefore use the queued spinlock code only if the system runs with dedicated CPUs and fall back to classic spinlocks when running with shared CPUs. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/lowcore.h | 5 +- arch/s390/include/asm/spinlock.h | 18 +-- arch/s390/kernel/setup.c | 2 + arch/s390/kernel/smp.c | 4 + arch/s390/lib/spinlock.c | 264 +++++++++++++++++++++++-------- 5 files changed, 215 insertions(+), 78 deletions(-) diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index a6870ea6ea8b..62943af36ac6 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -133,8 +133,9 @@ struct lowcore { __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ __u64 gmap; /* 0x03b8 */ __u32 spinlock_lockval; /* 0x03c0 */ - __u32 fpu_flags; /* 0x03c4 */ - __u8 pad_0x03c8[0x0400-0x03c8]; /* 0x03c8 */ + __u32 spinlock_index; /* 0x03c4 */ + __u32 fpu_flags; /* 0x03c8 */ + __u8 pad_0x03cc[0x0400-0x03cc]; /* 0x03cc */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 6727cc30d59b..2da4a6d13f54 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -36,15 +36,11 @@ bool arch_vcpu_is_preempted(int cpu); */ void arch_lock_relax(int cpu); +void arch_spin_relax(arch_spinlock_t *lock); void arch_spin_lock_wait(arch_spinlock_t *); int arch_spin_trylock_retry(arch_spinlock_t *); -void arch_spin_lock_wait_flags(arch_spinlock_t *, unsigned long flags); - -static inline void arch_spin_relax(arch_spinlock_t *lock) -{ - arch_lock_relax(lock->lock); -} +void arch_spin_lock_setup(int cpu); static inline u32 arch_spin_lockval(int cpu) { @@ -64,8 +60,7 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lp) static inline int arch_spin_trylock_once(arch_spinlock_t *lp) { barrier(); - return likely(arch_spin_value_unlocked(*lp) && - __atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL)); + return likely(__atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL)); } static inline void arch_spin_lock(arch_spinlock_t *lp) @@ -78,7 +73,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lp, unsigned long flags) { if (!arch_spin_trylock_once(lp)) - arch_spin_lock_wait_flags(lp, flags); + arch_spin_lock_wait(lp); } static inline int arch_spin_trylock(arch_spinlock_t *lp) @@ -95,8 +90,9 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) #ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES " .long 0xb2fa0070\n" /* NIAI 7 */ #endif - " st %1,%0\n" - : "=Q" (lp->lock) : "d" (0) : "cc", "memory"); + " sth %1,%0\n" + : "=Q" (((unsigned short *) &lp->lock)[1]) + : "d" (0) : "cc", "memory"); } /* diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 164a1e16b53e..b2c9af9b88d5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -380,6 +380,8 @@ static void __init setup_lowcore(void) #ifdef CONFIG_SMP lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; + arch_spin_lock_setup(0); #endif set_prefix((u32)(unsigned long) lc); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index b9fc9b00d845..cc04b74fbb84 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -226,6 +226,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->mcesad = mcesa_origin | mcesa_bits; lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); + lc->spinlock_index = 0; if (vdso_alloc_per_cpu(lc)) goto out; lowcore_ptr[cpu] = lc; @@ -273,6 +274,7 @@ static void 
pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); + lc->spinlock_index = 0; lc->percpu_offset = __per_cpu_offset[cpu]; lc->kernel_asce = S390_lowcore.kernel_asce; lc->machine_flags = S390_lowcore.machine_flags; @@ -281,6 +283,7 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) save_access_regs((unsigned int *) lc->access_regs_save_area); memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, MAX_FACILITY_BIT/8); + arch_spin_lock_setup(cpu); } static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) @@ -967,6 +970,7 @@ void __init smp_setup_processor_id(void) pcpu_devices[0].address = stap(); S390_lowcore.cpu_nr = 0; S390_lowcore.spinlock_lockval = arch_spin_lockval(0); + S390_lowcore.spinlock_index = 0; } /* diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ee73bcca7e6f..6747134227cd 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include #include int spin_retry = -1; @@ -32,6 +34,40 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); +struct spin_wait { + struct spin_wait *next, *prev; + int node_id; +} __aligned(32); + +static DEFINE_PER_CPU_ALIGNED(struct spin_wait, spin_wait[4]); + +#define _Q_LOCK_CPU_OFFSET 0 +#define _Q_LOCK_STEAL_OFFSET 16 +#define _Q_TAIL_IDX_OFFSET 18 +#define _Q_TAIL_CPU_OFFSET 20 + +#define _Q_LOCK_CPU_MASK 0x0000ffff +#define _Q_LOCK_STEAL_ADD 0x00010000 +#define _Q_LOCK_STEAL_MASK 0x00030000 +#define _Q_TAIL_IDX_MASK 0x000c0000 +#define _Q_TAIL_CPU_MASK 0xfff00000 + +#define _Q_LOCK_MASK (_Q_LOCK_CPU_MASK | _Q_LOCK_STEAL_MASK) +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +void arch_spin_lock_setup(int cpu) +{ + struct spin_wait *node; + int ix; + + node = per_cpu_ptr(&spin_wait[0], cpu); + for (ix = 0; ix < 4; ix++, node++) { + memset(node, 0, sizeof(*node)); + node->node_id = ((cpu + 1) << _Q_TAIL_CPU_OFFSET) + + (ix << _Q_TAIL_IDX_OFFSET); + } +} + static inline int arch_load_niai4(int *lock) { int owner; @@ -60,76 +96,161 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new) return expected == old; } +static inline struct spin_wait *arch_spin_decode_tail(int lock) +{ + int ix, cpu; + + ix = (lock & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + cpu = (lock & _Q_TAIL_CPU_MASK) >> _Q_TAIL_CPU_OFFSET; + return per_cpu_ptr(&spin_wait[ix], cpu - 1); +} + +static inline int arch_spin_yield_target(int lock, struct spin_wait *node) +{ + if (lock & _Q_LOCK_CPU_MASK) + return lock & _Q_LOCK_CPU_MASK; + if (node == NULL || node->prev == NULL) + return 0; /* 0 -> no target cpu */ + while (node->prev) + node = node->prev; + return node->node_id >> _Q_TAIL_CPU_OFFSET; +} + +static inline void arch_spin_lock_queued(arch_spinlock_t *lp) +{ + struct spin_wait *node, *next; + int lockval, ix, node_id, tail_id, old, new, owner, count; + + ix = S390_lowcore.spinlock_index++; + barrier(); + lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + node = this_cpu_ptr(&spin_wait[ix]); + node->prev = node->next = NULL; + node_id = node->node_id; + + /* Enqueue the node for this CPU in the spinlock wait queue */ + while (1) { + old = READ_ONCE(lp->lock); + if ((old & _Q_LOCK_CPU_MASK) == 0 && + (old & _Q_LOCK_STEAL_MASK) != _Q_LOCK_STEAL_MASK) { + /* + * The lock is free but there may be waiters. 
+ * With no waiters simply take the lock, if there + * are waiters try to steal the lock. The lock may + * be stolen three times before the next queued + * waiter will get the lock. + */ + new = (old ? (old + _Q_LOCK_STEAL_ADD) : 0) | lockval; + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + /* Got the lock */ + goto out; + /* lock passing in progress */ + continue; + } + /* Make the node of this CPU the new tail. */ + new = node_id | (old & _Q_LOCK_MASK); + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + break; + } + /* Set the 'next' pointer of the tail node in the queue */ + tail_id = old & _Q_TAIL_MASK; + if (tail_id != 0) { + node->prev = arch_spin_decode_tail(tail_id); + WRITE_ONCE(node->prev->next, node); + } + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_spin_yield_target(old, node); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + + /* Spin on the CPU local node->prev pointer */ + if (tail_id != 0) { + count = spin_retry; + while (READ_ONCE(node->prev) != NULL) { + if (count-- >= 0) + continue; + count = spin_retry; + /* Query running state of lock holder again. */ + owner = arch_spin_yield_target(old, node); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } + } + + /* Spin on the lock value in the spinlock_t */ + count = spin_retry; + while (1) { + old = READ_ONCE(lp->lock); + owner = old & _Q_LOCK_CPU_MASK; + if (!owner) { + tail_id = old & _Q_TAIL_MASK; + new = ((tail_id != node_id) ? tail_id : 0) | lockval; + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + /* Got the lock */ + break; + continue; + } + if (count-- >= 0) + continue; + count = spin_retry; + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } + + /* Pass lock_spin job to next CPU in the queue */ + if (node_id && tail_id != node_id) { + /* Wait until the next CPU has set up the 'next' pointer */ + while ((next = READ_ONCE(node->next)) == NULL) + ; + next->prev = NULL; + } + + out: + S390_lowcore.spinlock_index--; +} + +static inline void arch_spin_lock_classic(arch_spinlock_t *lp) +{ + int lockval, old, new, owner, count; + + lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_spin_yield_target(ACCESS_ONCE(lp->lock), NULL); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + + count = spin_retry; + while (1) { + old = arch_load_niai4(&lp->lock); + owner = old & _Q_LOCK_CPU_MASK; + /* Try to get the lock if it is free. */ + if (!owner) { + new = (old & _Q_TAIL_MASK) | lockval; + if (arch_cmpxchg_niai8(&lp->lock, old, new)) + /* Got the lock */ + return; + continue; + } + if (count-- >= 0) + continue; + count = spin_retry; + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } +} + void arch_spin_lock_wait(arch_spinlock_t *lp) { - int cpu = SPINLOCK_LOCKVAL; - int owner, count; - - /* Pass the virtual CPU to the lock holder if it is not running */ - owner = arch_load_niai4(&lp->lock); - if (owner && arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - - count = spin_retry; - while (1) { - owner = arch_load_niai4(&lp->lock); - /* Try to get the lock if it is free. */ - if (!owner) { - if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) - return; - continue; - } - if (count-- >= 0) - continue; - count = spin_retry; - /* - * For multiple layers of hypervisors, e.g. z/VM + LPAR - * yield the CPU unconditionally. 
For LPAR rely on the - * sense running status. - */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - } + /* Use classic spinlocks + niai if the steal time is >= 10% */ + if (test_cpu_flag(CIF_DEDICATED_CPU)) + arch_spin_lock_queued(lp); + else + arch_spin_lock_classic(lp); } EXPORT_SYMBOL(arch_spin_lock_wait); -void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) -{ - int cpu = SPINLOCK_LOCKVAL; - int owner, count; - - local_irq_restore(flags); - - /* Pass the virtual CPU to the lock holder if it is not running */ - owner = arch_load_niai4(&lp->lock); - if (owner && arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - - count = spin_retry; - while (1) { - owner = arch_load_niai4(&lp->lock); - /* Try to get the lock if it is free. */ - if (!owner) { - local_irq_disable(); - if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) - return; - local_irq_restore(flags); - continue; - } - if (count-- >= 0) - continue; - count = spin_retry; - /* - * For multiple layers of hypervisors, e.g. z/VM + LPAR - * yield the CPU unconditionally. For LPAR rely on the - * sense running status. - */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - } -} -EXPORT_SYMBOL(arch_spin_lock_wait_flags); - int arch_spin_trylock_retry(arch_spinlock_t *lp) { int cpu = SPINLOCK_LOCKVAL; @@ -270,3 +391,16 @@ void arch_lock_relax(int cpu) smp_yield_cpu(cpu - 1); } EXPORT_SYMBOL(arch_lock_relax); + +void arch_spin_relax(arch_spinlock_t *lp) +{ + int cpu; + + cpu = READ_ONCE(lp->lock) & _Q_LOCK_CPU_MASK; + if (!cpu) + return; + if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(cpu - 1)) + return; + smp_yield_cpu(cpu - 1); +} +EXPORT_SYMBOL(arch_spin_relax); From eb3b7b848fb3dd00f7a57d633d4ae4d194aa7865 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 24 Mar 2017 17:32:23 +0100 Subject: [PATCH 0185/1085] s390/rwlock: introduce rwlock wait queueing Like the common queued rwlock code the s390 implementation uses the queued spinlock code on a spinlock_t embedded in the rwlock_t to achieve the queueing. The encoding of the rwlock_t differs though: the counter field in the rwlock_t is split into two parts. The upper two bytes hold the write bit and the write wait counter, the lower two bytes hold the read counter. The arch_read_lock operation works exactly like the common qrwlock but the enqueue operation for a writer follows a different logic. After the failed inline try to get the rwlock in write, the writer first increases the write wait counter, acquires the wait spin_lock for the queueing, and then loops until there are no readers and the write bit is zero. Without the write wait counter, a CPU that just released the rwlock could immediately reacquire the lock in the inline code, bypassing all outstanding read and write waiters. For s390 this would cause massive imbalances in favour of writers in case of a contended rwlock. 
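Reading the hunks below together with the description above, the cnts word can be pictured like this (an interpretation sketch; the 0x30000 write value presumably sets the write bit plus one write-wait unit in a single cmpxchg):

    /*
     * rwlock_t::cnts
     *   0x0000ffff  read counter (arch_read_lock: __atomic_add(1, &rw->cnts))
     *   0xffff0000  write bit + write wait counter
     *
     * arch_write_lock:   cmpxchg 0 -> 0x30000, else arch_write_lock_wait()
     * arch_write_unlock: __atomic_add_barrier(-0x30000, &rw->cnts)
     * arch_read_lock:    must wait while (cnts & 0xffff0000) != 0
     */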
Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/atomic_ops.h | 32 ++++-- arch/s390/include/asm/spinlock.h | 144 ++++--------------------- arch/s390/include/asm/spinlock_types.h | 4 +- arch/s390/lib/spinlock.c | 142 ++++++------------------ 4 files changed, 75 insertions(+), 247 deletions(-) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index ba6d29412344..daec181fa1f7 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -39,19 +39,24 @@ __ATOMIC_OPS(__atomic64_xor, long, "laxg") #undef __ATOMIC_OPS #undef __ATOMIC_OP -static inline void __atomic_add_const(int val, int *ptr) -{ - asm volatile( - " asi %[ptr],%[val]\n" - : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc"); +#define __ATOMIC_CONST_OP(op_name, op_type, op_string, op_barrier) \ +static inline void op_name(op_type val, op_type *ptr) \ +{ \ + asm volatile( \ + op_string " %[ptr],%[val]\n" \ + op_barrier \ + : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc", "memory");\ } -static inline void __atomic64_add_const(long val, long *ptr) -{ - asm volatile( - " agsi %[ptr],%[val]\n" - : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc"); -} +#define __ATOMIC_CONST_OPS(op_name, op_type, op_string) \ + __ATOMIC_CONST_OP(op_name, op_type, op_string, "\n") \ + __ATOMIC_CONST_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + +__ATOMIC_CONST_OPS(__atomic_add_const, int, "asi") +__ATOMIC_CONST_OPS(__atomic64_add_const, long, "agsi") + +#undef __ATOMIC_CONST_OPS +#undef __ATOMIC_CONST_OP #else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ @@ -107,6 +112,11 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr") #undef __ATOMIC64_OPS +#define __atomic_add_const(val, ptr) __atomic_add(val, ptr) +#define __atomic_add_const_barrier(val, ptr) __atomic_add(val, ptr) +#define __atomic64_add_const(val, ptr) __atomic64_add(val, ptr) +#define __atomic64_add_const_barrier(val, ptr) __atomic64_add(val, ptr) + #endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ static inline int __atomic_cmpxchg(int *ptr, int old, int new) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 2da4a6d13f54..09e783d83d5d 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -35,7 +35,6 @@ bool arch_vcpu_is_preempted(int cpu); * (the type definitions are in asm/spinlock_types.h) */ -void arch_lock_relax(int cpu); void arch_spin_relax(arch_spinlock_t *lock); void arch_spin_lock_wait(arch_spinlock_t *); @@ -110,164 +109,63 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -#define arch_read_can_lock(x) ((int)(x)->lock >= 0) +#define arch_read_can_lock(x) (((x)->cnts & 0xffff0000) == 0) /** * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. 
*/ -#define arch_write_can_lock(x) ((x)->lock == 0) - -extern int _raw_read_trylock_retry(arch_rwlock_t *lp); -extern int _raw_write_trylock_retry(arch_rwlock_t *lp); +#define arch_write_can_lock(x) ((x)->cnts == 0) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) +#define arch_read_relax(rw) barrier() +#define arch_write_relax(rw) barrier() -static inline int arch_read_trylock_once(arch_rwlock_t *rw) -{ - int old = ACCESS_ONCE(rw->lock); - return likely(old >= 0 && - __atomic_cmpxchg_bool(&rw->lock, old, old + 1)); -} - -static inline int arch_write_trylock_once(arch_rwlock_t *rw) -{ - int old = ACCESS_ONCE(rw->lock); - return likely(old == 0 && - __atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000)); -} - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - -#define __RAW_OP_OR "lao" -#define __RAW_OP_AND "lan" -#define __RAW_OP_ADD "laa" - -#define __RAW_LOCK(ptr, op_val, op_string) \ -({ \ - int old_val; \ - \ - typecheck(int *, ptr); \ - asm volatile( \ - op_string " %0,%2,%1\n" \ - "bcr 14,0\n" \ - : "=d" (old_val), "+Q" (*ptr) \ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#define __RAW_UNLOCK(ptr, op_val, op_string) \ -({ \ - int old_val; \ - \ - typecheck(int *, ptr); \ - asm volatile( \ - op_string " %0,%2,%1\n" \ - : "=d" (old_val), "+Q" (*ptr) \ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -extern void _raw_read_lock_wait(arch_rwlock_t *lp); -extern void _raw_write_lock_wait(arch_rwlock_t *lp, int prev); +void arch_read_lock_wait(arch_rwlock_t *lp); +void arch_write_lock_wait(arch_rwlock_t *lp); static inline void arch_read_lock(arch_rwlock_t *rw) { int old; - old = __RAW_LOCK(&rw->lock, 1, __RAW_OP_ADD); - if (old < 0) - _raw_read_lock_wait(rw); + old = __atomic_add(1, &rw->cnts); + if (old & 0xffff0000) + arch_read_lock_wait(rw); } static inline void arch_read_unlock(arch_rwlock_t *rw) { - __RAW_UNLOCK(&rw->lock, -1, __RAW_OP_ADD); + __atomic_add_const_barrier(-1, &rw->cnts); } static inline void arch_write_lock(arch_rwlock_t *rw) { - int old; - - old = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR); - if (old != 0) - _raw_write_lock_wait(rw, old); - rw->owner = SPINLOCK_LOCKVAL; + if (!__atomic_cmpxchg_bool(&rw->cnts, 0, 0x30000)) + arch_write_lock_wait(rw); } static inline void arch_write_unlock(arch_rwlock_t *rw) { - rw->owner = 0; - __RAW_UNLOCK(&rw->lock, 0x7fffffff, __RAW_OP_AND); + __atomic_add_barrier(-0x30000, &rw->cnts); } -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -extern void _raw_read_lock_wait(arch_rwlock_t *lp); -extern void _raw_write_lock_wait(arch_rwlock_t *lp); - -static inline void arch_read_lock(arch_rwlock_t *rw) -{ - if (!arch_read_trylock_once(rw)) - _raw_read_lock_wait(rw); -} - -static inline void arch_read_unlock(arch_rwlock_t *rw) -{ - int old; - - do { - old = ACCESS_ONCE(rw->lock); - } while (!__atomic_cmpxchg_bool(&rw->lock, old, old - 1)); -} - -static inline void arch_write_lock(arch_rwlock_t *rw) -{ - if (!arch_write_trylock_once(rw)) - _raw_write_lock_wait(rw); - rw->owner = SPINLOCK_LOCKVAL; -} - -static inline void arch_write_unlock(arch_rwlock_t *rw) -{ - typecheck(int, rw->lock); - - rw->owner = 0; - asm volatile( - "st %1,%0\n" - : "+Q" (rw->lock) - : "d" (0) - : "cc", "memory"); -} - -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ static inline int arch_read_trylock(arch_rwlock_t *rw) { - if (!arch_read_trylock_once(rw)) - return _raw_read_trylock_retry(rw); - return 1; + int old; + + old = READ_ONCE(rw->cnts); + return (!(old & 
0xffff0000) && + __atomic_cmpxchg_bool(&rw->cnts, old, old + 1)); } static inline int arch_write_trylock(arch_rwlock_t *rw) { - if (!arch_write_trylock_once(rw) && !_raw_write_trylock_retry(rw)) - return 0; - rw->owner = SPINLOCK_LOCKVAL; - return 1; -} + int old; -static inline void arch_read_relax(arch_rwlock_t *rw) -{ - arch_lock_relax(rw->owner); -} - -static inline void arch_write_relax(arch_rwlock_t *rw) -{ - arch_lock_relax(rw->owner); + old = READ_ONCE(rw->cnts); + return !old && __atomic_cmpxchg_bool(&rw->cnts, 0, 0x30000); } #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index fe755eec275f..271b4da94fd0 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -12,8 +12,8 @@ typedef struct { #define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, } typedef struct { - int lock; - int owner; + int cnts; + arch_spinlock_t wait; } arch_rwlock_t; #define __ARCH_RW_LOCK_UNLOCKED { 0 } diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 6747134227cd..43b0d46c3786 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -268,129 +268,49 @@ int arch_spin_trylock_retry(arch_spinlock_t *lp) } EXPORT_SYMBOL(arch_spin_trylock_retry); -void _raw_read_lock_wait(arch_rwlock_t *rw) +void arch_read_lock_wait(arch_rwlock_t *rw) { - int count = spin_retry; - int owner, old; - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - __RAW_LOCK(&rw->lock, -1, __RAW_OP_ADD); -#endif - owner = 0; - while (1) { - if (count-- <= 0) { - if (owner && arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - count = spin_retry; - } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); - if (old < 0) - continue; - if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1)) - return; + if (unlikely(in_interrupt())) { + while (READ_ONCE(rw->cnts) & 0x10000) + barrier(); + return; } -} -EXPORT_SYMBOL(_raw_read_lock_wait); -int _raw_read_trylock_retry(arch_rwlock_t *rw) + /* Remove this reader again to allow recursive read locking */ + __atomic_add_const(-1, &rw->cnts); + /* Put the reader into the wait queue */ + arch_spin_lock(&rw->wait); + /* Now add this reader to the count value again */ + __atomic_add_const(1, &rw->cnts); + /* Loop until the writer is done */ + while (READ_ONCE(rw->cnts) & 0x10000) + barrier(); + arch_spin_unlock(&rw->wait); +} +EXPORT_SYMBOL(arch_read_lock_wait); + +void arch_write_lock_wait(arch_rwlock_t *rw) { - int count = spin_retry; int old; - while (count-- > 0) { - old = ACCESS_ONCE(rw->lock); - if (old < 0) - continue; - if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1)) - return 1; - } - return 0; -} -EXPORT_SYMBOL(_raw_read_trylock_retry); + /* Add this CPU to the write waiters */ + __atomic_add(0x20000, &rw->cnts); -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + /* Put the writer into the wait queue */ + arch_spin_lock(&rw->wait); -void _raw_write_lock_wait(arch_rwlock_t *rw, int prev) -{ - int count = spin_retry; - int owner, old; - - owner = 0; while (1) { - if (count-- <= 0) { - if (owner && arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - count = spin_retry; - } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); - smp_mb(); - if (old >= 0) { - prev = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR); - old = prev; - } - if ((old & 0x7fffffff) == 0 && prev >= 0) + old = READ_ONCE(rw->cnts); + if ((old & 0x1ffff) == 0 && + __atomic_cmpxchg_bool(&rw->cnts, old, old | 0x10000)) + /* Got the lock */ break; + barrier(); } + 
+ arch_spin_unlock(&rw->wait); } -EXPORT_SYMBOL(_raw_write_lock_wait); - -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -void _raw_write_lock_wait(arch_rwlock_t *rw) -{ - int count = spin_retry; - int owner, old, prev; - - prev = 0x80000000; - owner = 0; - while (1) { - if (count-- <= 0) { - if (owner && arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - count = spin_retry; - } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); - if (old >= 0 && - __atomic_cmpxchg_bool(&rw->lock, old, old | 0x80000000)) - prev = old; - else - smp_mb(); - if ((old & 0x7fffffff) == 0 && prev >= 0) - break; - } -} -EXPORT_SYMBOL(_raw_write_lock_wait); - -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -int _raw_write_trylock_retry(arch_rwlock_t *rw) -{ - int count = spin_retry; - int old; - - while (count-- > 0) { - old = ACCESS_ONCE(rw->lock); - if (old) - continue; - if (__atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000)) - return 1; - } - return 0; -} -EXPORT_SYMBOL(_raw_write_trylock_retry); - -void arch_lock_relax(int cpu) -{ - if (!cpu) - return; - if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(cpu - 1)) - return; - smp_yield_cpu(cpu - 1); -} -EXPORT_SYMBOL(arch_lock_relax); +EXPORT_SYMBOL(arch_write_lock_wait); void arch_spin_relax(arch_spinlock_t *lp) { From 63fef14fc98a8b4fad777fd3bef4d068802b3f14 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 18 Aug 2017 17:24:00 +0900 Subject: [PATCH 0186/1085] kprobes/x86: Make insn buffer always ROX and use text_poke() Make the insn buffer always ROX and use text_poke() to write the copied instructions instead of set_memory_*(). This makes the instruction buffer more robust against other kernel subsystems, because there is no longer a window of time in which it can be modified. Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150304463032.17009.14195368040691676813.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/common.h | 6 +-- arch/x86/kernel/kprobes/core.c | 61 ++++++++++++++++++++----------- arch/x86/kernel/kprobes/opt.c | 63 ++++++++++++++++++-------------- kernel/kprobes.c | 2 +- 4 files changed, 80 insertions(+), 52 deletions(-) diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index db2182d63ed0..e2c2a1970869 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -75,11 +75,11 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, * Copy an instruction and adjust the displacement if the instruction * uses the %rip-relative addressing mode.
*/ -extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn); +extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn); /* Generate a relative-jump/call instruction */ -extern void synthesize_reljump(void *from, void *to); -extern void synthesize_relcall(void *from, void *to); +extern void synthesize_reljump(void *dest, void *from, void *to); +extern void synthesize_relcall(void *dest, void *from, void *to); #ifdef CONFIG_OPTPROBES extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f0153714ddac..b48e0efd668e 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -119,29 +119,29 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); static nokprobe_inline void -__synthesize_relative_insn(void *from, void *to, u8 op) +__synthesize_relative_insn(void *dest, void *from, void *to, u8 op) { struct __arch_relative_insn { u8 op; s32 raddr; } __packed *insn; - insn = (struct __arch_relative_insn *)from; + insn = (struct __arch_relative_insn *)dest; insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); insn->op = op; } /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -void synthesize_reljump(void *from, void *to) +void synthesize_reljump(void *dest, void *from, void *to) { - __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); + __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE); } NOKPROBE_SYMBOL(synthesize_reljump); /* Insert a call instruction at address 'from', which calls address 'to'.*/ -void synthesize_relcall(void *from, void *to) +void synthesize_relcall(void *dest, void *from, void *to) { - __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); + __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE); } NOKPROBE_SYMBOL(synthesize_relcall); @@ -346,10 +346,11 @@ static int is_IF_modifier(kprobe_opcode_t *insn) /* * Copy an instruction with recovering modified instruction by kprobes * and adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * addressing mode. Note that since @real will be the final place of the copied + * instruction, the displacement must be adjusted relative to @real, not @dest. * This returns the length of copied instruction, or 0 if it has an error. */ -int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) +int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) { kprobe_opcode_t buf[MAX_INSN_SIZE]; unsigned long recovered_insn = @@ -387,11 +388,11 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) * have given.
*/ newdisp = (u8 *) src + (s64) insn->displacement.value - - (u8 *) dest; + - (u8 *) real; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", - src, dest, insn->displacement.value); + src, real, insn->displacement.value); return 0; } disp = (u8 *) dest + insn_offset_displacement(insn); @@ -402,20 +403,38 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) } /* Prepare reljump right after instruction to boost */ -static void prepare_boost(struct kprobe *p, struct insn *insn) +static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, + struct insn *insn) { + int len = insn->length; + if (can_boost(insn, p->addr) && - MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) { + MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) { /* * These instructions can be executed directly if it * jumps back to correct address. */ - synthesize_reljump(p->ainsn.insn + insn->length, + synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); + len += RELATIVEJUMP_SIZE; p->ainsn.boostable = true; } else { p->ainsn.boostable = false; } + + return len; +} + +/* Make the page RO when allocating it */ +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (page) + set_memory_ro((unsigned long)page & PAGE_MASK, 1); + + return page; } /* Recover page to RW mode before releasing it */ @@ -429,12 +448,11 @@ void free_insn_page(void *page) static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; int len; - set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1); - /* Copy an instruction with recovering if other optprobe modifies it.*/ - len = __copy_instruction(p->ainsn.insn, p->addr, &insn); + len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn); if (!len) return -EINVAL; @@ -442,15 +460,16 @@ static int arch_copy_kprobe(struct kprobe *p) * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check.
*/ - prepare_boost(p, &insn); - - set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1); + len = prepare_boost(buf, p, &insn); /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); + p->ainsn.if_modifier = is_IF_modifier(buf); /* Also, displacement change doesn't affect the first byte */ - p->opcode = p->ainsn.insn[0]; + p->opcode = buf[0]; + + /* OK, write back the instruction(s) into ROX insn buffer */ + text_poke(p->ainsn.insn, buf, len); return 0; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 4f98aad38237..22e65f0b8b34 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -184,13 +184,13 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) } NOKPROBE_SYMBOL(optimized_callback); -static int copy_optimized_instructions(u8 *dest, u8 *src) +static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) { struct insn insn; int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len, &insn); + ret = __copy_instruction(dest + len, src + len, real, &insn); if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; @@ -343,57 +343,66 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *__unused) { - u8 *buf; - int ret; + u8 *buf = NULL, *slot; + int ret, len; long rel; if (!can_optimize((unsigned long)op->kp.addr)) return -EILSEQ; - op->optinsn.insn = get_optinsn_slot(); - if (!op->optinsn.insn) + buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL); + if (!buf) return -ENOMEM; + op->optinsn.insn = slot = get_optinsn_slot(); + if (!slot) { + ret = -ENOMEM; + goto out; + } + /* * Verify if the address gap is in 2GB range, because this uses * a relative jump. 
*/ - rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE; if (abs(rel) > 0x7fffffff) { - __arch_remove_optimized_kprobe(op, 0); - return -ERANGE; + ret = -ERANGE; + goto err; } - buf = (u8 *)op->optinsn.insn; - set_memory_rw((unsigned long)buf & PAGE_MASK, 1); - - /* Copy instructions into the out-of-line buffer */ - ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); - if (ret < 0) { - __arch_remove_optimized_kprobe(op, 0); - return ret; - } - op->optinsn.size = ret; - /* Copy arch-dep-instance from template */ memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr, + slot + TMPL_END_IDX); + if (ret < 0) + goto err; + op->optinsn.size = ret; + len = TMPL_END_IDX + op->optinsn.size; + /* Set probe information */ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); /* Set probe function call */ - synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + synthesize_relcall(buf + TMPL_CALL_IDX, + slot + TMPL_CALL_IDX, optimized_callback); /* Set returning jmp instruction at the tail of out-of-line buffer */ - synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + synthesize_reljump(buf + len, slot + len, (u8 *)op->kp.addr + op->optinsn.size); + len += RELATIVEJUMP_SIZE; - set_memory_ro((unsigned long)buf & PAGE_MASK, 1); + /* We have to use text_poke for the instruction buffer because it is RO */ + text_poke(slot, buf, len); + ret = 0; +out: + kfree(buf); + return ret; - flush_icache_range((unsigned long) buf, - (unsigned long) buf + TMPL_END_IDX + - op->optinsn.size + RELATIVEJUMP_SIZE); - return 0; +err: + __arch_remove_optimized_kprobe(op, 0); + goto out; } /* diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a1606a4224e1..15fba7fe57c8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -117,7 +117,7 @@ enum kprobe_slot_state { SLOT_USED = 2, }; -static void *alloc_insn_page(void) +void __weak *alloc_insn_page(void) { return module_alloc(PAGE_SIZE); } From a8976fc84b644e3b567ea2bafad3b53b21ed6b6c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 18 Aug 2017 17:25:08 +0900 Subject: [PATCH 0187/1085] kprobes/x86: Remove addressof() operators The following commit: 54a7d50b9205 ("x86: mark kprobe templates as character arrays, not single characters") changed optprobe_template_* to arrays, so we can remove the addressof() operators from those symbols. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S.
Miller Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150304469798.17009.15886717935027472863.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kprobes.h | 4 ++-- arch/x86/kernel/kprobes/opt.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 6cf65437b5e5..9f2e3102e0bb 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -58,8 +58,8 @@ extern __visible kprobe_opcode_t optprobe_template_call[]; extern __visible kprobe_opcode_t optprobe_template_end[]; #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) #define MAX_OPTINSN_SIZE \ - (((unsigned long)&optprobe_template_end - \ - (unsigned long)&optprobe_template_entry) + \ + (((unsigned long)optprobe_template_end - \ + (unsigned long)optprobe_template_entry) + \ MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) extern const int kretprobe_blacklist_size; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 22e65f0b8b34..0cae7c0f32ec 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -142,11 +142,11 @@ void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_MOVE_IDX \ - ((long)&optprobe_template_val - (long)&optprobe_template_entry) + ((long)optprobe_template_val - (long)optprobe_template_entry) #define TMPL_CALL_IDX \ - ((long)&optprobe_template_call - (long)&optprobe_template_entry) + ((long)optprobe_template_call - (long)optprobe_template_entry) #define TMPL_END_IDX \ - ((long)&optprobe_template_end - (long)&optprobe_template_entry) + ((long)optprobe_template_end - (long)optprobe_template_entry) #define INT3_SIZE sizeof(kprobe_opcode_t) @@ -371,7 +371,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, } /* Copy arch-dep-instance from template */ - memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + memcpy(buf, optprobe_template_entry, TMPL_END_IDX); /* Copy instructions into the out-of-line buffer */ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr, From 3539d09154e11336c31a900a9cd49e386ba6d9b2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 19 Sep 2017 18:59:00 +0900 Subject: [PATCH 0188/1085] kprobes: Improve smoke test to check preemptibility Add a preemptible() check to each handler. Handlers are called with preemption disabled, which is guaranteed by Documentation/kprobes.txt. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150581513991.32348.7956810394499654272.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/test_kprobes.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 0dbab6d1acb4..47106a1e645a 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -34,6 +34,10 @@ static noinline u32 kprobe_target(u32 value) static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) { + if (preemptible()) { + handler_errors++; + pr_err("pre-handler is preemptible\n"); + } preh_val = (rand1 / div_factor); return 0; } @@ -41,6 +45,10 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { + if (preemptible()) { + handler_errors++; + pr_err("post-handler is preemptible\n"); + } if (preh_val != (rand1 / div_factor)) { handler_errors++; pr_err("incorrect value in post_handler\n"); @@ -156,6 +164,10 @@ static int test_kprobes(void) static u32 j_kprobe_target(u32 value) { + if (preemptible()) { + handler_errors++; + pr_err("jprobe-handler is preemptible\n"); + } if (value != rand1) { handler_errors++; pr_err("incorrect value in jprobe handler\n"); @@ -232,6 +244,10 @@ static u32 krph_val; static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { + if (preemptible()) { + handler_errors++; + pr_err("kretprobe entry handler is preemptible\n"); + } krph_val = (rand1 / div_factor); return 0; } @@ -240,6 +256,10 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); + if (preemptible()) { + handler_errors++; + pr_err("kretprobe return handler is preemptible\n"); + } if (ret != (rand1 / div_factor)) { handler_errors++; pr_err("incorrect value in kretprobe handler\n"); From cd52edad55fbcd8064877a77d31445b2fb4b85c3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 19 Sep 2017 18:59:39 +0900 Subject: [PATCH 0189/1085] kprobes/x86: Move the get_kprobe_ctlblk() into irq-disabled block Since get_kprobe_ctlblk() accesses per-cpu variables, which involves calling smp_processor_id(), it must be called with preemption or IRQs disabled. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150581517952.32348.2655896843219158446.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/opt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 0cae7c0f32ec..f55810305f9a 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -154,7 +154,6 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func); static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); unsigned long flags; /* This is possible if op is under delayed unoptimizing */ @@ -165,6 +164,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); /* Save skipped registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; From e863d5396146411b615231cae0c518cb2a23371c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 19 Sep 2017 19:00:19 +0900 Subject: [PATCH 0190/1085] kprobes: Warn if optprobe handler tries to change execution path Warn if an optprobe handler tries to change the execution path. As described in Documentation/kprobes.txt, a user handler cannot change the instruction pointer from an optprobe. In that case the user must keep the kprobe from being optimized by setting a post_handler or break_handler. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150581521955.32348.3615624715034787365.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 15fba7fe57c8..2d28377a0e32 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -387,7 +387,10 @@ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) list_for_each_entry_rcu(kp, &p->list, list) { if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); - kp->pre_handler(kp, regs); + if (kp->pre_handler(kp, regs)) { + if (WARN_ON_ONCE(1)) + pr_err("Optprobe ignores instruction pointer changing.(%pF)\n", p->addr); + } } reset_kprobe_instance(); } From 9a09f261a4fa52de916b0db34a36956c95f78fdc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 19 Sep 2017 19:00:59 +0900 Subject: [PATCH 0191/1085] kprobes/x86: Disable preemption in optprobe Disable preemption in the optprobe handler as described in Documentation/kprobes.txt, which says: "Probe handlers are run with preemption disabled." Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150581525942.32348.6359217983269060829.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/opt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index f55810305f9a..32c35cb3550c 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -161,6 +161,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) return; local_irq_save(flags); + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { @@ -180,6 +181,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } + preempt_enable_no_resched(); local_irq_restore(flags); } NOKPROBE_SYMBOL(optimized_callback); From 5bb4fc2d8641219732eb2bb654206775a4219aca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 19 Sep 2017 19:01:40 +0900 Subject: [PATCH 0192/1085] kprobes/x86: Disable preemption in ftrace-based jprobes Disable preemption in ftrace-based jprobe handlers as described in Documentation/kprobes.txt: "Probe handlers are run with preemption disabled." This will fix jprobes behavior when CONFIG_PREEMPT=y. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150581530024.32348.9863783558598926771.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/ftrace.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 041f7b6dfa0f..bcfee4f69b0e 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -26,7 +26,7 @@ #include "common.h" static nokprobe_inline -int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, +void __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, unsigned long orig_ip) { /* @@ -41,20 +41,21 @@ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, __this_cpu_write(current_kprobe, NULL); if (orig_ip) regs->ip = orig_ip; - return 1; } int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - if (kprobe_ftrace(p)) - return __skip_singlestep(p, regs, kcb, 0); - else - return 0; + if (kprobe_ftrace(p)) { + __skip_singlestep(p, regs, kcb, 0); + preempt_enable_no_resched(); + return 1; + } + return 0; } NOKPROBE_SYMBOL(skip_singlestep); -/* Ftrace callback handler for kprobes */ +/* Ftrace callback handler for kprobes -- called under preempt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) { @@ -77,13 +78,17 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); + /* To emulate trap-based kprobes, preempt_disable here */ + preempt_disable(); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, regs)) + if (!p->pre_handler || !p->pre_handler(p, regs)) { __skip_singlestep(p, regs, kcb, orig_ip); + preempt_enable_no_resched(); + } /* * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe.
+ * resets current kprobe, and keep preempt count +1. */ } end: From a19b2e3d783964d48d2b494439648e929bcdc976 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 19 Sep 2017 19:02:20 +0900 Subject: [PATCH 0193/1085] kprobes/x86: Remove IRQ disabling from ftrace-based/optimized kprobes Kprobes don't need to disable IRQs if they are called from the ftrace/jump trampoline code, because Documentation/kprobes.txt says: ----- Probe handlers are run with preemption disabled. Depending on the architecture and optimization state, handlers may also run with interrupts disabled (e.g., kretprobe handlers and optimized kprobe handlers run without interrupt disabled on x86/x86-64). ----- So let's remove IRQ disabling from those handlers. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150581534039.32348.11331736206004264553.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/ftrace.c | 9 ++------- arch/x86/kernel/kprobes/opt.c | 4 ---- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index bcfee4f69b0e..8dc0161cec8f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -61,14 +61,11 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, { struct kprobe *p; struct kprobe_ctlblk *kcb; - unsigned long flags; - - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); + /* Preempt is disabled by ftrace */ p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - goto end; + return; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -91,8 +88,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, * resets current kprobe, and keep preempt count +1. */ } -end: - local_irq_restore(flags); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 32c35cb3550c..e941136e24d8 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -154,13 +154,10 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func); static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - unsigned long flags; - /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - local_irq_save(flags); preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); @@ -182,7 +179,6 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) __this_cpu_write(current_kprobe, NULL); } preempt_enable_no_resched(); - local_irq_restore(flags); } NOKPROBE_SYMBOL(optimized_callback); From 00d96180dc38ef872ac471c2d3e14b067cbd895d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:30 -0500 Subject: [PATCH 0194/1085] objtool: Don't report end of section error after an empty unwind hint If asm code specifies an UNWIND_HINT_EMPTY hint, don't warn if the section ends unexpectedly. This can happen with the xen-head.S code because the hypercall_page is "text" but it's all zeros.
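[ A condensed view of the change, for readers skimming the diff below: UNWIND_HINT_EMPTY leaves the call frame address undefined, so an undefined CFA base at the end of a section now means "unreachable padding" rather than an error. This paraphrases the hunk that follows; the hunk is authoritative. ]

	if (!next_insn) {
		/* e.g. the zero-filled hypercall_page: declared as text,
		 * but with no instruction stream to fall off of */
		if (state.cfa.base == CFI_UNDEFINED)
			return 0;
		WARN("%s: unexpected end of section", sec->name);
		return 1;
	}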
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ddafe199dd8797e40e3c2777373347eba1d65572.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/check.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index a0c518ecf085..83f370fa00c2 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1752,11 +1752,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, if (insn->dead_end) return 0; - insn = next_insn; - if (!insn) { + if (!next_insn) { + if (state.cfa.base == CFI_UNDEFINED) + return 0; WARN("%s: unexpected end of section", sec->name); return 1; } + + insn = next_insn; } return 0; From 17270717e80de33a884ad328fea5f407d87f6d6a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:31 -0500 Subject: [PATCH 0195/1085] x86/head: Remove confusing comment This comment is actively wrong and confusing. It refers to the registers' stack offsets after the pt_regs has been constructed on the stack, but this code is *before* that. At this point the stack just has the standard iret frame, for which no comment should be needed. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a3c267b770fc56c9b86df9c11c552848248aace2.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 513cbb012ecc..3b04e4c99389 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -270,10 +270,6 @@ bad_address: __INIT ENTRY(early_idt_handler_array) - # 104(%rsp) %rflags - # 96(%rsp) %cs - # 88(%rsp) %rip - # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 From a8b88e84d124bc92c4808e72b8b8c0e0bb538630 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:32 -0500 Subject: [PATCH 0196/1085] x86/head: Remove unused 'bad_address' code It's no longer possible for this code to be executed, so remove it. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/32a46fe92d2083700599b36872b26e7dfd7b7965.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3b04e4c99389..afb0a1e22d41 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,9 +265,6 @@ ENDPROC(start_cpu0) .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA -bad_address: - jmp bad_address - __INIT ENTRY(early_idt_handler_array) i = 0 From 015a2ea5478680fc5216d56b7ff306f2a74efaf9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:33 -0500 Subject: [PATCH 0197/1085] x86/head: Fix head ELF function annotations These functions aren't callable C-type functions, so don't annotate them as such. 
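[ For reference, simplified from include/linux/linkage.h of this era; see the tree for the exact definitions. END() only records the symbol size, while ENDPROC() additionally types the symbol as a callable function, which is the property these symbols lack. ]

	#define END(name) \
		.size name, .-name

	#define ENDPROC(name) \
		.type name, @function; \
		END(name)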
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/36eb182738c28514f8bf95e403d89b6413a88883.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index afb0a1e22d41..edacd579d504 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,7 +234,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(secondary_startup_64) +END(secondary_startup_64) #include "verify_cpu.S" @@ -277,7 +277,7 @@ ENTRY(early_idt_handler_array) i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) +END(early_idt_handler_array) early_idt_handler_common: /* @@ -320,7 +320,7 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) jmp restore_regs_and_iret -ENDPROC(early_idt_handler_common) +END(early_idt_handler_common) __INITDATA From e93db75a0054b23a874a12c63376753544f3fe9e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:34 -0500 Subject: [PATCH 0198/1085] x86/boot: Annotate verify_cpu() as a callable function verify_cpu() is a callable function. Annotate it as such. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/293024b8a080832075312f38c07ccc970fc70292.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/verify_cpu.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 014ea59aa153..3d3c2f71f617 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -33,7 +33,7 @@ #include #include -verify_cpu: +ENTRY(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -139,3 +139,4 @@ verify_cpu: popf # Restore caller passed flags xorl %eax, %eax ret +ENDPROC(verify_cpu) From 2582d3df95c76d3b686453baf90b64d57e87d1e8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:35 -0500 Subject: [PATCH 0199/1085] x86/xen: Fix xen head ELF annotations Mark the ends of the startup_xen and hypercall_page code sections. 
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3a80a394d30af43d9cefa1a29628c45ed8420c97.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-head.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index a7525e95d53f..9753225289e8 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -33,7 +33,7 @@ ENTRY(startup_xen) mov $init_thread_union+THREAD_SIZE, %_ASM_SP jmp xen_start_kernel - +END(startup_xen) __FINIT #endif @@ -47,7 +47,7 @@ ENTRY(hypercall_page) .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 #include #undef HYPERCALL - +END(hypercall_page) .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") From abbe1cac6214d81d2f4e149aba64a8760703144e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:36 -0500 Subject: [PATCH 0200/1085] x86/xen: Add unwind hint annotations Add unwind hint annotations to the xen head code so the ORC unwinder can read head_64.o. hypercall_page needs empty annotations at 32-byte intervals to match the 'xen_hypercall_*' ELF functions at those locations. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/70ed2eb516fe9266be766d953f93c2571bca88cc.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-head.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 9753225289e8..124941d09b2b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ #ifdef CONFIG_XEN_PV __INIT ENTRY(startup_xen) + UNWIND_HINT_EMPTY cld /* Clear .bss */ @@ -40,7 +42,10 @@ END(startup_xen) .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE + .rept (PAGE_SIZE / 32) + UNWIND_HINT_EMPTY + .skip 32 + .endr #define HYPERCALL(n) \ .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ From 2704fbb672d0d9a19414907fda7949283dcef6a1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:37 -0500 Subject: [PATCH 0201/1085] x86/head: Add unwind hint annotations Jiri Slaby reported an ORC issue when unwinding from an idle task. The stack was: ffffffff811083c2 do_idle+0x142/0x1e0 ffffffff8110861d cpu_startup_entry+0x5d/0x60 ffffffff82715f58 start_kernel+0x3ff/0x407 ffffffff827153e8 x86_64_start_kernel+0x14e/0x15d ffffffff810001bf secondary_startup_64+0x9f/0xa0 The ORC unwinder errored out at secondary_startup_64 because the head code isn't annotated yet so there wasn't a corresponding ORC entry. Fix that and any other head-related unwinding issues by adding unwind hints to the head code. 
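[ Background sketch: each unwind hint records a small structure that objtool later converts into ORC data. Simplified from arch/x86/include/asm/orc_types.h as of this series; check the tree for the authoritative layout. ]

	struct unwind_hint {
		u32	ip;		/* code address the hint applies to */
		s16	sp_offset;	/* frame offset from sp_reg */
		u8	sp_reg;		/* base register; UNWIND_HINT_EMPTY uses
					 * ORC_REG_UNDEFINED here */
		u8	type;		/* call frame, pt_regs, iret frame, ... */
	};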
Reported-by: Jiri Slaby Tested-by: Jiri Slaby Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/78ef000a2f68f545d6eef44ee912edceaad82ccf.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/head_64.S | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fd0a7895b63f..d8e2b700d1db 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,6 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n -OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index edacd579d504..42e32c2e51bb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -49,6 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table @@ -88,6 +89,7 @@ startup_64: addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. @@ -132,6 +134,7 @@ ENTRY(secondary_startup_64) movq $1f, %rax jmp *%rax 1: + UNWIND_HINT_EMPTY /* Check if nx is implemented */ movl $0x80000001, %eax @@ -246,6 +249,7 @@ END(secondary_startup_64) */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp + UNWIND_HINT_EMPTY jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif @@ -270,13 +274,18 @@ ENTRY(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 - pushq $0 # Dummy error code, to make stack frame uniform + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr + UNWIND_HINT_IRET_REGS offset=16 END(early_idt_handler_array) early_idt_handler_common: @@ -305,6 +314,7 @@ early_idt_handler_common: pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS cmpq $14,%rsi /* Page fault? */ jnz 10f @@ -427,7 +437,7 @@ ENTRY(phys_base) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" - + __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE From 30c23f29d2d5e3ce8e24f7dec550b7e15142aaef Mon Sep 17 00:00:00 2001 From: Miguel Bernal Marin Date: Mon, 25 Sep 2017 18:03:49 -0500 Subject: [PATCH 0202/1085] locking/x86: Use named operands in rwsem.h Since GCC version 3.1 it is possible to specify input and output operands using symbolic names, which can be referenced within the assembler code. Converting to named operands makes it easier to understand and maintain the code in the future. Update operands in asm/rwsem.h accordingly. Signed-off-by: Miguel Bernal Marin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170925230349.18834-1-miguel.bernal.marin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/rwsem.h | 53 ++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 7116b7931c7b..a8f486e7a124 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -63,14 +63,14 @@ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" - LOCK_PREFIX _ASM_INC "(%1)\n\t" + LOCK_PREFIX _ASM_INC "(%[sem])\n\t" /* adds 0x00000001 */ " jns 1f\n" " call call_rwsem_down_read_failed\n" "1:\n\t" "# ending down_read\n\t" : "+m" (sem->count) - : "a" (sem) + : [sem] "a" (sem) : "memory", "cc"); } @@ -81,17 +81,18 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) { long result, tmp; asm volatile("# beginning __down_read_trylock\n\t" - " mov %0,%1\n\t" + " mov %[count],%[result]\n\t" "1:\n\t" - " mov %1,%2\n\t" - " add %3,%2\n\t" + " mov %[result],%[tmp]\n\t" + " add %[inc],%[tmp]\n\t" " jle 2f\n\t" - LOCK_PREFIX " cmpxchg %2,%0\n\t" + LOCK_PREFIX " cmpxchg %[tmp],%[count]\n\t" " jnz 1b\n\t" "2:\n\t" "# ending __down_read_trylock\n\t" - : "+m" (sem->count), "=&a" (result), "=&r" (tmp) - : "i" (RWSEM_ACTIVE_READ_BIAS) + : [count] "+m" (sem->count), [result] "=&a" (result), + [tmp] "=&r" (tmp) + : [inc] "i" (RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); return result >= 0; } @@ -105,7 +106,7 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) struct rw_semaphore* ret; \ \ asm volatile("# beginning down_write\n\t" \ - LOCK_PREFIX " xadd %1,(%4)\n\t" \ + LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" \ /* adds 0xffff0001, returns the old value */ \ " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ /* was the active mask 0 before? */\ @@ -113,9 +114,9 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) " call " slow_path "\n" \ "1:\n" \ "# ending down_write" \ - : "+m" (sem->count), "=d" (tmp), \ + : "+m" (sem->count), [tmp] "=d" (tmp), \ "=a" (ret), ASM_CALL_CONSTRAINT \ - : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ + : [sem] "a" (sem), "[tmp]" (RWSEM_ACTIVE_WRITE_BIAS) \ : "memory", "cc"); \ ret; \ }) @@ -141,21 +142,21 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem) bool result; long tmp0, tmp1; asm volatile("# beginning __down_write_trylock\n\t" - " mov %0,%1\n\t" + " mov %[count],%[tmp0]\n\t" "1:\n\t" " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" /* was the active mask 0 before? 
*/ " jnz 2f\n\t" - " mov %1,%2\n\t" - " add %4,%2\n\t" - LOCK_PREFIX " cmpxchg %2,%0\n\t" + " mov %[tmp0],%[tmp1]\n\t" + " add %[inc],%[tmp1]\n\t" + LOCK_PREFIX " cmpxchg %[tmp1],%[count]\n\t" " jnz 1b\n\t" "2:\n\t" CC_SET(e) "# ending __down_write_trylock\n\t" - : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1), - CC_OUT(e) (result) - : "er" (RWSEM_ACTIVE_WRITE_BIAS) + : [count] "+m" (sem->count), [tmp0] "=&a" (tmp0), + [tmp1] "=&r" (tmp1), CC_OUT(e) (result) + : [inc] "er" (RWSEM_ACTIVE_WRITE_BIAS) : "memory"); return result; } @@ -167,14 +168,14 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" + LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n" "# ending __up_read\n" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS) + : "+m" (sem->count), [tmp] "=d" (tmp) + : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); } @@ -185,14 +186,14 @@ static inline void __up_write(struct rw_semaphore *sem) { long tmp; asm volatile("# beginning __up_write\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" + LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" /* subtracts 0xffff0001, returns the old value */ " jns 1f\n\t" " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n\t" "# ending __up_write\n" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS) + : "+m" (sem->count), [tmp] "=d" (tmp) + : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_WRITE_BIAS) : "memory", "cc"); } @@ -202,7 +203,7 @@ static inline void __up_write(struct rw_semaphore *sem) static inline void __downgrade_write(struct rw_semaphore *sem) { asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" + LOCK_PREFIX _ASM_ADD "%[inc],(%[sem])\n\t" /* * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) @@ -212,7 +213,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) "1:\n\t" "# ending __downgrade_write\n" : "+m" (sem->count) - : "a" (sem), "er" (-RWSEM_WAITING_BIAS) + : [sem] "a" (sem), [inc] "er" (-RWSEM_WAITING_BIAS) : "memory", "cc"); } From 564c9cc84e2adf8a6671c1937f0a9fe3da2a4b0e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 2 Sep 2017 13:09:45 -0700 Subject: [PATCH 0203/1085] locking/refcounts, x86/asm: Use unique .text section for refcount exceptions Using .text.unlikely for refcount exceptions isn't safe because gcc may move entire functions into .text.unlikely (e.g. in6_dev_dev()), which would cause any uses of a protected refcount_t function to stay inline with the function, triggering the protection unconditionally: .section .text.unlikely,"ax",@progbits .type in6_dev_get, @function in6_dev_getx: .LFB4673: .loc 2 4128 0 .cfi_startproc ... lock; incl 480(%rbx) js 111f .pushsection .text.unlikely 111: lea 480(%rbx), %rcx 112: .byte 0x0f, 0xff .popsection 113: This creates a unique .text..refcount section and adds an additional test to the exception handler to WARN in the case of having none of OF, SF, nor ZF set so we can see things like this more easily in the future. The double dot for the section name keeps it out of the TEXT_MAIN macro namespace, to avoid collisions and so it can be put at the end with text.unlikely to keep the cold code together. See commit: cb87481ee89db ("kbuild: linker script do not match C names unless LD_DEAD_CODE_DATA_ELIMINATION is configured") ... 
which matches C names: [a-zA-Z0-9_] but not ".". Reported-by: Mike Galbraith Signed-off-by: Kees Cook Cc: Ard Biesheuvel Cc: Elena Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch Fixes: 7a46ec0e2f48 ("locking/refcounts, x86/asm: Implement fast refcount overflow protection") Link: http://lkml.kernel.org/r/1504382986-49301-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/refcount.h | 2 +- arch/x86/mm/extable.c | 7 ++++++- include/asm-generic/vmlinux.lds.h | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h index ff871210b9f2..4e44250e7d0d 100644 --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -15,7 +15,7 @@ * back to the regular execution flow in .text. */ #define _REFCOUNT_EXCEPTION \ - ".pushsection .text.unlikely\n" \ + ".pushsection .text..refcount\n" \ "111:\tlea %[counter], %%" _ASM_CX "\n" \ "112:\t" ASM_UD0 "\n" \ ASM_UNREACHABLE \ diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index c3521e2be396..3321b446b66c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -67,12 +67,17 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup, * wrapped around) will be set. Additionally, seeing the refcount * reach 0 will set ZF (Zero Flag: result was zero). In each of * these cases we want a report, since it's a boundary condition. - * + * The SF case is not reported since it indicates post-boundary + * manipulations below zero or above INT_MAX. And if none of the + * flags are set, something has gone very wrong, so report it. */ if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { bool zero = regs->flags & X86_EFLAGS_ZF; refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } else if ((regs->flags & X86_EFLAGS_SF) == 0) { + /* Report if none of OF, ZF, nor SF are set. */ + refcount_error_report(regs, "unexpected saturation"); } return true; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8acfc1e099e1..e549bff87c5b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -459,6 +459,7 @@ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ *(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \ + *(.text..refcount) \ *(.ref.text) \ MEM_KEEP(init.text) \ MEM_KEEP(exit.text) \ From 39208aa7ecb7d9c4e86df782b5693270313cbab1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 2 Sep 2017 13:09:46 -0700 Subject: [PATCH 0204/1085] locking/refcounts, x86/asm: Enable CONFIG_ARCH_HAS_REFCOUNT With the section inlining bug fixed for the x86 refcount protection, we can turn the config back on. Signed-off-by: Kees Cook Cc: Ard Biesheuvel Cc: Elena Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch Link: http://lkml.kernel.org/r/1504382986-49301-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 971feac13506..90535646b83d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,7 +55,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 # Causing hangs/crashes, see the commit that added this change for details. 
- select ARCH_HAS_REFCOUNT if BROKEN + select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN From 79761ce80aa0232157e428bde28c0cef6d43ac5f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 27 Sep 2017 11:22:23 +0100 Subject: [PATCH 0205/1085] x86/apic: Fix spelling mistake: "symmectic" -> "symmetric" Trivial fix to spelling mistakes in pr_info messages. Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Cc: Dou Liyang Link: https://lkml.kernel.org/r/20170927102223.31920-1-colin.king@canonical.com --- arch/x86/kernel/apic/apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ca5ec3fddc49..a1ca2c08f532 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1296,11 +1296,11 @@ void __init apic_intr_mode_init(void) default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO: - pr_info("APIC: Switch to symmectic I/O mode setup\n"); + pr_info("APIC: Switch to symmetric I/O mode setup\n"); default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO_NO_ROUTING: - pr_info("APIC: Switch to symmectic I/O mode setup in no SMP routine\n"); + pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); break; } From 88ae80aa609c7db3e3cc98e97e05badbcc6347dc Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 25 Sep 2017 10:10:42 -0700 Subject: [PATCH 0206/1085] EDAC, skx_edac: Handle systems with segmented PCI busses Large systems separate their PCI busses into segments since the limit of only 256 PCI busses can be too restrictive. Extend this driver to check whether the PCI segment also matches when deciding how to group memory controller PCI devices to CPU sockets. Signed-off-by: Tony Luck Cc: Aristeu Rozanski Cc: Charles Rose Cc: Mauro Carvalho Chehab Cc: Qiuxu Zhuo Cc: linux-edac Link: http://lkml.kernel.org/r/f58abfd10bf73c8bc5adc1fe4de7408128b00625.1506358467.git.tony.luck@intel.com [ Make skx_dev.seg an int.
] Signed-off-by: Borislav Petkov --- drivers/edac/skx_edac.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index 9ca40a3fe03f..912c4930c9ef 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -67,6 +67,7 @@ static u64 skx_tolm, skx_tohm; struct skx_dev { struct list_head list; u8 bus[4]; + int seg; struct pci_dev *sad_all; struct pci_dev *util_all; u32 mcroute; @@ -112,12 +113,12 @@ struct decoded_addr { int bank_group; }; -static struct skx_dev *get_skx_dev(u8 bus, u8 idx) +static struct skx_dev *get_skx_dev(struct pci_bus *bus, u8 idx) { struct skx_dev *d; list_for_each_entry(d, &skx_edac_list, list) { - if (d->bus[idx] == bus) + if (d->seg == pci_domain_nr(bus) && d->bus[idx] == bus->number) return d; } @@ -174,6 +175,7 @@ static int get_all_bus_mappings(void) pci_dev_put(pdev); return -ENOMEM; } + d->seg = pci_domain_nr(pdev->bus); pci_read_config_dword(pdev, 0xCC, ®); d->bus[0] = GET_BITFIELD(reg, 0, 7); d->bus[1] = GET_BITFIELD(reg, 8, 15); @@ -209,7 +211,7 @@ static int get_all_munits(const struct munit *m) if (i == NUM_IMC) goto fail; } - d = get_skx_dev(pdev->bus->number, m->busidx); + d = get_skx_dev(pdev->bus, m->busidx); if (!d) goto fail; From 5bce9db1894c998c5b85a34036d679ea6517668f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 29 Aug 2017 17:01:03 +0300 Subject: [PATCH 0207/1085] perf/core: Explain perf_sched_mutex To clarify why atomic_inc_return(&perf_sched_events) is not sufficient and a mutex is needed to order static branch enabling vs the atomic counter increment, this adds a comment with a short explanation. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170829140103.6563-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..5ee62714f9a6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9394,6 +9394,11 @@ static void account_event(struct perf_event *event) inc = true; if (inc) { + /* + * We need the mutex here because static_branch_enable() + * must complete *before* the perf_sched_count increment + * becomes visible. + */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; From a47ba4d77e1236d214e5116b5631bc4c2d6e6369 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 14:46:30 -0700 Subject: [PATCH 0208/1085] perf/x86: Enable free running PEBS for REGS_USER/INTR Currently free running PEBS is disabled when user or interrupt registers are requested. Most of the registers are actually available in the PEBS record and can be supported. So we just need to check for the supported registers and then allow it: it is all except for the segment register. For user registers this only works when the counter is limited to ring 3 only, so this also needs to be checked. 
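To make the two constraints concrete, here is a hedged user-space sketch (illustrative only, not part of the patch; the event type and register mask are assumptions) of a perf_event_attr that satisfies both checks this patch adds: it excludes ring 0 and requests only registers that are present in the PEBS record.

	#include <string.h>
	#include <linux/perf_event.h>
	#include <asm/perf_regs.h>

	/* Sketch: a ring-3-only cycles event whose sampled user registers
	 * are all captured by PEBS, so the PMI-free (free running) PEBS
	 * path can remain enabled under the checks this patch adds. */
	static void init_free_running_pebs_attr(struct perf_event_attr *attr)
	{
		memset(attr, 0, sizeof(*attr));
		attr->size = sizeof(*attr);
		attr->type = PERF_TYPE_HARDWARE;
		attr->config = PERF_COUNT_HW_CPU_CYCLES;
		attr->precise_ip = 3;		/* request PEBS sampling */
		attr->exclude_kernel = 1;	/* ring 3 only, so REGS_USER is usable */
		attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_USER;
		/* No segment registers: those are not in the PEBS record. */
		attr->sample_regs_user = (1ULL << PERF_REG_X86_IP) |
					 (1ULL << PERF_REG_X86_SP) |
					 (1ULL << PERF_REG_X86_AX);
	}

An event that also sampled kernel state, or that asked for a segment register in sample_regs_user, would instead fall back to the PMI-based PEBS path.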
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170831214630.21892-1-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 4 ++++ arch/x86/events/perf_event.h | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 829e89cfcee2..cfd91a03304d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) if (event->attr.use_clockid) flags &= ~PERF_SAMPLE_TIME; + if (!event->attr.exclude_kernel) + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_user & ~PEBS_REGS) + flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4196f81ec0e1..f7aaadf9331f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -85,13 +85,15 @@ struct amd_nb { * Flags PEBS can handle without an PMI. * * TID can only be handled by flushing at context switch. + * REGS_USER can be handled for events limited to ring 3. * */ #define PEBS_FREERUNNING_FLAGS \ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) /* * A debug store configuration. @@ -110,6 +112,26 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +#define PEBS_REGS \ + (PERF_REG_X86_AX | \ + PERF_REG_X86_BX | \ + PERF_REG_X86_CX | \ + PERF_REG_X86_DX | \ + PERF_REG_X86_DI | \ + PERF_REG_X86_SI | \ + PERF_REG_X86_SP | \ + PERF_REG_X86_BP | \ + PERF_REG_X86_IP | \ + PERF_REG_X86_FLAGS | \ + PERF_REG_X86_R8 | \ + PERF_REG_X86_R9 | \ + PERF_REG_X86_R10 | \ + PERF_REG_X86_R11 | \ + PERF_REG_X86_R12 | \ + PERF_REG_X86_R13 | \ + PERF_REG_X86_R14 | \ + PERF_REG_X86_R15) + /* * Per register state. */ From eecd49c4624a2c35e74f6f4e6352edf7eba545ca Mon Sep 17 00:00:00 2001 From: Patrick Steuer Date: Mon, 18 Sep 2017 12:48:08 +0200 Subject: [PATCH 0209/1085] s390/crypto: add inline assembly for KMA instruction to cpacf.h Signed-off-by: Patrick Steuer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/cpacf.h | 52 ++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index e06f2556b316..cbf11aa3e6e9 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -1,7 +1,7 @@ /* * CP Assist for Cryptographic Functions (CPACF) * - * Copyright IBM Corp. 2003, 2016 + * Copyright IBM Corp. 
2003, 2017 * Author(s): Thomas Spatzier * Jan Glauber * Harald Freudenberger (freude@de.ibm.com) @@ -133,6 +133,22 @@ #define CPACF_PRNO_TRNG_Q_R2C_RATIO 0x70 #define CPACF_PRNO_TRNG 0x72 +/* + * Function codes for the KMA (CIPHER MESSAGE WITH AUTHENTICATION) + * instruction + */ +#define CPACF_KMA_QUERY 0x00 +#define CPACF_KMA_GCM_AES_128 0x12 +#define CPACF_KMA_GCM_AES_192 0x13 +#define CPACF_KMA_GCM_AES_256 0x14 + +/* + * Flags for the KMA (CIPHER MESSAGE WITH AUTHENTICATION) instruction + */ +#define CPACF_KMA_LPC 0x100 /* Last-Plaintext/Ciphertext */ +#define CPACF_KMA_LAAD 0x200 /* Last-AAD */ +#define CPACF_KMA_HS 0x400 /* Hash-subkey Supplied */ + typedef struct { unsigned char bytes[16]; } cpacf_mask_t; /** @@ -178,6 +194,8 @@ static inline int __cpacf_check_opcode(unsigned int opcode) return test_facility(77); /* check for MSA4 */ case CPACF_PRNO: return test_facility(57); /* check for MSA5 */ + case CPACF_KMA: + return test_facility(146); /* check for MSA8 */ default: BUG(); } @@ -469,4 +487,36 @@ static inline void cpacf_pckmo(long func, void *param) : "cc", "memory"); } +/** + * cpacf_kma() - executes the KMA (CIPHER MESSAGE WITH AUTHENTICATION) + * instruction + * @func: the function code passed to KMA; see CPACF_KMA_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * @aad: address of additional authenticated data memory area + * @aad_len: length of aad operand in bytes + */ +static inline void cpacf_kma(unsigned long func, void *param, u8 *dest, + const u8 *src, unsigned long src_len, + const u8 *aad, unsigned long aad_len) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + register unsigned long r2 asm("2") = (unsigned long) src; + register unsigned long r3 asm("3") = (unsigned long) src_len; + register unsigned long r4 asm("4") = (unsigned long) aad; + register unsigned long r5 asm("5") = (unsigned long) aad_len; + register unsigned long r6 asm("6") = (unsigned long) dest; + + asm volatile( + "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[aad],0\n" + " brc 1,0b\n" /* handle partial completion */ + : [dst] "+a" (r6), [src] "+a" (r2), [slen] "+d" (r3), + [aad] "+a" (r4), [alen] "+d" (r5) + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMA) + : "cc", "memory"); +} + #endif /* _ASM_S390_CPACF_H */ From bf7fa038707c4c7a51a8f3fc1872a7c0d2adf1d3 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Mon, 18 Sep 2017 12:48:09 +0200 Subject: [PATCH 0210/1085] s390/crypto: add s390 platform specific aes gcm support. This patch introduces gcm(aes) support into the aes_s390 kernel module. Signed-off-by: Patrick Steuer Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- arch/s390/crypto/aes_s390.c | 296 +++++++++++++++++++++++++++++++++++- 1 file changed, 293 insertions(+), 3 deletions(-) diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 591cbdf615af..b48e20dd94e9 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -4,9 +4,11 @@ * s390 implementation of the AES Cipher Algorithm. * * s390 Version: - * Copyright IBM Corp. 2005, 2007 + * Copyright IBM Corp. 
2005, 2017 * Author(s): Jan Glauber (jang@de.ibm.com) * Sebastian Siewior (sebastian@breakpoint.cc> SW-Fallback + * Patrick Steuer + * Harald Freudenberger * * Derived from "crypto/aes_generic.c" * @@ -22,20 +24,25 @@ #include #include +#include +#include #include +#include #include #include #include #include #include #include +#include #include #include static u8 *ctrblk; static DEFINE_SPINLOCK(ctrblk_lock); -static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; +static cpacf_mask_t km_functions, kmc_functions, kmctr_functions, + kma_functions; struct s390_aes_ctx { u8 key[AES_MAX_KEY_SIZE]; @@ -55,6 +62,17 @@ struct s390_xts_ctx { struct crypto_skcipher *fallback; }; +struct gcm_sg_walk { + struct scatter_walk walk; + unsigned int walk_bytes; + u8 *walk_ptr; + unsigned int walk_bytes_remain; + u8 buf[AES_BLOCK_SIZE]; + unsigned int buf_bytes; + u8 *ptr; + unsigned int nbytes; +}; + static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { @@ -771,6 +789,267 @@ static struct crypto_alg ctr_aes_alg = { } }; +static int gcm_aes_setkey(struct crypto_aead *tfm, const u8 *key, + unsigned int keylen) +{ + struct s390_aes_ctx *ctx = crypto_aead_ctx(tfm); + + switch (keylen) { + case AES_KEYSIZE_128: + ctx->fc = CPACF_KMA_GCM_AES_128; + break; + case AES_KEYSIZE_192: + ctx->fc = CPACF_KMA_GCM_AES_192; + break; + case AES_KEYSIZE_256: + ctx->fc = CPACF_KMA_GCM_AES_256; + break; + default: + return -EINVAL; + } + + memcpy(ctx->key, key, keylen); + ctx->key_len = keylen; + return 0; +} + +static int gcm_aes_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static void gcm_sg_walk_start(struct gcm_sg_walk *gw, struct scatterlist *sg, + unsigned int len) +{ + memset(gw, 0, sizeof(*gw)); + gw->walk_bytes_remain = len; + scatterwalk_start(&gw->walk, sg); +} + +static int gcm_sg_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) +{ + int n; + + /* minbytesneeded <= AES_BLOCK_SIZE */ + if (gw->buf_bytes && gw->buf_bytes >= minbytesneeded) { + gw->ptr = gw->buf; + gw->nbytes = gw->buf_bytes; + goto out; + } + + if (gw->walk_bytes_remain == 0) { + gw->ptr = NULL; + gw->nbytes = 0; + goto out; + } + + gw->walk_bytes = scatterwalk_clamp(&gw->walk, gw->walk_bytes_remain); + if (!gw->walk_bytes) { + scatterwalk_start(&gw->walk, sg_next(gw->walk.sg)); + gw->walk_bytes = scatterwalk_clamp(&gw->walk, + gw->walk_bytes_remain); + } + gw->walk_ptr = scatterwalk_map(&gw->walk); + + if (!gw->buf_bytes && gw->walk_bytes >= minbytesneeded) { + gw->ptr = gw->walk_ptr; + gw->nbytes = gw->walk_bytes; + goto out; + } + + while (1) { + n = min(gw->walk_bytes, AES_BLOCK_SIZE - gw->buf_bytes); + memcpy(gw->buf + gw->buf_bytes, gw->walk_ptr, n); + gw->buf_bytes += n; + gw->walk_bytes_remain -= n; + scatterwalk_unmap(&gw->walk); + scatterwalk_advance(&gw->walk, n); + scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain); + + if (gw->buf_bytes >= minbytesneeded) { + gw->ptr = gw->buf; + gw->nbytes = gw->buf_bytes; + goto out; + } + + gw->walk_bytes = scatterwalk_clamp(&gw->walk, + gw->walk_bytes_remain); + if (!gw->walk_bytes) { + scatterwalk_start(&gw->walk, sg_next(gw->walk.sg)); + gw->walk_bytes = scatterwalk_clamp(&gw->walk, + gw->walk_bytes_remain); + } + gw->walk_ptr = scatterwalk_map(&gw->walk); + } + +out: + return gw->nbytes; +} + +static void gcm_sg_walk_done(struct gcm_sg_walk *gw, unsigned 
int bytesdone) +{ + int n; + + if (gw->ptr == NULL) + return; + + if (gw->ptr == gw->buf) { + n = gw->buf_bytes - bytesdone; + if (n > 0) { + memmove(gw->buf, gw->buf + bytesdone, n); + gw->buf_bytes -= n; + } else + gw->buf_bytes = 0; + } else { + gw->walk_bytes_remain -= bytesdone; + scatterwalk_unmap(&gw->walk); + scatterwalk_advance(&gw->walk, bytesdone); + scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain); + } +} + +static int gcm_aes_crypt(struct aead_request *req, unsigned int flags) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct s390_aes_ctx *ctx = crypto_aead_ctx(tfm); + unsigned int ivsize = crypto_aead_ivsize(tfm); + unsigned int taglen = crypto_aead_authsize(tfm); + unsigned int aadlen = req->assoclen; + unsigned int pclen = req->cryptlen; + int ret = 0; + + unsigned int len, in_bytes, out_bytes, + min_bytes, bytes, aad_bytes, pc_bytes; + struct gcm_sg_walk gw_in, gw_out; + u8 tag[GHASH_DIGEST_SIZE]; + + struct { + u32 _[3]; /* reserved */ + u32 cv; /* Counter Value */ + u8 t[GHASH_DIGEST_SIZE];/* Tag */ + u8 h[AES_BLOCK_SIZE]; /* Hash-subkey */ + u64 taadl; /* Total AAD Length */ + u64 tpcl; /* Total Plain-/Cipher-text Length */ + u8 j0[GHASH_BLOCK_SIZE];/* initial counter value */ + u8 k[AES_MAX_KEY_SIZE]; /* Key */ + } param; + + /* + * encrypt + * req->src: aad||plaintext + * req->dst: aad||ciphertext||tag + * decrypt + * req->src: aad||ciphertext||tag + * req->dst: aad||plaintext, return 0 or -EBADMSG + * aad, plaintext and ciphertext may be empty. + */ + if (flags & CPACF_DECRYPT) + pclen -= taglen; + len = aadlen + pclen; + + memset(¶m, 0, sizeof(param)); + param.cv = 1; + param.taadl = aadlen * 8; + param.tpcl = pclen * 8; + memcpy(param.j0, req->iv, ivsize); + *(u32 *)(param.j0 + ivsize) = 1; + memcpy(param.k, ctx->key, ctx->key_len); + + gcm_sg_walk_start(&gw_in, req->src, len); + gcm_sg_walk_start(&gw_out, req->dst, len); + + do { + min_bytes = min_t(unsigned int, + aadlen > 0 ? 
aadlen : pclen, AES_BLOCK_SIZE); + in_bytes = gcm_sg_walk_go(&gw_in, min_bytes); + out_bytes = gcm_sg_walk_go(&gw_out, min_bytes); + bytes = min(in_bytes, out_bytes); + + if (aadlen + pclen <= bytes) { + aad_bytes = aadlen; + pc_bytes = pclen; + flags |= CPACF_KMA_LAAD | CPACF_KMA_LPC; + } else { + if (aadlen <= bytes) { + aad_bytes = aadlen; + pc_bytes = (bytes - aadlen) & + ~(AES_BLOCK_SIZE - 1); + flags |= CPACF_KMA_LAAD; + } else { + aad_bytes = bytes & ~(AES_BLOCK_SIZE - 1); + pc_bytes = 0; + } + } + + if (aad_bytes > 0) + memcpy(gw_out.ptr, gw_in.ptr, aad_bytes); + + cpacf_kma(ctx->fc | flags, ¶m, + gw_out.ptr + aad_bytes, + gw_in.ptr + aad_bytes, pc_bytes, + gw_in.ptr, aad_bytes); + + gcm_sg_walk_done(&gw_in, aad_bytes + pc_bytes); + gcm_sg_walk_done(&gw_out, aad_bytes + pc_bytes); + aadlen -= aad_bytes; + pclen -= pc_bytes; + } while (aadlen + pclen > 0); + + if (flags & CPACF_DECRYPT) { + scatterwalk_map_and_copy(tag, req->src, len, taglen, 0); + if (crypto_memneq(tag, param.t, taglen)) + ret = -EBADMSG; + } else + scatterwalk_map_and_copy(param.t, req->dst, len, taglen, 1); + + memzero_explicit(¶m, sizeof(param)); + return ret; +} + +static int gcm_aes_encrypt(struct aead_request *req) +{ + return gcm_aes_crypt(req, CPACF_ENCRYPT); +} + +static int gcm_aes_decrypt(struct aead_request *req) +{ + return gcm_aes_crypt(req, CPACF_DECRYPT); +} + +static struct aead_alg gcm_aes_aead = { + .setkey = gcm_aes_setkey, + .setauthsize = gcm_aes_setauthsize, + .encrypt = gcm_aes_encrypt, + .decrypt = gcm_aes_decrypt, + + .ivsize = GHASH_BLOCK_SIZE - sizeof(u32), + .maxauthsize = GHASH_DIGEST_SIZE, + .chunksize = AES_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_TYPE_AEAD, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct s390_aes_ctx), + .cra_priority = 900, + .cra_name = "gcm(aes)", + .cra_driver_name = "gcm-aes-s390", + .cra_module = THIS_MODULE, + }, +}; + static struct crypto_alg *aes_s390_algs_ptr[5]; static int aes_s390_algs_num; @@ -790,16 +1069,19 @@ static void aes_s390_fini(void) crypto_unregister_alg(aes_s390_algs_ptr[aes_s390_algs_num]); if (ctrblk) free_page((unsigned long) ctrblk); + + crypto_unregister_aead(&gcm_aes_aead); } static int __init aes_s390_init(void) { int ret; - /* Query available functions for KM, KMC and KMCTR */ + /* Query available functions for KM, KMC, KMCTR and KMA */ cpacf_query(CPACF_KM, &km_functions); cpacf_query(CPACF_KMC, &kmc_functions); cpacf_query(CPACF_KMCTR, &kmctr_functions); + cpacf_query(CPACF_KMA, &kma_functions); if (cpacf_test_func(&km_functions, CPACF_KM_AES_128) || cpacf_test_func(&km_functions, CPACF_KM_AES_192) || @@ -840,6 +1122,14 @@ static int __init aes_s390_init(void) goto out_err; } + if (cpacf_test_func(&kma_functions, CPACF_KMA_GCM_AES_128) || + cpacf_test_func(&kma_functions, CPACF_KMA_GCM_AES_192) || + cpacf_test_func(&kma_functions, CPACF_KMA_GCM_AES_256)) { + ret = crypto_register_aead(&gcm_aes_aead); + if (ret) + goto out_err; + } + return 0; out_err: aes_s390_fini(); From f9a5d70cfaf3e32308de0abfcc95dafe4e36ea51 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 14 Sep 2017 09:52:32 +0200 Subject: [PATCH 0211/1085] s390/ccwgroup: tie a ccwgroup driver to its ccw driver When grouping devices, the ccwgroup core only checks whether all of the devices are bound to the same ccw_driver. It has no means of checking if the requesting ccwgroup driver actually supports this device type. qeth implements its own device matching in qeth_core_probe_device(), while ctcm and lcs currently have no sanity-checking at all. 
Enable ccwgroup drivers to optionally defer the device type checking to the ccwgroup core, by specifying their supported ccw_driver. This allows us to drop the device type matching from qeth, and improves the robustness of ctcm and lcs. Signed-off-by: Julian Wiedmann Acked-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ccwgroup.h | 2 ++ drivers/s390/cio/ccwgroup.c | 6 ++++++ drivers/s390/net/ctcm_main.c | 1 + drivers/s390/net/lcs.c | 1 + drivers/s390/net/qeth_core_main.c | 1 + 5 files changed, 11 insertions(+) diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h index 057ce0ca6377..6d50e86dd0e8 100644 --- a/arch/s390/include/asm/ccwgroup.h +++ b/arch/s390/include/asm/ccwgroup.h @@ -41,6 +41,7 @@ struct ccwgroup_device { * @thaw: undo work done in @freeze * @restore: callback for restoring after hibernation * @driver: embedded driver structure + * @ccw_driver: supported ccw_driver (optional) */ struct ccwgroup_driver { int (*setup) (struct ccwgroup_device *); @@ -55,6 +56,7 @@ struct ccwgroup_driver { int (*restore)(struct ccwgroup_device *); struct device_driver driver; + struct ccw_driver *ccw_driver; }; extern int ccwgroup_driver_register (struct ccwgroup_driver *cdriver); diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index 34b9ad6b3143..e2f7b6e93efd 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -373,6 +373,12 @@ int ccwgroup_create_dev(struct device *parent, struct ccwgroup_driver *gdrv, rc = -EINVAL; goto error; } + /* Check if the devices are bound to the required ccw driver. */ + if (gdev->count && gdrv && gdrv->ccw_driver && + gdev->cdev[0]->drv != gdrv->ccw_driver) { + rc = -EINVAL; + goto error; + } dev_set_name(&gdev->dev, "%s", dev_name(&gdev->cdev[0]->dev)); gdev->dev.groups = ccwgroup_attr_groups; diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c index 26363e0816fe..be9f17218531 100644 --- a/drivers/s390/net/ctcm_main.c +++ b/drivers/s390/net/ctcm_main.c @@ -1761,6 +1761,7 @@ static struct ccwgroup_driver ctcm_group_driver = { .owner = THIS_MODULE, .name = CTC_DRIVER_NAME, }, + .ccw_driver = &ctcm_ccw_driver, .setup = ctcm_probe_device, .remove = ctcm_remove_device, .set_online = ctcm_new_device, diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index d01b5c2a7760..0bf7b7356f10 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -2396,6 +2396,7 @@ static struct ccwgroup_driver lcs_group_driver = { .owner = THIS_MODULE, .name = "lcs", }, + .ccw_driver = &lcs_ccw_driver, .setup = lcs_probe_device, .remove = lcs_remove_device, .set_online = lcs_new_device, diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index bae7440abc01..61cf3e9c0acb 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -5875,6 +5875,7 @@ static struct ccwgroup_driver qeth_core_ccwgroup_driver = { .owner = THIS_MODULE, .name = "qeth", }, + .ccw_driver = &qeth_ccw_driver, .setup = qeth_core_probe_device, .remove = qeth_core_remove_device, .set_online = qeth_core_set_online, From 7fb2b2d512448cf0e914c4647a1cf02b52263702 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 27 Sep 2017 13:42:01 +0200 Subject: [PATCH 0212/1085] s390/virtio: remove the old KVM virtio transport There is no recent user space application available anymore which still supports this old virtio transport.
Additionally, commit 3b2fbb3f06ef ("virtio/s390: deprecate old transport") introduced a deprecation message in the driver, and apparently nobody complained so far that it is still required. So let's simply remove it. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Acked-by: Halil Pasic Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 13 - drivers/s390/virtio/Makefile | 3 - drivers/s390/virtio/kvm_virtio.c | 515 ------------------------------- 3 files changed, 531 deletions(-) delete mode 100644 drivers/s390/virtio/kvm_virtio.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4edc91e6c3ff..5fd4a67c93b3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -929,17 +929,4 @@ config S390_GUEST Select this option if you want to run the kernel as a guest under the KVM hypervisor. -config S390_GUEST_OLD_TRANSPORT - def_bool y - prompt "Guest support for old s390 virtio transport (DEPRECATED)" - depends on S390_GUEST - help - Enable this option to add support for the old s390-virtio - transport (i.e. virtio devices NOT based on virtio-ccw). This - type of virtio devices is only available on the experimental - kuli userspace or with old (< 2.6) qemu. If you are running - with a modern version of qemu (which supports virtio-ccw since - 1.4 and uses it by default since version 2.4), you probably won't - need this. - endmenu diff --git a/drivers/s390/virtio/Makefile b/drivers/s390/virtio/Makefile index df40692a9011..d9942e0a123c 100644 --- a/drivers/s390/virtio/Makefile +++ b/drivers/s390/virtio/Makefile @@ -7,7 +7,4 @@ # as published by the Free Software Foundation. s390-virtio-objs := virtio_ccw.o -ifdef CONFIG_S390_GUEST_OLD_TRANSPORT -s390-virtio-objs += kvm_virtio.o -endif obj-$(CONFIG_S390_GUEST) += $(s390-virtio-objs) diff --git a/drivers/s390/virtio/kvm_virtio.c b/drivers/s390/virtio/kvm_virtio.c deleted file mode 100644 index a99d09a11f05..000000000000 --- a/drivers/s390/virtio/kvm_virtio.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * virtio for kvm on s390 - * - * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - * - * Author(s): Christian Borntraeger - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define VIRTIO_SUBCODE_64 0x0D00 - -/* - * The pointer to our (page) of device descriptions. - */ -static void *kvm_devices; -static struct work_struct hotplug_work; - -struct kvm_device { - struct virtio_device vdev; - struct kvm_device_desc *desc; -}; - -#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev) - -/* - * memory layout: - * - kvm_device_descriptor - * struct kvm_device_desc - * - configuration - * struct kvm_vqconfig - * - feature bits - * - config space - */ -static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc) -{ - return (struct kvm_vqconfig *)(desc + 1); -} - -static u8 *kvm_vq_features(const struct kvm_device_desc *desc) -{ - return (u8 *)(kvm_vq_config(desc) + desc->num_vq); -} - -static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc) -{ - return kvm_vq_features(desc) + desc->feature_len * 2; -} - -/* - * The total size of the config page used by this device (incl. 
desc) - */ -static unsigned desc_size(const struct kvm_device_desc *desc) -{ - return sizeof(*desc) - + desc->num_vq * sizeof(struct kvm_vqconfig) - + desc->feature_len * 2 - + desc->config_len; -} - -/* This gets the device's feature bits. */ -static u64 kvm_get_features(struct virtio_device *vdev) -{ - unsigned int i; - u32 features = 0; - struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; - u8 *in_features = kvm_vq_features(desc); - - for (i = 0; i < min(desc->feature_len * 8, 32); i++) - if (in_features[i / 8] & (1 << (i % 8))) - features |= (1 << i); - return features; -} - -static int kvm_finalize_features(struct virtio_device *vdev) -{ - unsigned int i, bits; - struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; - /* Second half of bitmap is features we accept. */ - u8 *out_features = kvm_vq_features(desc) + desc->feature_len; - - /* Give virtio_ring a chance to accept features. */ - vring_transport_features(vdev); - - /* Make sure we don't have any features > 32 bits! */ - BUG_ON((u32)vdev->features != vdev->features); - - memset(out_features, 0, desc->feature_len); - bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; - for (i = 0; i < bits; i++) { - if (__virtio_test_bit(vdev, i)) - out_features[i / 8] |= (1 << (i % 8)); - } - - return 0; -} - -/* - * Reading and writing elements in config space - */ -static void kvm_get(struct virtio_device *vdev, unsigned int offset, - void *buf, unsigned len) -{ - struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; - - BUG_ON(offset + len > desc->config_len); - memcpy(buf, kvm_vq_configspace(desc) + offset, len); -} - -static void kvm_set(struct virtio_device *vdev, unsigned int offset, - const void *buf, unsigned len) -{ - struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; - - BUG_ON(offset + len > desc->config_len); - memcpy(kvm_vq_configspace(desc) + offset, buf, len); -} - -/* - * The operations to get and set the status word just access - * the status field of the device descriptor. set_status will also - * make a hypercall to the host, to tell about status changes - */ -static u8 kvm_get_status(struct virtio_device *vdev) -{ - return to_kvmdev(vdev)->desc->status; -} - -static void kvm_set_status(struct virtio_device *vdev, u8 status) -{ - BUG_ON(!status); - to_kvmdev(vdev)->desc->status = status; - kvm_hypercall1(KVM_S390_VIRTIO_SET_STATUS, - (unsigned long) to_kvmdev(vdev)->desc); -} - -/* - * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the - * descriptor address. The Host will zero the status and all the - * features. - */ -static void kvm_reset(struct virtio_device *vdev) -{ - kvm_hypercall1(KVM_S390_VIRTIO_RESET, - (unsigned long) to_kvmdev(vdev)->desc); -} - -/* - * When the virtio_ring code wants to notify the Host, it calls us here and we - * make a hypercall. We hand the address of the virtqueue so the Host - * knows which virtqueue we're talking about. - */ -static bool kvm_notify(struct virtqueue *vq) -{ - long rc; - struct kvm_vqconfig *config = vq->priv; - - rc = kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, config->address); - if (rc < 0) - return false; - return true; -} - -/* - * This routine finds the first virtqueue described in the configuration of - * this device and sets it up. 
- */ -static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, - unsigned index, - void (*callback)(struct virtqueue *vq), - const char *name, bool ctx) -{ - struct kvm_device *kdev = to_kvmdev(vdev); - struct kvm_vqconfig *config; - struct virtqueue *vq; - int err; - - if (index >= kdev->desc->num_vq) - return ERR_PTR(-ENOENT); - - if (!name) - return NULL; - - config = kvm_vq_config(kdev->desc)+index; - - err = vmem_add_mapping(config->address, - vring_size(config->num, - KVM_S390_VIRTIO_RING_ALIGN)); - if (err) - goto out; - - vq = vring_new_virtqueue(index, config->num, KVM_S390_VIRTIO_RING_ALIGN, - vdev, true, ctx, (void *) config->address, - kvm_notify, callback, name); - if (!vq) { - err = -ENOMEM; - goto unmap; - } - - /* - * register a callback token - * The host will sent this via the external interrupt parameter - */ - config->token = (u64) vq; - - vq->priv = config; - return vq; -unmap: - vmem_remove_mapping(config->address, - vring_size(config->num, - KVM_S390_VIRTIO_RING_ALIGN)); -out: - return ERR_PTR(err); -} - -static void kvm_del_vq(struct virtqueue *vq) -{ - struct kvm_vqconfig *config = vq->priv; - - vring_del_virtqueue(vq); - vmem_remove_mapping(config->address, - vring_size(config->num, - KVM_S390_VIRTIO_RING_ALIGN)); -} - -static void kvm_del_vqs(struct virtio_device *vdev) -{ - struct virtqueue *vq, *n; - - list_for_each_entry_safe(vq, n, &vdev->vqs, list) - kvm_del_vq(vq); -} - -static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char * const names[], - const bool *ctx, - struct irq_affinity *desc) -{ - struct kvm_device *kdev = to_kvmdev(vdev); - int i; - - /* We must have this many virtqueues. */ - if (nvqs > kdev->desc->num_vq) - return -ENOENT; - - for (i = 0; i < nvqs; ++i) { - vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i], - ctx ? ctx[i] : false); - if (IS_ERR(vqs[i])) - goto error; - } - return 0; - -error: - kvm_del_vqs(vdev); - return PTR_ERR(vqs[i]); -} - -static const char *kvm_bus_name(struct virtio_device *vdev) -{ - return ""; -} - -/* - * The config ops structure as defined by virtio config - */ -static const struct virtio_config_ops kvm_vq_configspace_ops = { - .get_features = kvm_get_features, - .finalize_features = kvm_finalize_features, - .get = kvm_get, - .set = kvm_set, - .get_status = kvm_get_status, - .set_status = kvm_set_status, - .reset = kvm_reset, - .find_vqs = kvm_find_vqs, - .del_vqs = kvm_del_vqs, - .bus_name = kvm_bus_name, -}; - -/* - * The root device for the kvm virtio devices. - * This makes them appear as /sys/devices/kvm_s390/0,1,2 not /sys/devices/0,1,2. - */ -static struct device *kvm_root; - -/* - * adds a new device and register it with virtio - * appropriate drivers are loaded by the device model - */ -static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset) -{ - struct kvm_device *kdev; - - kdev = kzalloc(sizeof(*kdev), GFP_KERNEL); - if (!kdev) { - printk(KERN_EMERG "Cannot allocate kvm dev %u type %u\n", - offset, d->type); - return; - } - - kdev->vdev.dev.parent = kvm_root; - kdev->vdev.id.device = d->type; - kdev->vdev.config = &kvm_vq_configspace_ops; - kdev->desc = d; - - if (register_virtio_device(&kdev->vdev) != 0) { - printk(KERN_ERR "Failed to register kvm device %u type %u\n", - offset, d->type); - kfree(kdev); - } -} - -/* - * scan_devices() simply iterates through the device page. - * The type 0 is reserved to mean "end of devices". 
- */ -static void scan_devices(void) -{ - unsigned int i; - struct kvm_device_desc *d; - - for (i = 0; i < PAGE_SIZE; i += desc_size(d)) { - d = kvm_devices + i; - - if (d->type == 0) - break; - - add_kvm_device(d, i); - } -} - -/* - * match for a kvm device with a specific desc pointer - */ -static int match_desc(struct device *dev, void *data) -{ - struct virtio_device *vdev = dev_to_virtio(dev); - struct kvm_device *kdev = to_kvmdev(vdev); - - return kdev->desc == data; -} - -/* - * hotplug_device tries to find changes in the device page. - */ -static void hotplug_devices(struct work_struct *dummy) -{ - unsigned int i; - struct kvm_device_desc *d; - struct device *dev; - - for (i = 0; i < PAGE_SIZE; i += desc_size(d)) { - d = kvm_devices + i; - - /* end of list */ - if (d->type == 0) - break; - - /* device already exists */ - dev = device_find_child(kvm_root, d, match_desc); - if (dev) { - /* XXX check for hotplug remove */ - put_device(dev); - continue; - } - - /* new device */ - printk(KERN_INFO "Adding new virtio device %p\n", d); - add_kvm_device(d, i); - } -} - -/* - * we emulate the request_irq behaviour on top of s390 extints - */ -static void kvm_extint_handler(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct virtqueue *vq; - u32 param; - - if ((ext_code.subcode & 0xff00) != VIRTIO_SUBCODE_64) - return; - inc_irq_stat(IRQEXT_VRT); - - /* The LSB might be overloaded, we have to mask it */ - vq = (struct virtqueue *)(param64 & ~1UL); - - /* We use ext_params to decide what this interrupt means */ - param = param32 & VIRTIO_PARAM_MASK; - - switch (param) { - case VIRTIO_PARAM_CONFIG_CHANGED: - virtio_config_changed(vq->vdev); - break; - case VIRTIO_PARAM_DEV_ADD: - schedule_work(&hotplug_work); - break; - case VIRTIO_PARAM_VRING_INTERRUPT: - default: - vring_interrupt(0, vq); - break; - } -} - -/* - * For s390-virtio, we expect a page above main storage containing - * the virtio configuration. Try to actually load from this area - * in order to figure out if the host provides this page. - */ -static int __init test_devices_support(unsigned long addr) -{ - int ret = -EIO; - - asm volatile( - "0: lura 0,%1\n" - "1: xgr %0,%0\n" - "2:\n" - EX_TABLE(0b,2b) - EX_TABLE(1b,2b) - : "+d" (ret) - : "a" (addr) - : "0", "cc"); - return ret; -} -/* - * Init function for virtio - * devices are in a single page above top of "normal" + standby mem - */ -static int __init kvm_devices_init(void) -{ - int rc; - unsigned long total_memory_size = sclp.rzm * sclp.rnmax; - - if (!MACHINE_IS_KVM) - return -ENODEV; - - if (test_devices_support(total_memory_size) < 0) - return -ENODEV; - - pr_warn("The s390-virtio transport is deprecated. 
Please switch to a modern host providing virtio-ccw.\n"); - - rc = vmem_add_mapping(total_memory_size, PAGE_SIZE); - if (rc) - return rc; - - kvm_devices = (void *) total_memory_size; - - kvm_root = root_device_register("kvm_s390"); - if (IS_ERR(kvm_root)) { - rc = PTR_ERR(kvm_root); - printk(KERN_ERR "Could not register kvm_s390 root device"); - vmem_remove_mapping(total_memory_size, PAGE_SIZE); - return rc; - } - - INIT_WORK(&hotplug_work, hotplug_devices); - - irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - register_external_irq(EXT_IRQ_CP_SERVICE, kvm_extint_handler); - - scan_devices(); - return 0; -} - -/* code for early console output with virtio_console */ -static int early_put_chars(u32 vtermno, const char *buf, int count) -{ - char scratch[17]; - unsigned int len = count; - - if (len > sizeof(scratch) - 1) - len = sizeof(scratch) - 1; - scratch[len] = '\0'; - memcpy(scratch, buf, len); - kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, __pa(scratch)); - return len; -} - -static int __init s390_virtio_console_init(void) -{ - if (sclp.has_vt220 || sclp.has_linemode) - return -ENODEV; - return virtio_cons_early_init(early_put_chars); -} -console_initcall(s390_virtio_console_init); - - -/* - * We do this after core stuff, but before the drivers. - */ -postcore_initcall(kvm_devices_init); From d67ce18eb6894dff1f3dfa60cd8f1c75088b878e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 28 Sep 2017 08:05:13 +0200 Subject: [PATCH 0213/1085] s390/virtio: simplify Makefile Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/virtio/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/virtio/Makefile b/drivers/s390/virtio/Makefile index d9942e0a123c..f68af1f317f1 100644 --- a/drivers/s390/virtio/Makefile +++ b/drivers/s390/virtio/Makefile @@ -6,5 +6,4 @@ # it under the terms of the GNU General Public License (version 2 only) # as published by the Free Software Foundation. -s390-virtio-objs := virtio_ccw.o -obj-$(CONFIG_S390_GUEST) += $(s390-virtio-objs) +obj-$(CONFIG_S390_GUEST) += virtio_ccw.o From 5c50538752af7968f53924b22dede8ed4ce4cb3b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Sep 2017 09:16:48 +0200 Subject: [PATCH 0214/1085] s390/disassembler: add missing end marker for e7 table The e7 opcode table does not have an end marker. Hence when trying to find an unknown e7 instruction the code will access memory behind the table until it finds something that matches the opcode, or the kernel crashes, whatever comes first. This affects not only the in-kernel disassembler but also uprobes and kprobes which refuse to set a probe on unknown instructions, and therefore search the opcode tables to figure out if instructions are known or not. 
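To illustrate the failure mode, a small stand-alone sketch of the sentinel-terminated lookup pattern (simplified types; the names echo the disassembler tables but this is not the kernel code):

	#include <stdio.h>

	#define INSTR_INVALID 0	/* stand-in for the kernel's format enum */

	struct s390_insn {
		const char *name;
		unsigned char opfrag;
		int format;
	};

	/* The search stops only at the { "", 0, INSTR_INVALID } end marker;
	 * a table without it keeps the loop reading whatever memory follows
	 * the last real entry. */
	static const struct s390_insn *find_in_table(const struct s390_insn *table,
						     unsigned char opfrag)
	{
		const struct s390_insn *insn;

		for (insn = table; insn->format != INSTR_INVALID; insn++)
			if (insn->opfrag == opfrag)
				return insn;
		return NULL;	/* unknown instruction */
	}

	int main(void)
	{
		static const struct s390_insn table[] = {
			{ "vftci", 0x4a, 1 },		/* dummy format id */
			{ "", 0, INSTR_INVALID },	/* the marker this patch adds */
		};
		const struct s390_insn *hit = find_in_table(table, 0x4a);

		printf("%s\n", hit ? hit->name : "unknown");
		return 0;
	}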
Cc: # v3.18+ Fixes: 3585cb0280654 ("s390/disassembler: add vector instructions") Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dis.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index f7e82302a71e..d9970c15f79d 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1548,6 +1548,7 @@ static struct s390_insn opcode_e7[] = { { "vfsq", 0xce, INSTR_VRR_VV000MM }, { "vfs", 0xe2, INSTR_VRR_VVV00MM }, { "vftci", 0x4a, INSTR_VRI_VVIMM }, + { "", 0, INSTR_INVALID } }; static struct s390_insn opcode_eb[] = { From caefea1d3aa14589fd56c3fbc42876056d653440 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Sep 2017 13:36:01 +0200 Subject: [PATCH 0215/1085] s390/disassembler: fix LRDFU format Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dis.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index d9970c15f79d..6d2974ab4568 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -295,7 +295,7 @@ static const unsigned char formats[][7] = { [INSTR_RSE_RRRD] = { 0xff, R_8,R_12,D_20,B_16,0,0 }, [INSTR_RSE_RURD] = { 0xff, R_8,U4_12,D_20,B_16,0,0 }, [INSTR_RSI_RRP] = { 0xff, R_8,R_12,J16_16,0,0,0 }, - [INSTR_RSL_LRDFU] = { 0xff, F_32,D_20,L4_8,B_16,U4_36,0 }, + [INSTR_RSL_LRDFU] = { 0xff, F_32,D_20,L8_8,B_16,U4_36,0 }, [INSTR_RSL_R0RD] = { 0xff, D_20,L4_8,B_16,0,0,0 }, [INSTR_RSY_AARD] = { 0xff, A_8,A_12,D20_20,B_16,0,0 }, [INSTR_RSY_CCRD] = { 0xff, C_8,C_12,D20_20,B_16,0,0 }, From 7e1263b72024e585466c40adf6ac6e870deb91fe Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Sep 2017 09:16:53 +0200 Subject: [PATCH 0216/1085] s390/disassembler: remove double instructions Remove a couple of instructions that are listed twice. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dis.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 6d2974ab4568..0b26b9b61f22 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1054,12 +1054,6 @@ static struct s390_insn opcode_b3[] = { { "fidr", 0x7f, INSTR_RRE_FF }, { "sfpc", 0x84, INSTR_RRE_RR_OPT }, { "efpc", 0x8c, INSTR_RRE_RR_OPT }, - { "cefbr", 0x94, INSTR_RRE_RF }, - { "cdfbr", 0x95, INSTR_RRE_RF }, - { "cxfbr", 0x96, INSTR_RRE_RF }, - { "cfebr", 0x98, INSTR_RRF_U0RF }, - { "cfdbr", 0x99, INSTR_RRF_U0RF }, - { "cfxbr", 0x9a, INSTR_RRF_U0RF }, { "cefr", 0xb4, INSTR_RRE_FR }, { "cdfr", 0xb5, INSTR_RRE_FR }, { "cxfr", 0xb6, INSTR_RRE_FR }, From 630f789e80fc299a38f44d6b2e65c02ed219b47a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Sep 2017 17:06:32 +0200 Subject: [PATCH 0217/1085] s390/disassembler: add sthyi instruction This instruction came with a z/VM extension and not with a specific machine generation. 
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dis.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 0b26b9b61f22..e08c2137a11b 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -865,6 +865,7 @@ static struct s390_insn opcode_b2[] = { { "msr", 0x52, INSTR_RRE_RR }, { "mvpg", 0x54, INSTR_RRE_RR }, { "mvst", 0x55, INSTR_RRE_RR }, + { "sthyi", 0x56, INSTR_RRE_RR }, { "cuse", 0x57, INSTR_RRE_RR }, { "bsg", 0x58, INSTR_RRE_RR }, { "bsa", 0x5a, INSTR_RRE_RR }, From ea7c360b10081a3d0d42a8eab5249f7c2a258b4e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Sep 2017 16:15:06 +0200 Subject: [PATCH 0218/1085] s390/disassembler: add missing z13 instructions Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dis.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index e08c2137a11b..c460e6049fb5 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -113,7 +113,7 @@ enum { INSTR_MII_UPI, INSTR_RIE_R0IU, INSTR_RIE_R0UU, INSTR_RIE_RRP, INSTR_RIE_RRPU, INSTR_RIE_RRUUU, INSTR_RIE_RUPI, INSTR_RIE_RUPU, INSTR_RIE_RRI0, - INSTR_RIL_RI, INSTR_RIL_RP, INSTR_RIL_RU, INSTR_RIL_UP, + INSTR_RIE_RUI0, INSTR_RIL_RI, INSTR_RIL_RP, INSTR_RIL_RU, INSTR_RIL_UP, INSTR_RIS_R0RDU, INSTR_RIS_R0UU, INSTR_RIS_RURDI, INSTR_RIS_RURDU, INSTR_RI_RI, INSTR_RI_RP, INSTR_RI_RU, INSTR_RI_UP, INSTR_RRE_00, INSTR_RRE_0R, INSTR_RRE_AA, INSTR_RRE_AR, INSTR_RRE_F0, @@ -130,7 +130,7 @@ enum { INSTR_RSI_RRP, INSTR_RSL_LRDFU, INSTR_RSL_R0RD, INSTR_RSY_AARD, INSTR_RSY_CCRD, INSTR_RSY_RRRD, INSTR_RSY_RURD, - INSTR_RSY_RDRM, INSTR_RSY_RMRD, + INSTR_RSY_RURD2, INSTR_RSY_RDRM, INSTR_RSY_RMRD, INSTR_RS_AARD, INSTR_RS_CCRD, INSTR_RS_R0RD, INSTR_RS_RRRD, INSTR_RS_RURD, INSTR_RXE_FRRD, INSTR_RXE_RRRD, INSTR_RXE_RRRDM, @@ -241,6 +241,7 @@ static const unsigned char formats[][7] = { [INSTR_RIE_RRPU] = { 0xff, R_8,R_12,U4_32,J16_16,0,0 }, [INSTR_RIE_RRP] = { 0xff, R_8,R_12,J16_16,0,0,0 }, [INSTR_RIE_RRUUU] = { 0xff, R_8,R_12,U8_16,U8_24,U8_32,0 }, + [INSTR_RIE_RUI0] = { 0xff, R_8,I16_16,U4_12,0,0,0 }, [INSTR_RIE_RUPI] = { 0xff, R_8,I8_32,U4_12,J16_16,0,0 }, [INSTR_RIE_RUPU] = { 0xff, R_8,U8_32,U4_12,J16_16,0,0 }, [INSTR_RIL_RI] = { 0x0f, R_8,I32_16,0,0,0,0 }, @@ -303,6 +304,7 @@ static const unsigned char formats[][7] = { [INSTR_RSY_RMRD] = { 0xff, R_8,U4_12,D20_20,B_16,0,0 }, [INSTR_RSY_RRRD] = { 0xff, R_8,R_12,D20_20,B_16,0,0 }, [INSTR_RSY_RURD] = { 0xff, R_8,U4_12,D20_20,B_16,0,0 }, + [INSTR_RSY_RURD2] = { 0xff, R_8,D20_20,B_16,U4_12,0,0 }, [INSTR_RS_AARD] = { 0xff, A_8,A_12,D_20,B_16,0,0 }, [INSTR_RS_CCRD] = { 0xff, C_8,C_12,D_20,B_16,0,0 }, [INSTR_RS_R0RD] = { 0xff, R_8,D_20,B_16,0,0,0 }, @@ -425,6 +427,10 @@ enum { LONG_INSN_LLGFRL, LONG_INSN_LLGHRL, LONG_INSN_LLGTAT, + LONG_INSN_LLZRGF, + LONG_INSN_LOCFHR, + LONG_INSN_LOCGHI, + LONG_INSN_LOCHHI, LONG_INSN_POPCNT, LONG_INSN_RIEMIT, LONG_INSN_RINEXT, @@ -433,6 +439,7 @@ enum { LONG_INSN_RISBLG, LONG_INSN_SLHHHR, LONG_INSN_SLHHLR, + LONG_INSN_STOCFH, LONG_INSN_TABORT, LONG_INSN_TBEGIN, LONG_INSN_TBEGINC, @@ -505,6 +512,7 @@ static char *long_insn_name[] = { [LONG_INSN_LLGFRL] = "llgfrl", [LONG_INSN_LLGHRL] = "llghrl", [LONG_INSN_LLGTAT] = "llgtat", + [LONG_INSN_LLZRGF] = "llzrgf", [LONG_INSN_POPCNT] = "popcnt", [LONG_INSN_RIEMIT] = "riemit", [LONG_INSN_RINEXT] = "rinext", @@ -526,6 +534,10 @@ static char *long_insn_name[] = { 
[LONG_INSN_VESRLV] = "vesrlv", [LONG_INSN_VSBCBI] = "vsbcbi", [LONG_INSN_STCCTM] = "stcctm", + [LONG_INSN_LOCFHR] = "locfhr", + [LONG_INSN_LOCGHI] = "locghi", + [LONG_INSN_LOCHHI] = "lochhi", + [LONG_INSN_STOCFH] = "stocfh", }; static struct s390_insn opcode[] = { @@ -1101,6 +1113,7 @@ static struct s390_insn opcode_b9[] = { { "lhr", 0x27, INSTR_RRE_RR }, { "cgfr", 0x30, INSTR_RRE_RR }, { "clgfr", 0x31, INSTR_RRE_RR }, + { "ppno", 0x3c, INSTR_RRE_RR }, { "cfdtr", 0x41, INSTR_RRF_UURF }, { { 0, LONG_INSN_CLGDTR }, 0x42, INSTR_RRF_UURF }, { { 0, LONG_INSN_CLFDTR }, 0x43, INSTR_RRF_UURF }, @@ -1160,6 +1173,7 @@ static struct s390_insn opcode_b9[] = { { { 0, LONG_INSN_SLHHLR }, 0xdb, INSTR_RRF_R0RR2 }, { "chlr", 0xdd, INSTR_RRE_RR }, { "clhlr", 0xdf, INSTR_RRE_RR }, + { { 0, LONG_INSN_LOCFHR }, 0xe0, INSTR_RRF_U0RR }, { { 0, LONG_INSN_POPCNT }, 0xe1, INSTR_RRE_RR }, { "locgr", 0xe2, INSTR_RRF_M0RR }, { "ngrk", 0xe4, INSTR_RRF_R0RR2 }, @@ -1309,6 +1323,7 @@ static struct s390_insn opcode_e3[] = { { "stg", 0x24, INSTR_RXY_RRRD }, { "ntstg", 0x25, INSTR_RXY_RRRD }, { "cvdy", 0x26, INSTR_RXY_RRRD }, + { "lzrg", 0x2a, INSTR_RXY_RRRD }, { "cvdg", 0x2e, INSTR_RXY_RRRD }, { "strvg", 0x2f, INSTR_RXY_RRRD }, { "cgf", 0x30, INSTR_RXY_RRRD }, @@ -1316,6 +1331,8 @@ static struct s390_insn opcode_e3[] = { { "ltgf", 0x32, INSTR_RXY_RRRD }, { "cgh", 0x34, INSTR_RXY_RRRD }, { "pfd", 0x36, INSTR_RXY_URRD }, + { { 0, LONG_INSN_LLZRGF }, 0x3a, INSTR_RXY_RRRD }, + { "lzrf", 0x3b, INSTR_RXY_RRRD }, { "strvh", 0x3f, INSTR_RXY_RRRD }, { "bctg", 0x46, INSTR_RXY_RRRD }, { "sty", 0x50, INSTR_RXY_RRRD }, @@ -1595,6 +1612,8 @@ static struct s390_insn opcode_eb[] = { { "slak", 0xdd, INSTR_RSY_RRRD }, { "srlk", 0xde, INSTR_RSY_RRRD }, { "sllk", 0xdf, INSTR_RSY_RRRD }, + { "locfh", 0xe0, INSTR_RSY_RURD2 }, + { { 0, LONG_INSN_STOCFH }, 0xe1, INSTR_RSY_RURD2 }, { "locg", 0xe2, INSTR_RSY_RDRM }, { "stocg", 0xe3, INSTR_RSY_RDRM }, { "lang", 0xe4, INSTR_RSY_RRRD }, @@ -1620,8 +1639,11 @@ static struct s390_insn opcode_eb[] = { }; static struct s390_insn opcode_ec[] = { + { "lochi", 0x42, INSTR_RIE_RUI0 }, { "brxhg", 0x44, INSTR_RIE_RRP }, { "brxlg", 0x45, INSTR_RIE_RRP }, + { { 0, LONG_INSN_LOCGHI }, 0x46, INSTR_RIE_RUI0 }, + { { 0, LONG_INSN_LOCHHI }, 0x4e, INSTR_RIE_RUI0 }, { { 0, LONG_INSN_RISBLG }, 0x51, INSTR_RIE_RRUUU }, { "rnsbg", 0x54, INSTR_RIE_RRUUU }, { "risbg", 0x55, INSTR_RIE_RRUUU }, @@ -1717,6 +1739,10 @@ static struct s390_insn opcode_ed[] = { { "mee", 0x37, INSTR_RXE_FRRD }, { "mad", 0x3e, INSTR_RXF_FRRDF }, { "msd", 0x3f, INSTR_RXF_FRRDF }, + { "cdpt", 0xae, INSTR_RSL_LRDFU }, + { "cxpt", 0xaf, INSTR_RSL_LRDFU }, + { "cpdt", 0xac, INSTR_RSL_LRDFU }, + { "cpxt", 0xad, INSTR_RSL_LRDFU }, { "", 0, INSTR_INVALID } }; From e0d281d067f6514f8afdae919e5990958136f197 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 27 Sep 2017 09:45:00 +0200 Subject: [PATCH 0219/1085] s390/disassembler: add new z14 instructions Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dis.c | 83 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index c460e6049fb5..94e96a7bd6d8 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -78,6 +78,7 @@ enum { U8_8, /* 8 bit unsigned value starting at 8 */ U8_16, /* 8 bit unsigned value starting at 16 */ U8_24, /* 8 bit unsigned value starting at 24 */ + U8_28, /* 8 bit unsigned value starting at 28 */ U8_32, /* 8 bit unsigned value starting at 32 
*/ I8_8, /* 8 bit signed value starting at 8 */ I8_16, /* 8 bit signed value starting at 16 */ @@ -157,6 +158,10 @@ enum { INSTR_VRS_RVRDM, INSTR_VRV_VVRDM, INSTR_VRV_VWRDM, INSTR_VRX_VRRDM, INSTR_VRX_VRRD0, + INSTR_VSI_URDV, INSTR_VRS_RRDV, INSTR_VRI_V0UU2, INSTR_VRR_RV0U, + INSTR_VRI_VR0UU, INSTR_VRI_VVUUU2, INSTR_VRR_0V, INSTR_VRI_VVV0UU2, + INSTR_VRR_0VV0U, INSTR_VRR_VVV, INSTR_VRR_VVVU0UV, + INSTR_VRR_VVVUU0V, INSTR_VRR_VVV0UUU, }; static const struct s390_operand operands[] = @@ -207,6 +212,7 @@ static const struct s390_operand operands[] = [U8_8] = { 8, 8, 0 }, [U8_16] = { 8, 16, 0 }, [U8_24] = { 8, 24, 0 }, + [U8_28] = { 8, 28, 0 }, [U8_32] = { 8, 32, 0 }, [J12_12] = { 12, 12, OPERAND_PCREL }, [I8_8] = { 8, 8, OPERAND_SIGNED }, @@ -368,6 +374,19 @@ static const unsigned char formats[][7] = { [INSTR_VRV_VWRDM] = { 0xff, V_8,D_20,W_12,B_16,M_32,0 }, [INSTR_VRX_VRRDM] = { 0xff, V_8,D_20,X_12,B_16,M_32,0 }, [INSTR_VRX_VRRD0] = { 0xff, V_8,D_20,X_12,B_16,0,0 }, + [INSTR_VRI_V0UU2] = {0xff, V_8,U16_16,U4_32,0,0,0 }, + [INSTR_VRI_VR0UU] = {0xff, V_8,V_12,U8_28,U8_16,U4_24,0 }, + [INSTR_VRI_VVUUU2]= {0xff, V_8,V_12,U8_28,U8_16,U4_24,0 }, + [INSTR_VRI_VVV0UU2]= {0xff, V_8,V_12,V_16,U8_28,U4_24,0 }, + [INSTR_VRR_0VV0U] = {0xff, V_12,V_16,U4_24,0,0,0 }, + [INSTR_VRR_0V] = {0xff, V_12,0,0,0,0,0 }, + [INSTR_VRR_RV0U] = {0xff, R_8,V_12,U4_24,0,0,0 }, + [INSTR_VRR_VVV0UUU]= {0xff, V_8,V_12,V_16,U4_32,U4_28,U4_24 }, + [INSTR_VRR_VVVU0UV]= {0xff, V_8,V_12,V_16,V_32,U4_28,U4_20 }, + [INSTR_VRR_VVVUU0V]= {0xff, V_8,V_12,V_16,V_32,U4_20,U4_24 }, + [INSTR_VRR_VVV] = {0xff, V_8,V_12,V_16,0,0,0 }, + [INSTR_VRS_RRDV] = {0xff, V_32,R_12,D_20,B_16,0,0 }, + [INSTR_VSI_URDV] = {0xff, V_32,D_20,B_16,U8_8,0,0 }, }; enum { @@ -452,7 +471,11 @@ enum { LONG_INSN_VESRAV, LONG_INSN_VESRLV, LONG_INSN_VSBCBI, - LONG_INSN_STCCTM + LONG_INSN_STCCTM, + LONG_INSN_MSGRKC, + LONG_INSN_LLGFSG, + LONG_INSN_VSTRLR, + LONG_INSN_VBPERM, }; static char *long_insn_name[] = { @@ -538,6 +561,10 @@ static char *long_insn_name[] = { [LONG_INSN_LOCGHI] = "locghi", [LONG_INSN_LOCHHI] = "lochhi", [LONG_INSN_STOCFH] = "stocfh", + [LONG_INSN_MSGRKC] = "msgrkc", + [LONG_INSN_LLGFSG] = "llgfsg", + [LONG_INSN_VSTRLR] = "vstrlr", + [LONG_INSN_VBPERM] = "vbperm", }; static struct s390_insn opcode[] = { @@ -1111,6 +1138,7 @@ static struct s390_insn opcode_b9[] = { { "sturg", 0x25, INSTR_RRE_RR }, { "lbr", 0x26, INSTR_RRE_RR }, { "lhr", 0x27, INSTR_RRE_RR }, + { "kma", 0x29, INSTR_RRF_R0RR }, { "cgfr", 0x30, INSTR_RRE_RR }, { "clgfr", 0x31, INSTR_RRE_RR }, { "ppno", 0x3c, INSTR_RRE_RR }, @@ -1183,6 +1211,8 @@ static struct s390_insn opcode_b9[] = { { "sgrk", 0xe9, INSTR_RRF_R0RR2 }, { "algrk", 0xea, INSTR_RRF_R0RR2 }, { "slgrk", 0xeb, INSTR_RRF_R0RR2 }, + { "mgrk", 0xec, INSTR_RRF_R0RR2 }, + { { 0, LONG_INSN_MSGRKC }, 0xed, INSTR_RRF_R0RR2 }, { "locr", 0xf2, INSTR_RRF_M0RR }, { "nrk", 0xf4, INSTR_RRF_R0RR2 }, { "ork", 0xf6, INSTR_RRF_R0RR2 }, @@ -1191,6 +1221,7 @@ static struct s390_insn opcode_b9[] = { { "srk", 0xf9, INSTR_RRF_R0RR2 }, { "alrk", 0xfa, INSTR_RRF_R0RR2 }, { "slrk", 0xfb, INSTR_RRF_R0RR2 }, + { "msrkc", 0xfd, INSTR_RRF_R0RR2 }, { "kmac", 0x1e, INSTR_RRE_RR }, { "lrvr", 0x1f, INSTR_RRE_RR }, { "km", 0x2e, INSTR_RRE_RR }, @@ -1331,12 +1362,21 @@ static struct s390_insn opcode_e3[] = { { "ltgf", 0x32, INSTR_RXY_RRRD }, { "cgh", 0x34, INSTR_RXY_RRRD }, { "pfd", 0x36, INSTR_RXY_URRD }, + { "agh", 0x38, INSTR_RXY_RRRD }, + { "sgh", 0x39, INSTR_RXY_RRRD }, { { 0, LONG_INSN_LLZRGF }, 0x3a, INSTR_RXY_RRRD }, { "lzrf", 0x3b, INSTR_RXY_RRRD 
}, + { "mgh", 0x3c, INSTR_RXY_RRRD }, { "strvh", 0x3f, INSTR_RXY_RRRD }, { "bctg", 0x46, INSTR_RXY_RRRD }, + { "bic", 0x47, INSTR_RXY_URRD }, + { { 0, LONG_INSN_LLGFSG }, 0x48, INSTR_RXY_RRRD }, + { "stgsc", 0x49, INSTR_RXY_RRRD }, + { "lgg", 0x4c, INSTR_RXY_RRRD }, + { "lgsc", 0x4d, INSTR_RXY_RRRD }, { "sty", 0x50, INSTR_RXY_RRRD }, { "msy", 0x51, INSTR_RXY_RRRD }, + { "msc", 0x53, INSTR_RXY_RRRD }, { "ny", 0x54, INSTR_RXY_RRRD }, { "cly", 0x55, INSTR_RXY_RRRD }, { "oy", 0x56, INSTR_RXY_RRRD }, @@ -1363,6 +1403,8 @@ static struct s390_insn opcode_e3[] = { { "ng", 0x80, INSTR_RXY_RRRD }, { "og", 0x81, INSTR_RXY_RRRD }, { "xg", 0x82, INSTR_RXY_RRRD }, + { "msgc", 0x83, INSTR_RXY_RRRD }, + { "mg", 0x84, INSTR_RXY_RRRD }, { "lgat", 0x85, INSTR_RXY_RRRD }, { "mlg", 0x86, INSTR_RXY_RRRD }, { "dlg", 0x87, INSTR_RXY_RRRD }, @@ -1420,6 +1462,32 @@ static struct s390_insn opcode_e5[] = { { "", 0, INSTR_INVALID } }; +static struct s390_insn opcode_e6[] = { + { "vpkz", 0x34, INSTR_VSI_URDV }, + { "vlrl", 0x35, INSTR_VSI_URDV }, + { "vlrlr", 0x37, INSTR_VRS_RRDV }, + { "vupkz", 0x3c, INSTR_VSI_URDV }, + { "vstrl", 0x3d, INSTR_VSI_URDV }, + { {0 , LONG_INSN_VSTRLR }, 0x3f, INSTR_VRS_RRDV }, + { "vlip", 0x49, INSTR_VRI_V0UU2 }, + { "vcvb", 0x50, INSTR_VRR_RV0U }, + { "vcvbg", 0x52, INSTR_VRR_RV0U }, + { "vcvd", 0x58, INSTR_VRI_VR0UU }, + { "vsrp", 0x59, INSTR_VRI_VVUUU2 }, + { "vcvdg", 0x5a, INSTR_VRI_VR0UU }, + { "vpsop", 0x5b, INSTR_VRI_VVUUU2 }, + { "vtp", 0x5f, INSTR_VRR_0V }, + { "vap", 0x71, INSTR_VRI_VVV0UU2 }, + { "vsp", 0x73, INSTR_VRI_VVV0UU2 }, + { "vcp", 0x77, INSTR_VRR_0VV0U }, + { "vmp", 0x78, INSTR_VRI_VVV0UU2 }, + { "vmsp", 0x79, INSTR_VRI_VVV0UU2 }, + { "vdp", 0x7a, INSTR_VRI_VVV0UU2 }, + { "vrp", 0x7b, INSTR_VRI_VVV0UU2 }, + { "vsdp", 0x7e, INSTR_VRI_VVV0UU2 }, + { "", 0, INSTR_INVALID } +}; + static struct s390_insn opcode_e7[] = { { "lcbb", 0x27, INSTR_RXE_RRRDM }, { "vgef", 0x13, INSTR_VRV_VVRDM }, @@ -1560,6 +1628,15 @@ static struct s390_insn opcode_e7[] = { { "vfsq", 0xce, INSTR_VRR_VV000MM }, { "vfs", 0xe2, INSTR_VRR_VVV00MM }, { "vftci", 0x4a, INSTR_VRI_VVIMM }, + { "vnx", 0x6c, INSTR_VRR_VVV }, + { "vnn", 0x6e, INSTR_VRR_VVV }, + { "voc", 0x6f, INSTR_VRR_VVV }, + { { 0, LONG_INSN_VBPERM }, 0x85, INSTR_VRR_VVV }, + { "vfnms", 0x9e, INSTR_VRR_VVVU0UV }, + { "vfnma", 0x9f, INSTR_VRR_VVVU0UV }, + { "vmsl", 0xb8, INSTR_VRR_VVVUU0V }, + { "vfmin", 0xee, INSTR_VRR_VVV0UUU }, + { "vfmax", 0xef, INSTR_VRR_VVV0UUU }, { "", 0, INSTR_INVALID } }; @@ -1850,6 +1927,10 @@ struct s390_insn *find_insn(unsigned char *code) case 0xe5: table = opcode_e5; break; + case 0xe6: + table = opcode_e6; + opfrag = code[5]; + break; case 0xe7: table = opcode_e7; opfrag = code[5]; From 7c80cfc99b7bfdc92cee26f8008859f326f4a37f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 16:03:17 +0200 Subject: [PATCH 0220/1085] sched/fair: Clean up calc_cfs_shares() For consistencies sake, we should have only a single reading of tg->shares. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..048fbfb75523 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2694,9 +2694,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +static long calc_cfs_shares(struct cfs_rq *cfs_rq) { - long tg_weight, load, shares; + long tg_weight, tg_shares, load, shares; + struct task_group *tg = cfs_rq->tg; + + tg_shares = READ_ONCE(tg->shares); /* * This really should be: cfs_rq->avg.load_avg, but instead we use @@ -2711,7 +2714,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) tg_weight -= cfs_rq->tg_load_avg_contrib; tg_weight += load; - shares = (tg->shares * load); + shares = (tg_shares * load); if (tg_weight) shares /= tg_weight; @@ -2727,17 +2730,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) * case no task is runnable on a CPU MIN_SHARES=2 should be returned * instead of 0. */ - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; - - return shares; -} -# else /* CONFIG_SMP */ -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - return tg->shares; + return clamp_t(long, shares, MIN_SHARES, tg_shares); } # endif /* CONFIG_SMP */ @@ -2762,7 +2755,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); static void update_cfs_shares(struct sched_entity *se) { struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg; long shares; if (!cfs_rq) @@ -2771,13 +2763,14 @@ static void update_cfs_shares(struct sched_entity *se) if (throttled_hierarchy(cfs_rq)) return; - tg = cfs_rq->tg; - #ifndef CONFIG_SMP - if (likely(se->load.weight == tg->shares)) + shares = READ_ONCE(cfs_rq->tg->shares); + + if (likely(se->load.weight == shares)) return; +#else + shares = calc_cfs_shares(cfs_rq); #endif - shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } From cef27403cbe98ebda0a32d43128dd60c309eb966 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 May 2017 11:04:07 +0200 Subject: [PATCH 0221/1085] sched/fair: Add comment to calc_cfs_shares() Explain the magic equation in calc_cfs_shares() a bit better. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 048fbfb75523..dd565aeafc5a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2694,6 +2694,67 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP +/* + * All this does is approximate the hierarchical proportion which includes that + * global sum we all love to hate. + * + * That is, the weight of a group entity, is the proportional share of the + * group weight based on the group runqueue weights. 
That is: + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- (1) + * \Sum grq->load.weight + * + * Now, because that sum is prohibitively expensive to compute (been + * there, done that) we approximate it with this average stuff. The average + * moves slower and therefore the approximation is cheaper and more stable. + * + * So instead of the above, we substitute: + * + * grq->load.weight -> grq->avg.load_avg (2) + * + * which yields the following: + * + * tg->weight * grq->avg.load_avg + * ge->load.weight = ------------------------------ (3) + * tg->load_avg + * + * Where: tg->load_avg ~= \Sum grq->avg.load_avg + * + * That is shares_avg, and it is right (given the approximation (2)). + * + * The problem with it is that because the average is slow -- it was designed + * to be exactly that of course -- this leads to transients in boundary + * conditions. Specifically, the case where the group was idle and we start the + * one task. It takes time for our CPU's grq->avg.load_avg to build up, + * yielding bad latency etc.. + * + * Now, in that special case (1) reduces to: + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- = tg->weight (4) + * grq->load.weight + * + * That is, the sum collapses because all other CPUs are idle; the UP scenario. + * + * So what we do is modify our approximation (3) to approach (4) in the (near) + * UP case, like: + * + * ge->load.weight = + * + * tg->weight * grq->load.weight + * --------------------------------------------------- (5) + * tg->load_avg - grq->avg.load_avg + grq->load.weight + * + * + * And that is shares_weight and is icky. In the (near) UP case it approaches + * (4) while in the normal case it approaches (3). It consistently + * overestimates the ge->load.weight and therefore: + * + * \Sum ge->load.weight >= tg->weight + * + * hence icky! + */ static long calc_cfs_shares(struct cfs_rq *cfs_rq) { long tg_weight, tg_shares, load, shares; From 3d4b60d3e3dde6ea24e439000eb3b71078da81f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 May 2017 18:16:06 +0200 Subject: [PATCH 0222/1085] sched/fair: Cure calc_cfs_shares() vs. reweight_entity() Vincent reported that when running in a cgroup, his root cfs_rq->avg.load_avg dropped to 0 on task idle. This is because reweight_entity() will now immediately propagate the weight change of the group entity to its cfs_rq, and as it happens, our approximation (5) for calc_cfs_shares() results in 0 when the group is idle. Avoid this by using the correct (3) as a lower bound on (5). This way the empty cgroup will slowly decay instead of instantly dropping to 0. Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd565aeafc5a..63166a0ed854 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2763,11 +2763,10 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) tg_shares = READ_ONCE(tg->shares); /* - * This really should be: cfs_rq->avg.load_avg, but instead we use - * cfs_rq->load.weight, which is its upper bound. This helps ramp up - * the shares for small weight interactive tasks. + * Because (5) drops to 0 when the cfs_rq is idle, we need to use (3) + * as a lower bound.
 */ - load = scale_load_down(cfs_rq->load.weight); + load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); tg_weight = atomic_long_read(&tg->load_avg); From c7b50216818ef3dca14a52e3499750fbad2d9691 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 16:42:08 +0200 Subject: [PATCH 0223/1085] sched/fair: Remove se->load.weight from se->avg.load_sum Remove the load from the load_sum for sched_entities, basically turning load_sum into runnable_sum. This prepares for better reweighting of group entities. Since we now have different rules for computing load_avg, split ___update_load_avg() into two parts, ___update_load_sum() and ___update_load_avg(). So for se: ___update_load_sum(.weight = 1) ___update_load_avg(.weight = se->load.weight) and for cfs_rq: ___update_load_sum(.weight = cfs_rq->load.weight) ___update_load_avg(.weight = 1) Since the primary consumable is load_avg, most things will not be affected. Only those few sites that initialize/modify load_sum need attention. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 93 +++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 63166a0ed854..3b5b82345774 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -731,7 +731,7 @@ void init_entity_runnable_average(struct sched_entity *se) */ if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + sa->load_sum = LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -2025,7 +2025,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.load_sum / p->se.load.weight; + delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; } @@ -3018,8 +3018,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -___update_load_avg(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) +___update_load_sum(u64 now, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { u64 delta; @@ -3065,39 +3065,80 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) return 0; + return 1; +} + +static __always_inline void +___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq) +{ + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); + sa->load_avg = div_u64(weight * sa->load_sum, divider); if (cfs_rq) { cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); + div_u64(cfs_rq->runnable_load_sum, divider); } - sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); - - return 1; + sa->util_avg = sa->util_sum / divider; } +/* + * XXX we want to get rid of this helper and use the full load resolution.
+ */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +/* + * sched_entity: + * + * load_sum := runnable_sum + * load_avg = se_weight(se) * runnable_avg + * + * cfs_rq: + * + * load_sum = \Sum se_weight(se) * se->avg.load_sum + * load_avg = \Sum se->avg.load_avg + */ + static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); + if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) { + ___update_load_avg(&se->avg, se_weight(se), NULL); + return 1; + } + + return 0; } static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - return ___update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, + cfs_rq->curr == se, NULL)) { + + ___update_load_avg(&se->avg, se_weight(se), NULL); + return 1; + } + + return 0; } static int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { - return ___update_load_avg(now, cpu, &cfs_rq->avg, - scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq); + if (___update_load_sum(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq)) { + ___update_load_avg(&cfs_rq->avg, 1, cfs_rq); + return 1; + } + + return 0; } /* @@ -3267,7 +3308,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) /* Set new sched_entity's load */ se->avg.load_avg = load; - se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + se->avg.load_sum = LOAD_AVG_MAX; /* Update parent cfs_rq load */ add_positive(&cfs_rq->avg.load_avg, delta); @@ -3473,7 +3514,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s { se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se->avg.load_sum; + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; set_tg_cfs_propagate(cfs_rq); @@ -3493,7 +3534,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s { sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); set_tg_cfs_propagate(cfs_rq); @@ -3505,12 +3546,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct sched_avg *sa = &se->avg; + cfs_rq->runnable_load_avg += se->avg.load_avg; + cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; - cfs_rq->runnable_load_avg += sa->load_avg; - cfs_rq->runnable_load_sum += sa->load_sum; - - if (!sa->last_update_time) { + if (!se->avg.last_update_time) { attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); } @@ -3520,10 +3559,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->runnable_load_avg = - max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); - cfs_rq->runnable_load_sum = - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); + 
sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); } #ifndef CONFIG_64BIT From 88c0616ee729067ecb412bed76ef4a8734ea5100 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 17:32:43 +0200 Subject: [PATCH 0224/1085] sched/fair: Change update_load_avg() arguments Most call sites of update_load_avg() already have cfs_rq_of(se) available; pass it down instead of recomputing it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b5b82345774..a6c53580fea0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3480,9 +3480,8 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #define SKIP_AGE_LOAD 0x2 /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int flags) +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); @@ -3643,9 +3642,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 -static inline void update_load_avg(struct sched_entity *se, int not_used1) +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { - cfs_rq_util_change(cfs_rq_of(se)); + cfs_rq_util_change(cfs_rq); } static inline void @@ -3796,7 +3795,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); @@ -3880,7 +3879,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -3968,7 +3967,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -4070,7 +4069,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_load_avg(prev, 0); + update_load_avg(cfs_rq, prev, 0); } cfs_rq->curr = NULL; } @@ -4086,7 +4085,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated.
*/ - update_load_avg(curr, UPDATE_TG); + update_load_avg(cfs_rq, curr, UPDATE_TG); update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK @@ -5004,7 +5003,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); update_cfs_shares(se); } @@ -5063,7 +5062,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); update_cfs_shares(se); } @@ -7121,7 +7120,7 @@ static void update_blocked_averages(int cpu) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(se, 0); + update_load_avg(cfs_rq_of(se), se, 0); /* * There can be a lot of idle CPU cgroups. Don't let fully @@ -9295,7 +9294,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); } } #else @@ -9307,7 +9306,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* Catch up with the cfs_rq and remove our load when we leave */ - update_load_avg(se, 0); + update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); @@ -9326,7 +9325,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) #endif /* Synchronize entity with its cfs_rq */ - update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); @@ -9610,7 +9609,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); for_each_sched_entity(se) { - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); update_cfs_shares(se); } rq_unlock_irqrestore(rq, &rf); From b382a531b9fece229c8358a9fb79431cddcf93c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 17:37:03 +0200 Subject: [PATCH 0225/1085] sched/fair: Move enqueue migrate handling Move the entity migrate handling from enqueue_entity_load_avg() to update_load_avg(). This has two benefits: - {en,de}queue_entity_load_avg() will become purely about managing runnable_load - we can avoid a double update_tg_load_avg() and reduce pressure on the global tg->shares cacheline The reason we do this is so that we can change update_cfs_shares() to change both weight and (future) runnable_weight. For this to work we need to have the cfs_rq averages up-to-date (which means having done the attach), but we need the cfs_rq->avg.runnable_avg to not yet include the se's contribution (since se->on_rq == 0). 
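Roughly, the intended ordering in a stripped-down userspace sketch (stub types; update() and enqueue() are hypothetical stand-ins for update_load_avg() and enqueue_entity(), not the kernel code):

struct stub_avg { unsigned long load_avg; unsigned long long last_update_time; };
struct stub_cfs_rq { struct stub_avg avg; unsigned long runnable_load_avg; };
struct stub_se { struct stub_avg avg; int on_rq; };

/* Ages the cfs_rq averages, then attaches a freshly migrated entity,
 * all while se->on_rq is still 0. */
static void update(struct stub_cfs_rq *cfs_rq, struct stub_se *se, int do_attach)
{
	/* ... step 1: bring cfs_rq->avg up to 'now' ... */
	if (!se->avg.last_update_time && do_attach)
		cfs_rq->avg.load_avg += se->avg.load_avg;	/* attach */
}

static void enqueue(struct stub_cfs_rq *cfs_rq, struct stub_se *se)
{
	update(cfs_rq, se, 1);				/* UPDATE_TG | DO_ATTACH */
	cfs_rq->runnable_load_avg += se->avg.load_avg;	/* runnable accounting only */
	se->on_rq = 1;
}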
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 70 +++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a6c53580fea0..605b7c3e1ab8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3473,34 +3473,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) return decayed || removed_load; } -/* - * Optional action to be done while updating the load average - */ -#define UPDATE_TG 0x1 -#define SKIP_AGE_LOAD 0x2 - -/* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ - u64 now = cfs_rq_clock_task(cfs_rq); - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); - int decayed; - - /* - * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration - */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) - __update_load_avg_se(now, cpu, cfs_rq, se); - - decayed = update_cfs_rq_load_avg(now, cfs_rq); - decayed |= propagate_entity_load_avg(se); - - if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); -} - /** * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to @@ -3541,17 +3513,46 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq_util_change(cfs_rq); } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 +#define DO_ATTACH 0x4 + +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + u64 now = cfs_rq_clock_task(cfs_rq); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + int decayed; + + /* + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration + */ + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); + + decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed |= propagate_entity_load_avg(se); + + if (!se->avg.last_update_time && (flags & DO_ATTACH)) { + + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, 0); + + } else if (decayed && (flags & UPDATE_TG)) + update_tg_load_avg(cfs_rq, 0); +} + /* Add the load generated by se into cfs_rq's load average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->runnable_load_avg += se->avg.load_avg; cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; - - if (!se->avg.last_update_time) { - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, 0); - } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ @@ -3641,6 +3642,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 +#define DO_ATTACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -3795,7 +3797,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG | 
DO_ATTACH); enqueue_entity_load_avg(cfs_rq, se); update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); From b5b3e35f4149df72aaba612bba195fb2ec37b1b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Aug 2017 17:38:30 +0200 Subject: [PATCH 0226/1085] sched/fair: Rename {en,de}queue_entity_load_avg() Since they're now purely about runnable_load, rename them. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 605b7c3e1ab8..fad77c8454e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3549,7 +3549,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* Add the load generated by se into cfs_rq's load average */ static inline void -enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->runnable_load_avg += se->avg.load_avg; cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; @@ -3557,7 +3557,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void -dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); @@ -3650,9 +3650,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s } static inline void -enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void -dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void @@ -3798,7 +3798,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); - enqueue_entity_load_avg(cfs_rq, se); + enqueue_runnable_load_avg(cfs_rq, se); update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); @@ -3882,7 +3882,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * of its group cfs_rq. */ update_load_avg(cfs_rq, se, UPDATE_TG); - dequeue_entity_load_avg(cfs_rq, se); + dequeue_runnable_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); From 8d5b9025f9b4500f828260dc62e8ffa823ce0d59 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Aug 2017 17:45:35 +0200 Subject: [PATCH 0227/1085] sched/fair: Introduce {en,de}queue_load_avg() Analogous to the existing {en,de}queue_runnable_load_avg() add helpers for {en,de}queue_load_avg(). More users will follow. Includes some code movement to avoid fwd declarations. 
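For reference, the underflow clamp these helpers rely on behaves like this small compilable sketch (GNU C typeof; READ_ONCE()/WRITE_ONCE() reduced to plain accesses for a userspace build):

#include <stdio.h>

#define sub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(*ptr) val = (_val);				\
	typeof(*ptr) res, var = *ptr;				\
	res = var - val;					\
	if (res > var)	/* unsigned subtraction wrapped */	\
		res = 0;					\
	*ptr = res;						\
} while (0)

int main(void)
{
	unsigned long load_avg = 100;

	sub_positive(&load_avg, 70UL);	/* 100 - 70 = 30 */
	sub_positive(&load_avg, 70UL);	/* would wrap: clamped to 0 */
	printf("%lu\n", load_avg);	/* prints 0 */
	return 0;
}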
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 156 ++++++++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 70 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fad77c8454e4..654d8e3d6047 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2692,6 +2692,90 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_SMP +/* + * XXX we want to get rid of this helper and use the full load resolution. + */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline void +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->runnable_load_avg += se->avg.load_avg; + cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; +} + +static inline void +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); +} + +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; +} + +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); +} +#else +static inline void +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP /* @@ -3084,14 +3168,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cf sa->util_avg = sa->util_sum / divider; } -/* - * XXX we want to get rid of this helper and use the full load resolution. - */ -static inline long se_weight(struct sched_entity *se) -{ - return scale_load_down(se->load.weight); -} - /* * sched_entity: * @@ -3141,26 +3217,6 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) return 0; } -/* - * Signed add and clamp on underflow. 
- * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define add_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(_val) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - \ - res = var + val; \ - \ - if (val < 0 && res > var) \ - res = 0; \ - \ - WRITE_ONCE(*ptr, res); \ -} while (0) - #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -3405,23 +3461,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -/* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_task() @@ -3484,8 +3523,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { se->avg.last_update_time = cfs_rq->avg.last_update_time; - cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; + enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; set_tg_cfs_propagate(cfs_rq); @@ -3503,9 +3541,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); set_tg_cfs_propagate(cfs_rq); @@ -3547,22 +3583,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s update_tg_load_avg(cfs_rq, 0); } -/* Add the load generated by se into cfs_rq's load average */ -static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_load_avg += se->avg.load_avg; - cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; -} - -/* Remove the runnable load generated by se from cfs_rq's runnable load average */ -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); -} - #ifndef CONFIG_64BIT static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { @@ -3649,10 +3669,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq_util_change(cfs_rq); } -static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void From 840c5abca499a858619954dbcffc82110bb6e076 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 
6 May 2017 16:11:34 +0200 Subject: [PATCH 0228/1085] sched/fair: More accurate reweight_entity() When a (group) entity changes its weight, we should instantly change its load_avg and propagate that change into the sums it is part of, because we use these values to predict future behaviour and are not interested in their historical values. Without this change, the change in load would need to propagate through the average, by which time it could again have changed etc.. always chasing itself. With this change, the cfs_rq load_avg sum will more accurately reflect the current runnable and expected return of blocked load. Reported-by: Paul Turner [josef: compile fix !SMP || !FAIR_GROUP] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 654d8e3d6047..750ae4dbf812 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2886,12 +2886,22 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (cfs_rq->curr == se) update_curr(cfs_rq); account_entity_dequeue(cfs_rq, se); + dequeue_runnable_load_avg(cfs_rq, se); } + dequeue_load_avg(cfs_rq, se); update_load_set(&se->load, weight); - if (se->on_rq) +#ifdef CONFIG_SMP + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, + LOAD_AVG_MAX - 1024 + se->avg.period_contrib); +#endif + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) { account_entity_enqueue(cfs_rq, se); + enqueue_runnable_load_avg(cfs_rq, se); + } } static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); From 9059393e4ec1c8c6623a120b405ef2c90b968d80 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 17 May 2017 11:50:45 +0200 Subject: [PATCH 0229/1085] sched/fair: Use reweight_entity() for set_user_nice() Now that we directly change load_avg and propagate that change into the sums, sys_nice() and co should do the same, otherwise it's possible to confuse load accounting when we migrate near the weight change. Fixes-by: Josef Bacik Signed-off-by: Vincent Guittot [ Added changelog, fixed the call condition.
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170517095045.GA8420@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 +++++++++++----- kernel/sched/fair.c | 63 ++++++++++++++++++++++++++------------------ kernel/sched/sched.h | 2 ++ 3 files changed, 54 insertions(+), 33 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..2288a145bf01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -733,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -747,8 +747,16 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; + /* + * SCHED_OTHER tasks have to update their load when changing their + * weight + */ + if (update_load && p->sched_class == &fair_sched_class) { + reweight_task(p, prio); + } else { + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; + } } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -2358,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = __normal_prio(p); - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -3805,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); delta = p->prio - old_prio; @@ -3962,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* Actually do priority change: must hold pi & rq lock. 
 */ @@ -5933,7 +5941,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 750ae4dbf812..d8c02b31498d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2776,6 +2776,43 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + dequeue_runnable_load_avg(cfs_rq, se); + } + dequeue_load_avg(cfs_rq, se); + + update_load_set(&se->load, weight); + +#ifdef CONFIG_SMP + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, + LOAD_AVG_MAX - 1024 + se->avg.period_contrib); +#endif + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) { + account_entity_enqueue(cfs_rq, se); + enqueue_runnable_load_avg(cfs_rq, se); + } +} + +void reweight_task(struct task_struct *p, int prio) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct load_weight *load = &se->load; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + reweight_entity(cfs_rq, se, weight); + load->inv_weight = sched_prio_to_wmult[prio]; +} + #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP /* @@ -2878,32 +2915,6 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) } # endif /* CONFIG_SMP */ -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) -{ - if (se->on_rq) { - /* commit outstanding execution time */ - if (cfs_rq->curr == se) - update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); - dequeue_runnable_load_avg(cfs_rq, se); - } - dequeue_load_avg(cfs_rq, se); - - update_load_set(&se->load, weight); - -#ifdef CONFIG_SMP - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, - LOAD_AVG_MAX - 1024 + se->avg.period_contrib); -#endif - - enqueue_load_avg(cfs_rq, se); - if (se->on_rq) { - account_entity_enqueue(cfs_rq, se); - enqueue_runnable_load_avg(cfs_rq, se); - } -} - static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); static void update_cfs_shares(struct sched_entity *se) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14db76cd496f..a5d97460ee4e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1529,6 +1529,8 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void reweight_task(struct task_struct *p, int prio); + extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); From 2a2f5d4e44ed160a5ed822c94e04f918f9fbb487 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 May 2017 16:51:41 +0200 Subject: [PATCH 0230/1085] sched/fair: Rewrite cfs_rq->removed_*avg Since on wakeup migration we don't hold the rq->lock for the old CPU we cannot update its state. Instead we add the removed 'load' to an atomic variable and have the next update on that CPU collect and process it. Currently we have 2 atomic variables, which already have the issue that they can be read out-of-sync. Also, two atomic ops on a single cacheline are already more expensive than an uncontended lock. Since we want to add more, convert the thing over to an explicit cacheline with a lock in it.
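The collect-under-one-lock pattern, sketched for userspace (a pthread mutex in place of the raw spinlock; struct and function names are illustrative only):

#include <pthread.h>

struct removed_acc {
	pthread_mutex_t lock;
	int nr;
	unsigned long load_avg;
	unsigned long util_avg;
};

static struct removed_acc removed = { .lock = PTHREAD_MUTEX_INITIALIZER };

/* Remote writers accumulate under the lock. */
static void remove_load(unsigned long load, unsigned long util)
{
	pthread_mutex_lock(&removed.lock);
	removed.nr++;
	removed.load_avg += load;
	removed.util_avg += util;
	pthread_mutex_unlock(&removed.lock);
}

/* The owner drains everything in one critical section, so load and
 * util can never be observed out of sync (unlike two independent
 * atomics). */
static int collect(unsigned long *load, unsigned long *util)
{
	int nr;

	pthread_mutex_lock(&removed.lock);
	nr = removed.nr;
	*load = removed.load_avg;
	*util = removed.util_avg;
	removed.nr = 0;
	removed.load_avg = removed.util_avg = 0;
	pthread_mutex_unlock(&removed.lock);
	return nr;
}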
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 8 +++---- kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++-------------- kernel/sched/sched.h | 13 +++++++---- 3 files changed, 48 insertions(+), 24 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f93e4a2d9f6..2f22342c48ff 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -564,10 +564,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", - atomic_long_read(&cfs_rq->removed_load_avg)); - SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", - atomic_long_read(&cfs_rq->removed_util_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", + cfs_rq->removed.load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", + cfs_rq->removed.util_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8c02b31498d..fe4a66b10480 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3501,36 +3501,47 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { + unsigned long removed_load = 0, removed_util = 0; struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed_load = 0, removed_util = 0; + int decayed = 0; - if (atomic_long_read(&cfs_rq->removed_load_avg)) { - s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + if (cfs_rq->removed.nr) { + unsigned long r; + + raw_spin_lock(&cfs_rq->removed.lock); + swap(cfs_rq->removed.util_avg, removed_util); + swap(cfs_rq->removed.load_avg, removed_load); + cfs_rq->removed.nr = 0; + raw_spin_unlock(&cfs_rq->removed.lock); + + /* + * The LOAD_AVG_MAX for _sum is a slight over-estimate, + * which is safe due to sub_positive() clipping at 0. + */ + r = removed_load; sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); - removed_load = 1; - set_tg_cfs_propagate(cfs_rq); - } - if (atomic_long_read(&cfs_rq->removed_util_avg)) { - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + r = removed_util; sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); - removed_util = 1; + set_tg_cfs_propagate(cfs_rq); + + decayed = 1; } - decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); + decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed || removed_util) + if (decayed) cfs_rq_util_change(cfs_rq); - return decayed || removed_load; + return decayed; } /** @@ -3645,6 +3656,7 @@ void sync_entity_load_avg(struct sched_entity *se) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); + unsigned long flags; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3654,11 +3666,19 @@ void remove_entity_load_avg(struct sched_entity *se) * Similarly for groups, they will have passed through * post_init_entity_util_avg() before unregister_sched_fair_group() * calls this. 
+ * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING + * we could actually get the right time, since we're called with + * rq->lock held, see detach_task(). */ sync_entity_load_avg(se); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); + ++cfs_rq->removed.nr; + cfs_rq->removed.util_avg += se->avg.util_avg; + cfs_rq->removed.load_avg += se->avg.load_avg; + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) @@ -9449,8 +9469,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->propagate_avg = 0; #endif - atomic_long_set(&cfs_rq->removed_load_avg, 0); - atomic_long_set(&cfs_rq->removed_util_avg, 0); + raw_spin_lock_init(&cfs_rq->removed.lock); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a5d97460ee4e..2fd350a12bb7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -445,14 +445,19 @@ struct cfs_rq { struct sched_avg avg; u64 runnable_load_sum; unsigned long runnable_load_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; unsigned long propagate_avg; #endif - atomic_long_t removed_load_avg, removed_util_avg; -#ifndef CONFIG_64BIT - u64 load_last_update_time_copy; -#endif + struct { + raw_spinlock_t lock ____cacheline_aligned; + int nr; + unsigned long load_avg; + unsigned long util_avg; + } removed; #ifdef CONFIG_FAIR_GROUP_SCHED /* From 0e2d2aaaae52c247c047d14999b93486bdbd3431 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 May 2017 17:30:46 +0200 Subject: [PATCH 0231/1085] sched/fair: Rewrite PELT migration propagation When an entity migrates into (or out of) a runqueue, we need to add (or remove) its contribution from the entire PELT hierarchy, because even non-runnable entities are included in the load average sums. In order to do this we have some propagation logic that updates the PELT tree; however, the way it 'propagates' the runnable (or load) change is (more or less): tg->weight * grq->avg.load_avg ge->avg.load_avg = ------------------------------ tg->load_avg But that is the expression for ge->weight, and per the definition of load_avg: ge->avg.load_avg := ge->weight * ge->avg.runnable_avg That destroys the runnable_avg (by setting it to 1) we wanted to propagate. Instead directly propagate runnable_sum.
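In differential form the new scheme is just a weight-scaled transfer of the runnable sum, roughly as in this standalone sketch (LOAD_AVG_MAX as in the kernel's PELT tables; the function name and parameter list are made up for illustration):

#define LOAD_AVG_MAX	47742	/* maximum attainable PELT sum */

static void propagate_runnable(long delta_runnable_sum, long se_weight,
			       long *se_load_sum, long *se_load_avg,
			       long *rq_load_sum, long *rq_load_avg)
{
	long load_sum = se_weight * delta_runnable_sum;
	long load_avg = load_sum / LOAD_AVG_MAX;

	/* The entity tracks a pure runnable sum... */
	*se_load_sum += delta_runnable_sum;
	*se_load_avg += load_avg;

	/* ...while the runqueue sums keep the weight folded in. */
	*rq_load_sum += load_sum;
	*rq_load_avg += load_avg;
}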
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 + kernel/sched/fair.c | 188 ++++++++++++++++++++++++------------------- kernel/sched/sched.h | 9 ++- 3 files changed, 113 insertions(+), 86 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f22342c48ff..2e039a81864c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -568,6 +568,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", cfs_rq->removed.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", + cfs_rq->removed.runnable_sum); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe4a66b10480..086a5d979720 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3319,11 +3319,77 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } -/* Take into account change of utilization of a child task group */ + +/* + * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to + * propagate its contribution. The key to this propagation is the invariant + * that for each group: + * + * ge->avg == grq->avg (1) + * + * _IFF_ we look at the pure running and runnable sums. Because they + * represent the very same entity, just at different points in the hierarchy. + * + * + * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and + * simply copies the running sum over. + * + * However, update_tg_cfs_runnable() is more complex. So we have: + * + * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) + * + * And since, like util, the runnable part should be directly transferable, + * the following would _appear_ to be the straightforward approach: + * + * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) + * + * And per (1) we have: + * + * ge->avg.runnable_avg == grq->avg.runnable_avg + * + * Which gives: + * + * ge->load.weight * grq->avg.load_avg + * ge->avg.load_avg = ----------------------------------- (4) + * grq->load.weight + * + * Except that is wrong! + * + * Because while for entities historical weight is not important and we + * really only care about our future and therefore can consider a pure + * runnable sum, runqueues can NOT do this. + * + * We specifically want runqueues to have a load_avg that includes + * historical weights. Those represent the blocked load, the load we expect + * to (shortly) return to us. This only works by keeping the weights as + * integral part of the sum. We therefore cannot decompose as per (3). + * + * OK, so what then? + * + * + * Another way to look at things is: + * + * grq->avg.load_avg = \Sum se->avg.load_avg + * + * Therefore, per (2): + * + * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg + * + * And the very thing we're propagating is a change in that sum (someone + * joined/left). So we can easily know the runnable change, which would be, per + * (2) the already tracked se->load_avg divided by the corresponding + * se->weight.
+ * + * Basically (4) but in differential form: + * + * d(runnable_avg) += se->avg.load_avg / se->load.weight + * (5) + * ge->avg.load_avg += ge->load.weight * d(runnable_avg) + */ + static inline void -update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - struct cfs_rq *gcfs_rq = group_cfs_rq(se); long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; /* Nothing to update */ @@ -3339,102 +3405,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; } -/* Take into account change of load of a child task group */ static inline void -update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long delta, load = gcfs_rq->avg.load_avg; + long runnable_sum = gcfs_rq->prop_runnable_sum; + long load_avg; + s64 load_sum; - /* - * If the load of group cfs_rq is null, the load of the - * sched_entity will also be null so we can skip the formula - */ - if (load) { - long tg_load; - - /* Get tg's load and ensure tg_load > 0 */ - tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; - - /* Ensure tg_load >= load and updated with current load*/ - tg_load -= gcfs_rq->tg_load_avg_contrib; - tg_load += load; - - /* - * We need to compute a correction term in the case that the - * task group is consuming more CPU than a task of equal - * weight. A task with a weight equals to tg->shares will have - * a load less or equal to scale_load_down(tg->shares). - * Similarly, the sched_entities that represent the task group - * at parent level, can't have a load higher than - * scale_load_down(tg->shares). And the Sum of sched_entities' - * load must be <= scale_load_down(tg->shares). - */ - if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { - /* scale gcfs_rq's load into tg's shares*/ - load *= scale_load_down(gcfs_rq->tg->shares); - load /= tg_load; - } - } - - delta = load - se->avg.load_avg; - - /* Nothing to update */ - if (!delta) + if (!runnable_sum) return; - /* Set new sched_entity's load */ - se->avg.load_avg = load; - se->avg.load_sum = LOAD_AVG_MAX; + gcfs_rq->prop_runnable_sum = 0; - /* Update parent cfs_rq load */ - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + load_sum = (s64)se_weight(se) * runnable_sum; + load_avg = div_s64(load_sum, LOAD_AVG_MAX); + + add_positive(&se->avg.load_sum, runnable_sum); + add_positive(&se->avg.load_avg, load_avg); + + add_positive(&cfs_rq->avg.load_avg, load_avg); + add_positive(&cfs_rq->avg.load_sum, load_sum); - /* - * If the sched_entity is already enqueued, we also have to update the - * runnable load avg. 
- */ if (se->on_rq) { - /* Update parent cfs_rq runnable_load_avg */ - add_positive(&cfs_rq->runnable_load_avg, delta); - cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + add_positive(&cfs_rq->runnable_load_avg, load_avg); + add_positive(&cfs_rq->runnable_load_sum, load_sum); } } -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) { - cfs_rq->propagate_avg = 1; -} - -static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - - if (!cfs_rq->propagate_avg) - return 0; - - cfs_rq->propagate_avg = 0; - return 1; + cfs_rq->propagate = 1; + cfs_rq->prop_runnable_sum += runnable_sum; } /* Update task and its cfs_rq load average */ static inline int propagate_entity_load_avg(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *gcfs_rq; if (entity_is_task(se)) return 0; - if (!test_and_clear_tg_cfs_propagate(se)) + gcfs_rq = group_cfs_rq(se); + if (!gcfs_rq->propagate) return 0; + gcfs_rq->propagate = 0; + cfs_rq = cfs_rq_of(se); - set_tg_cfs_propagate(cfs_rq); + add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); - update_tg_cfs_util(cfs_rq, se); - update_tg_cfs_load(cfs_rq, se); + update_tg_cfs_util(cfs_rq, se, gcfs_rq); + update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); return 1; } @@ -3458,7 +3481,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) * If there is a pending propagation, we have to update the load and * the utilization of the sched_entity: */ - if (gcfs_rq->propagate_avg) + if (gcfs_rq->propagate) return false; /* @@ -3478,7 +3501,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 0; } -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3501,7 +3524,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - unsigned long removed_load = 0, removed_util = 0; + unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; @@ -3511,6 +3534,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); + swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); @@ -3526,7 +3550,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); - set_tg_cfs_propagate(cfs_rq); + add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); decayed = 1; } @@ -3558,7 +3582,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; - set_tg_cfs_propagate(cfs_rq); + + add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); cfs_rq_util_change(cfs_rq); } @@ -3576,7 +3601,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - set_tg_cfs_propagate(cfs_rq); + + add_tg_cfs_propagate(cfs_rq, 
-se->avg.load_sum); cfs_rq_util_change(cfs_rq); } @@ -3678,6 +3704,7 @@ void remove_entity_load_avg(struct sched_entity *se) ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; cfs_rq->removed.load_avg += se->avg.load_avg; + cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } @@ -9466,9 +9493,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->propagate_avg = 0; -#endif raw_spin_lock_init(&cfs_rq->removed.lock); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2fd350a12bb7..5bcb86eb026b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -447,19 +447,20 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; -#endif -#ifdef CONFIG_FAIR_GROUP_SCHED - unsigned long tg_load_avg_contrib; - unsigned long propagate_avg; #endif struct { raw_spinlock_t lock ____cacheline_aligned; int nr; unsigned long load_avg; unsigned long util_avg; + unsigned long runnable_sum; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; + /* * h_load = weight * f(tg) * From 1ea6c46a23f1213d1972bfae220db5c165e27bba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 15:59:54 +0200 Subject: [PATCH 0232/1085] sched/fair: Propagate an effective runnable_load_avg The load balancer uses runnable_load_avg as load indicator. For !cgroup this is: runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq That is, a direct sum of all runnable tasks on that runqueue. As opposed to load_avg, which is a sum of all tasks on the runqueue, which includes a blocked component. However, in the cgroup case, this comes apart since the group entities are always runnable, even if most of their constituent entities are blocked. Therefore introduce a runnable_weight which for task entities is the same as the regular weight, but for group entities is a fraction of the entity weight and represents the runnable part of the group runqueue. Then propagate this load through the PELT hierarchy to arrive at an effective runnable load average -- which we should not confuse with the canonical runnable load average.
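The fraction in question can be written as a standalone helper along these lines (hypothetical name; it mirrors the update_cfs_group() computation in the diff below, assuming shares and the averages fit in an unsigned long without overflow):

static unsigned long group_runnable_weight(unsigned long shares,
					   unsigned long runnable_load_avg,
					   unsigned long load_avg)
{
	unsigned long runnable = shares * runnable_load_avg;
	/* max() guards the sporadic runnable > load case. */
	unsigned long divider = load_avg > runnable_load_avg ?
				load_avg : runnable_load_avg;

	if (runnable)
		runnable /= divider;
	return runnable;
}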
Suggested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 + kernel/sched/debug.c | 8 +- kernel/sched/fair.c | 172 +++++++++++++++++++++++++++--------------- kernel/sched/sched.h | 3 +- 4 files changed, 124 insertions(+), 62 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a7df4e558c..bdd6ad6fcce1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -331,9 +331,11 @@ struct load_weight { struct sched_avg { u64 last_update_time; u64 load_sum; + u64 runnable_load_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; + unsigned long runnable_load_avg; unsigned long util_avg; }; @@ -376,6 +378,7 @@ struct sched_statistics { struct sched_entity { /* For load-balancing: */ struct load_weight load; + unsigned long runnable_weight; struct rb_node run_node; struct list_head group_node; unsigned int on_rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2e039a81864c..1ca0130ed4f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P_SCHEDSTAT(se->statistics.wait_count); } P(se->load.weight); + P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); + P(se->avg.runnable_load_avg); #endif #undef PN_SCHEDSTAT @@ -558,10 +560,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); + cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", @@ -1006,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); + P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); + P(se.avg.runnable_load_sum); P(se.avg.util_sum); P(se.avg.load_avg); + P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 086a5d979720..10d2000fca2d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -730,8 +730,9 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = LOAD_AVG_MAX; + sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); + sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX; + /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -2731,25 +2732,35 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SMP /* - * XXX we want to get rid of this helper and use the full load resolution. + * XXX we want to get rid of these helpers and use the full load resolution. 
*/ static inline long se_weight(struct sched_entity *se) { return scale_load_down(se->load.weight); } +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->runnable_load_avg += se->avg.load_avg; - cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; + cfs_rq->runnable_weight += se->runnable_weight; + + cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; + cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; } static inline void dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); + cfs_rq->runnable_weight -= se->runnable_weight; + + sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); + sub_positive(&cfs_rq->avg.runnable_load_sum, + se_runnable(se) * se->avg.runnable_load_sum); } static inline void @@ -2777,7 +2788,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) + unsigned long weight, unsigned long runnable) { if (se->on_rq) { /* commit outstanding execution time */ @@ -2788,11 +2799,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } dequeue_load_avg(cfs_rq, se); + se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, - LOAD_AVG_MAX - 1024 + se->avg.period_contrib); + do { + u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + se->avg.runnable_load_avg = + div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); + } while (0); #endif enqueue_load_avg(cfs_rq, se); @@ -2809,7 +2826,7 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); + reweight_entity(cfs_rq, se, weight, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -2917,31 +2934,45 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct sched_entity *se) +/* + * Recomputes the group entity based on the current state of its group + * runqueue. + */ +static void update_cfs_group(struct sched_entity *se) { - struct cfs_rq *cfs_rq = group_cfs_rq(se); - long shares; + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long shares, runnable; - if (!cfs_rq) + if (!gcfs_rq) return; - if (throttled_hierarchy(cfs_rq)) + if (throttled_hierarchy(gcfs_rq)) return; #ifndef CONFIG_SMP - shares = READ_ONCE(cfs_rq->tg->shares); + runnable = shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else - shares = calc_cfs_shares(cfs_rq); + shares = calc_cfs_shares(gcfs_rq); + /* + * The hierarchical runnable load metric is the proportional part + * of this group's runnable_load_avg / load_avg. + * + * Note: we need to deal with very sporadic 'runnable > load' cases + * due to numerical instability. 
+ */ + runnable = shares * gcfs_rq->avg.runnable_load_avg; + if (runnable) + runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg); #endif - reweight_entity(cfs_rq_of(se), se, shares); + reweight_entity(cfs_rq_of(se), se, shares, runnable); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct sched_entity *se) +static inline void update_cfs_group(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3050,7 +3081,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) */ static __always_inline u32 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ @@ -3067,10 +3098,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods); - } + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -3083,11 +3112,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, sa->period_contrib = delta; contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib * scale_cpu; @@ -3124,7 +3152,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ static __always_inline int ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -3157,8 +3185,8 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * this happens during idle_balance() which calls * update_blocked_averages() */ - if (!weight) - running = 0; + if (!load) + runnable = running = 0; /* * Now we know we crossed measurement unit boundaries. The *_avg @@ -3167,45 +3195,60 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) return 0; return 1; } static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq) +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) { u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; /* * Step 2: update *_avg. 
*/ - sa->load_avg = div_u64(weight * sa->load_sum, divider); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, divider); - } + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); sa->util_avg = sa->util_sum / divider; } /* * sched_entity: * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * * load_sum := runnable_sum * load_avg = se_weight(se) * runnable_avg * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * * cfq_rs: * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg */ static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3215,10 +3258,13 @@ __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, - cfs_rq->curr == se, NULL)) { + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; - ___update_load_avg(&se->avg, se_weight(se), NULL); + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { + + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3230,8 +3276,10 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, cpu, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq)) { - ___update_load_avg(&cfs_rq->avg, 1, cfs_rq); + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); return 1; } @@ -3409,8 +3457,8 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long runnable_sum = gcfs_rq->prop_runnable_sum; - long load_avg; - s64 load_sum; + long runnable_load_avg, load_avg; + s64 runnable_load_sum, load_sum; if (!runnable_sum) return; @@ -3426,9 +3474,15 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf add_positive(&cfs_rq->avg.load_avg, load_avg); add_positive(&cfs_rq->avg.load_sum, load_sum); + runnable_load_sum = (s64)se_runnable(se) * runnable_sum; + runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + + add_positive(&se->avg.runnable_load_sum, runnable_sum); + add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + if (se->on_rq) { - add_positive(&cfs_rq->runnable_load_avg, load_avg); - add_positive(&cfs_rq->runnable_load_sum, load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); } } @@ -3710,7 +3764,7 @@ void remove_entity_load_avg(struct sched_entity *se) static inline unsigned long cfs_rq_runnable_load_avg(struct 
cfs_rq *cfs_rq) { - return cfs_rq->runnable_load_avg; + return cfs_rq->avg.runnable_load_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3882,8 +3936,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + update_cfs_group(se); enqueue_runnable_load_avg(cfs_rq, se); - update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -3989,7 +4043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(se); + update_cfs_group(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -4172,7 +4226,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(cfs_rq, curr, UPDATE_TG); - update_cfs_shares(curr); + update_cfs_group(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -5090,7 +5144,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -5149,7 +5203,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -7174,7 +7228,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->runnable_load_sum) + if (cfs_rq->avg.runnable_load_sum) return false; return true; @@ -9692,7 +9746,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(cfs_rq_of(se), se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } rq_unlock_irqrestore(rq, &rf); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5bcb86eb026b..e83d1b8be611 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -418,6 +418,7 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; + unsigned long runnable_weight; unsigned int nr_running, h_nr_running; u64 exec_clock; @@ -443,8 +444,6 @@ struct cfs_rq { * CFS load tracking */ struct sched_avg avg; - u64 runnable_load_sum; - unsigned long runnable_load_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; #endif From 144d8487bc6e9b741895709cb46d4e19b748a725 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 May 2017 17:57:24 +0200 Subject: [PATCH 0233/1085] sched/fair: Implement synchronous PELT detach on load-balance migrate Vincent wondered why his self-migrating task had a roughly 50% dip in load_avg when landing on the new CPU. This is because we unconditionally take the asynchronous detach_entity route, which can lead to the attach on the new CPU still seeing the old CPU's contribution to tg->load_avg, effectively halving the new CPU's shares. While in general this is something we have to live with, there is the special case of runnable migration where we can do better.
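[Editor's worked example of the roughly-50% figure, reusing the ge->load.weight approximation from the previous patch's comments: while the old CPU's runqueue still contributes the task's load L to tg->load_avg, the attach on the new CPU computes ge->load.weight ~= tg->weight * grq->load_avg / tg->load_avg ~= tg->weight * L / (L + L) = tg->weight / 2, i.e. the group entity on the new CPU gets about half its due shares until the asynchronous removal finally drops the stale contribution.]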
Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10d2000fca2d..92dbcc0fea46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3746,10 +3746,6 @@ void remove_entity_load_avg(struct sched_entity *se) * Similarly for groups, they will have passed through * post_init_entity_util_avg() before unregister_sched_fair_group() * calls this. - * - * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING - * we could actually get the right time, since we're called with - * rq->lock held, see detach_task(). */ sync_entity_load_avg(se); @@ -6292,6 +6288,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return new_cpu; } +static void detach_entity_cfs_rq(struct sched_entity *se); + /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -6325,14 +6323,25 @@ static void migrate_task_rq_fair(struct task_struct *p) se->vruntime -= min_vruntime; } - /* - * We are supposed to update the task to "current" time, then its up to date - * and ready to go to new CPU/cfs_rq. But we have difficulty in getting - * what current time is, so simply throw away the out-of-date time. This - * will result in the wakee task is less decayed, but giving the wakee more - * load sounds not bad. - */ - remove_entity_load_avg(&p->se); + if (p->on_rq == TASK_ON_RQ_MIGRATING) { + /* + * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' + * rq->lock and can modify state directly. + */ + lockdep_assert_held(&task_rq(p)->lock); + detach_entity_cfs_rq(&p->se); + + } else { + /* + * We are supposed to update the task to "current" time, then + * its up to date and ready to go to new CPU/cfs_rq. But we + * have difficulty in getting what current time is, so simply + * throw away the out-of-date time. This will result in the + * wakee task is less decayed, but giving the wakee more load + * sounds not bad. + */ + remove_entity_load_avg(&p->se); + } /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; From f207934fb79d1af1de1a62b09d56a3a1914172c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 May 2017 14:16:30 +0200 Subject: [PATCH 0234/1085] sched/fair: Align PELT windows between cfs_rq and its se The PELT _sum values are a saw-tooth function, dropping on the decay edge and then growing back up again during the window. When these window-edges are not aligned between cfs_rq and se, we can have the situation where, for example, on dequeue, the se decays first. Its _sum values will be small(er), while the cfs_rq _sum values will still be on their way up. Because of this, the subtraction: cfs_rq->avg._sum -= se->avg._sum will result in a positive value. This will then, once the cfs_rq reaches an edge, translate into its _avg value jumping up. This is especially visible with the runnable_load bits, since they get added/subtracted a lot. 
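[Editor's numeric sketch of the misalignment: suppose the cfs_rq is halfway through its 1024us window (period_contrib = 512) while the se has just crossed its own decay edge (period_contrib ~= 0). The se's freshly decayed _sum is near its minimum while the cfs_rq's is halfway back up, so cfs_rq->avg._sum -= se->avg._sum leaves a positive residue that surfaces as an upward jump in _avg at the cfs_rq's next edge. The diff below therefore copies last_update_time and period_contrib from the cfs_rq on attach and rescales the se's _sum values against the shared divider LOAD_AVG_MAX - 1024 + period_contrib.]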
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 92dbcc0fea46..954b332cd899 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -716,13 +716,8 @@ void init_entity_runnable_average(struct sched_entity *se) { struct sched_avg *sa = &se->avg; - sa->last_update_time = 0; - /* - * sched_avg's period_contrib should be strictly less then 1024, so - * we give it 1023 to make sure it is almost a period (1024us), and - * will definitely be update (after enqueue). - */ - sa->period_contrib = 1023; + memset(sa, 0, sizeof(*sa)); + /* * Tasks are intialized with full load to be seen as heavy tasks until * they get a chance to stabilize to their real load level. @@ -731,13 +726,9 @@ void init_entity_runnable_average(struct sched_entity *se) */ if (entity_is_task(se)) sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); - sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX; - /* - * At this point, util_avg won't be used in select_task_rq_fair anyway - */ - sa->util_avg = 0; - sa->util_sum = 0; + se->runnable_weight = se->load.weight; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -785,7 +776,6 @@ void post_init_entity_util_avg(struct sched_entity *se) } else { sa->util_avg = cap; } - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } if (entity_is_task(se)) { @@ -3632,7 +3622,34 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + + /* + * When we attach the @se to the @cfs_rq, we must align the decay + * window because without that, really weird and wonderful things can + * happen. + * + * XXX illustrate + */ se->avg.last_update_time = cfs_rq->avg.last_update_time; + se->avg.period_contrib = cfs_rq->avg.period_contrib; + + /* + * Hell(o) Nasty stuff.. we need to recompute _sum based on the new + * period_contrib. This isn't strictly correct, but since we're + * entirely outside of the PELT hierarchy, nobody cares if we truncate + * _sum a little. + */ + se->avg.util_sum = se->avg.util_avg * divider; + + se->avg.load_sum = divider; + if (se_weight(se)) { + se->avg.load_sum = + div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); + } + + se->avg.runnable_load_sum = se->avg.load_sum; + enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; From 9a2dd585b2c431ec1e5d46a9d9568291c7a534cc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 May 2017 14:18:10 +0200 Subject: [PATCH 0235/1085] sched/fair: Implement more accurate async detach The problem with the overestimate is that it will subtract too big a value from the load_sum, thereby pushing it down further than it ought to go. Since runnable_load_avg is not subject to a similar 'force', this results in the occasional 'runnable_load > load' situation. 
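[Editor's numeric example, assuming the standard PELT constant LOAD_AVG_MAX = 47742 and a hypothetical period_contrib = 512: the divider becomes 47742 - 1024 + 512 = 47230, so removing a dead task with load_avg = 100 now subtracts 100 * 47230 from load_sum instead of 100 * 47742, avoiding the over-subtraction that could push load_sum below runnable_load_sum.]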
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 954b332cd899..67c39642a512 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3574,6 +3574,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq->removed.nr) { unsigned long r; + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); @@ -3582,17 +3583,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); - /* - * The LOAD_AVG_MAX for _sum is a slight over-estimate, - * which is safe due to sub_positive() clipping at 0. - */ r = removed_load; sub_positive(&sa->load_avg, r); - sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); + sub_positive(&sa->load_sum, r * divider); r = removed_util; sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + sub_positive(&sa->util_sum, r * divider); add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); From 2c8e4dce7963d2bae02db95fce2691365630685c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Aug 2017 11:13:39 -0400 Subject: [PATCH 0236/1085] sched/fair: Calculate runnable_weight slightly differently Our runnable_weight currently looks like this runnable_weight = shares * runnable_load_avg / load_avg The goal is to scale the runnable weight for the group based on its runnable to load_avg ratio. The problem with this is it biases us towards tasks that never go to sleep. Tasks that go to sleep are going to have their runnable_load_avg decayed pretty hard, which will drastically reduce the runnable weight of groups with interactive tasks. To solve this imbalance we tweak this slightly, so in the ideal case it is still the above, but in the interactive case it is runnable_weight = shares * runnable_weight / load_weight which will make the weight distribution fairer between interactive and non-interactive groups. Signed-off-by: Josef Bacik Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linux-kernel@vger.kernel.org Cc: riel@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1501773219-18774-2-git-send-email-jbacik@fb.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 67c39642a512..a62098ec9deb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2883,7 +2883,7 @@ void reweight_task(struct task_struct *p, int prio) * * hence icky! */ -static long calc_cfs_shares(struct cfs_rq *cfs_rq) +static long calc_group_shares(struct cfs_rq *cfs_rq) { long tg_weight, tg_shares, load, shares; struct task_group *tg = cfs_rq->tg; @@ -2920,6 +2920,36 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } + +/* + * The runnable shares of this group are calculated as such + * + * max(cfs_rq->avg.runnable_load_avg, cfs_rq->runnable_weight) + * shares * ------------------------------------------------------------ + * max(cfs_rq->avg.load_avg, cfs_rq->load.weight) + * + * We do this to keep the shares in line with expected load on the cfs_rq. 
+ * Consider a cfs_rq that has several tasks wake up on this cfs_rq for the first + * time, it's runnable_load_avg is not going to be representative of the actual + * load this cfs_rq will now experience, which will bias us agaisnt this cfs_rq. + * The weight on the cfs_rq is the immediate effect of having new tasks + * enqueue'd onto it which should be used to calculate the new runnable shares. + * At the same time we need the actual load_avg to be the lower bounds for the + * calculation, to handle when our weight drops quickly from having entities + * dequeued. + */ +static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) +{ + long load_avg = max(cfs_rq->avg.load_avg, + scale_load_down(cfs_rq->load.weight)); + long runnable = max(cfs_rq->avg.runnable_load_avg, + scale_load_down(cfs_rq->runnable_weight)); + + runnable *= shares; + if (load_avg) + runnable /= load_avg; + return clamp_t(long, runnable, MIN_SHARES, shares); +} # endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -2945,17 +2975,8 @@ static void update_cfs_group(struct sched_entity *se) if (likely(se->load.weight == shares)) return; #else - shares = calc_cfs_shares(gcfs_rq); - /* - * The hierarchical runnable load metric is the proportional part - * of this group's runnable_load_avg / load_avg. - * - * Note: we need to deal with very sporadic 'runnable > load' cases - * due to numerical instability. - */ - runnable = shares * gcfs_rq->avg.runnable_load_avg; - if (runnable) - runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg); + shares = calc_group_shares(gcfs_rq); + runnable = calc_group_runnable(gcfs_rq, shares); #endif reweight_entity(cfs_rq_of(se), se, shares, runnable); From 17de4ee04ca925590df036b112c1db8a778e14bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Aug 2017 13:06:35 +0200 Subject: [PATCH 0237/1085] sched/fair: Update calc_group_*() comments I had a wee bit of trouble recalling how the calc_group_runnable() stuff worked.. add hopefully better comments. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a62098ec9deb..350dbec01523 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2860,7 +2860,7 @@ void reweight_task(struct task_struct *p, int prio) * Now, in that special case (1) reduces to: * * tg->weight * grq->load.weight - * ge->load.weight = ----------------------------- = tg>weight (4) + * ge->load.weight = ----------------------------- = tg->weight (4) * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. @@ -2874,6 +2874,18 @@ void reweight_task(struct task_struct *p, int prio) * --------------------------------------------------- (5) * tg->load_avg - grq->avg.load_avg + grq->load.weight * + * But because grq->load.weight can drop to 0, resulting in a divide by zero, + * we need to use grq->avg.load_avg as its lower bound, which then gives: + * + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- (6) + * tg_load_avg' + * + * Where: + * + * tg_load_avg' = tg->load_avg - grq->avg.load_avg + + * max(grq->load.weight, grq->avg.load_avg) * * And that is shares_weight and is icky. 
In the (near) UP case it approaches * (4) while in the normal case it approaches (3). It consistently @@ -2890,10 +2902,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) tg_shares = READ_ONCE(tg->shares); - /* - * Because (5) drops to 0 when the cfs_rq is idle, we need to use (3) - * as a lower bound. - */ load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); tg_weight = atomic_long_read(&tg->load_avg); @@ -2922,32 +2930,46 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) } /* - * The runnable shares of this group are calculated as such + * This calculates the effective runnable weight for a group entity based on + * the group entity weight calculated above. * - * max(cfs_rq->avg.runnable_load_avg, cfs_rq->runnable_weight) - * shares * ------------------------------------------------------------ - * max(cfs_rq->avg.load_avg, cfs_rq->load.weight) + * Because of the above approximation (2), our group entity weight is + * an load_avg based ratio (3). This means that it includes blocked load and + * does not represent the runnable weight. * - * We do this to keep the shares in line with expected load on the cfs_rq. - * Consider a cfs_rq that has several tasks wake up on this cfs_rq for the first - * time, it's runnable_load_avg is not going to be representative of the actual - * load this cfs_rq will now experience, which will bias us agaisnt this cfs_rq. - * The weight on the cfs_rq is the immediate effect of having new tasks - * enqueue'd onto it which should be used to calculate the new runnable shares. - * At the same time we need the actual load_avg to be the lower bounds for the - * calculation, to handle when our weight drops quickly from having entities - * dequeued. + * Approximate the group entity's runnable weight per ratio from the group + * runqueue: + * + * grq->avg.runnable_load_avg + * ge->runnable_weight = ge->load.weight * -------------------------- (7) + * grq->avg.load_avg + * + * However, analogous to above, since the avg numbers are slow, this leads to + * transients in the from-idle case. Instead we use: + * + * ge->runnable_weight = ge->load.weight * + * + * max(grq->avg.runnable_load_avg, grq->runnable_weight) + * ----------------------------------------------------- (8) + * max(grq->avg.load_avg, grq->load.weight) + * + * Where these max() serve both to use the 'instant' values to fix the slow + * from-idle and avoid the /0 on to-idle, similar to (6). */ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) { - long load_avg = max(cfs_rq->avg.load_avg, - scale_load_down(cfs_rq->load.weight)); - long runnable = max(cfs_rq->avg.runnable_load_avg, - scale_load_down(cfs_rq->runnable_weight)); + long runnable, load_avg; + + load_avg = max(cfs_rq->avg.load_avg, + scale_load_down(cfs_rq->load.weight)); + + runnable = max(cfs_rq->avg.runnable_load_avg, + scale_load_down(cfs_rq->runnable_weight)); runnable *= shares; if (load_avg) runnable /= load_avg; + return clamp_t(long, runnable, MIN_SHARES, shares); } # endif /* CONFIG_SMP */ From 77072f09eab19326dd2424c8dad0a443341a228f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 29 Sep 2017 11:23:35 +0200 Subject: [PATCH 0238/1085] x86/stacktrace: Avoid recording save_stack_trace() wrappers The save_stack_trace() and save_stack_trace_tsk() wrappers of __save_stack_trace() add themselves to the call stack, and thus appear in the recorded stacktraces. This is redundant and wasteful when we have limited space to record the useful part of the backtrace with e.g. 
page_owner functionality. Fix this by making sure __save_stack_trace() is noinline (which matches the current gcc decision) and bumping the skip in the wrappers (save_stack_trace_tsk() only when called for the current task). This is similar to what was done for arm in 3683f44c42e9 ("ARM: stacktrace: avoid listing stacktrace functions in stacktrace") and is pending for arm64. Also make sure that __save_stack_trace_reliable() doesn't get this problem in the future by marking it __always_inline (which matches current gcc decision), per Josh Poimboeuf. Signed-off-by: Vlastimil Babka Acked-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Miroslav Benes Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929092335.2744-1-vbabka@suse.cz Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8dabd7bf1673..77835bc021c7 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -30,7 +30,7 @@ static int save_stack_address(struct stack_trace *trace, unsigned long addr, return 0; } -static void __save_stack_trace(struct stack_trace *trace, +static void noinline __save_stack_trace(struct stack_trace *trace, struct task_struct *task, struct pt_regs *regs, bool nosched) { @@ -56,6 +56,7 @@ static void __save_stack_trace(struct stack_trace *trace, */ void save_stack_trace(struct stack_trace *trace) { + trace->skip++; __save_stack_trace(trace, current, NULL, false); } EXPORT_SYMBOL_GPL(save_stack_trace); @@ -70,6 +71,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (!try_get_task_stack(tsk)) return; + if (tsk == current) + trace->skip++; __save_stack_trace(trace, tsk, NULL, true); put_task_stack(tsk); @@ -88,8 +91,9 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); } \ }) -static int __save_stack_trace_reliable(struct stack_trace *trace, - struct task_struct *task) +static int __always_inline +__save_stack_trace_reliable(struct stack_trace *trace, + struct task_struct *task) { struct unwind_state state; struct pt_regs *regs; From ddde0e7d950d912c8a75adfcd89134cca21b389b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 26 Sep 2017 15:55:46 +0100 Subject: [PATCH 0239/1085] mmc: sdhci-omap: make three functions static, fixes warnings The functions sdhci_omap_set_ios, sdhci_omap_set_power and sdhci_omap_get_min_clock are local to the source and do not need to be in global scope, so make them static. Cleans up sparse warnings: symbol 'sdhci_omap_set_ios' was not declared. Should it be static? symbol 'sdhci_omap_set_power' was not declared. Should it be static? symbol 'sdhci_omap_get_min_clock' was not declared. Should it be static? 
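[Editor's minimal illustration of this class of sparse warning, using a hypothetical helper(): given void helper(void) { } with external linkage and no declaration in any header, sparse reports "symbol 'helper' was not declared. Should it be static?"; writing static void helper(void) { } gives the function file-local linkage and silences the warning.]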
Signed-off-by: Colin Ian King Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-omap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c index 5ddae39816b7..628bfe9a3d17 100644 --- a/drivers/mmc/host/sdhci-omap.c +++ b/drivers/mmc/host/sdhci-omap.c @@ -262,7 +262,7 @@ static void sdhci_omap_set_bus_mode(struct sdhci_omap_host *omap_host, omap_host->bus_mode = mode; } -void sdhci_omap_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) +static void sdhci_omap_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) { struct sdhci_host *host = mmc_priv(mmc); struct sdhci_pltfm_host *pltfm_host; @@ -323,7 +323,7 @@ static void sdhci_omap_set_clock(struct sdhci_host *host, unsigned int clock) sdhci_omap_start_clock(omap_host); } -void sdhci_omap_set_power(struct sdhci_host *host, unsigned char mode, +static void sdhci_omap_set_power(struct sdhci_host *host, unsigned char mode, unsigned short vdd) { struct mmc_host *mmc = host->mmc; @@ -344,7 +344,7 @@ static int sdhci_omap_enable_dma(struct sdhci_host *host) return 0; } -unsigned int sdhci_omap_get_min_clock(struct sdhci_host *host) +static unsigned int sdhci_omap_get_min_clock(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); From 10836d9f9ac63d40ccfa756f871ce4ed51ae3b52 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 3 Jul 2017 16:50:30 +0200 Subject: [PATCH 0240/1085] perf tests attr: Fix task term values The perf_event_attr::task is 1 by default for the first (tracking) event in the session. Set task=1 as the default and add task=0 for the cases that need it. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas-Mich Richter Link: http://lkml.kernel.org/r/20170703145030.12903-16-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr/base-record | 2 +- tools/perf/tests/attr/test-record-group | 1 + tools/perf/tests/attr/test-record-group-sampling | 2 +- tools/perf/tests/attr/test-record-group1 | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index 31e0b1da830b..37940665f736 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -23,7 +23,7 @@ comm=1 freq=1 inherit_stat=0 enable_on_exec=1 -task=0 +task=1 watermark=0 precise_ip=0|1|2|3 mmap_data=0 diff --git a/tools/perf/tests/attr/test-record-group b/tools/perf/tests/attr/test-record-group index 6e7961f6f7a5..618ba1c17474 100644 --- a/tools/perf/tests/attr/test-record-group +++ b/tools/perf/tests/attr/test-record-group @@ -17,5 +17,6 @@ sample_type=327 read_format=4 mmap=0 comm=0 +task=0 enable_on_exec=0 disabled=0 diff --git a/tools/perf/tests/attr/test-record-group-sampling b/tools/perf/tests/attr/test-record-group-sampling index ef59afd6d635..f906b793196f 100644 --- a/tools/perf/tests/attr/test-record-group-sampling +++ b/tools/perf/tests/attr/test-record-group-sampling @@ -23,7 +23,7 @@ sample_type=343 # PERF_FORMAT_ID | PERF_FORMAT_GROUP read_format=12 - +task=0 mmap=0 comm=0 enable_on_exec=0 diff --git a/tools/perf/tests/attr/test-record-group1 b/tools/perf/tests/attr/test-record-group1 index 87a222d014d8..48e8bd12fe46 100644 --- a/tools/perf/tests/attr/test-record-group1 +++ b/tools/perf/tests/attr/test-record-group1 @@ -18,5 +18,6 @@ sample_type=327 read_format=4 mmap=0 comm=0 +task=0 enable_on_exec=0 disabled=0 From 3440fe2790aa3d13530260af6033533b18959aee Mon Sep 17 00:00:00 2001 From:
Thomas Richter Date: Wed, 13 Sep 2017 10:12:08 +0200 Subject: [PATCH 0241/1085] perf test attr: Fix python error on empty result Commit d78ada4a767 ("perf tests attr: Do not store failed events") does not create an event file in the /tmp directory when the perf_open_event() system call fails. This can lead to a situation where no /tmp/event-xx-yy-zz result file exists at all (for example in an s390x virtual machine environment where no CPUMF hardware is available). The following command then fails with a Python traceback instead of printing a failure: [root@s8360046 perf]# /usr/bin/python2 ./tests/attr.py -d ./tests/attr/ \ -p ./perf -v -ttest-stat-basic running './tests/attr//test-stat-basic' Traceback (most recent call last): File "./tests/attr.py", line 379, in main() File "./tests/attr.py", line 370, in main run_tests(options) File "./tests/attr.py", line 311, in run_tests Test(f, options).run() File "./tests/attr.py", line 300, in run self.compare(self.expect, self.result) File "./tests/attr.py", line 248, in compare exp_event.diff(res_event) UnboundLocalError: local variable 'res_event' referenced before assignment [root@s8360046 perf]# This patch catches this pitfall and prints an error message instead: [root@s8360047 perf]# /usr/bin/python2 ./tests/attr.py -d ./tests/attr/ \ -p ./perf -vvv -ttest-stat-basic running './tests/attr//test-stat-basic' loading expected events Event event:base-stat fd = 1 group_fd = -1 flags = 0|8 [....] sample_regs_user = 0 sample_stack_user = 0 'PERF_TEST_ATTR=/tmp/tmpJbMQMP ./perf stat -o /tmp/tmpJbMQMP/perf.data -e cycles kill >/dev/null 2>&1' ret '1', expected '1' loading result events compare matching [event:base-stat] match: [event:base-stat] matches [] res_event is empty FAILED './tests/attr//test-stat-basic' - match failure [root@s8360047 perf]# Signed-off-by: Thomas-Mich Richter Acked-by: Jiri Olsa Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Martin Schwidefsky Cc: Thomas-Mich Richter LPU-Reference: 20170913081209.39570-1-tmricht@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-04d63nn7svfgxdhi60gq2mlm@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/attr.py b/tools/perf/tests/attr.py index 6bb50e82a3e3..a13cd780148e 100644 --- a/tools/perf/tests/attr.py +++ b/tools/perf/tests/attr.py @@ -237,6 +237,7 @@ class Test(object): # events in result. Fail if there's not any. for exp_name, exp_event in expect.items(): exp_list = [] + res_event = {} log.debug(" matching [%s]" % exp_name) for res_name, res_event in result.items(): log.debug(" to [%s]" % res_name) @@ -253,7 +254,10 @@ class Test(object): if exp_event.optional(): log.debug(" %s does not match, but is optional" % exp_name) else: - exp_event.diff(res_event) + if not res_event: + log.debug(" res_event is empty"); + else: + exp_event.diff(res_event) raise Fail(self, 'match failure'); match[exp_name] = exp_list From 22905582f6dd4bbd0c370fe5732c607452010c04 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 13 Sep 2017 10:12:09 +0200 Subject: [PATCH 0242/1085] perf test attr: Fix ignored test case result Command perf test -v 16 (Setup struct perf_event_attr test) always reports success even if the test case fails. It works correctly if you also specify -F (for don't fork).
root@s35lp76 perf]# ./perf test -v 16 15: Setup struct perf_event_attr : --- start --- running './tests/attr/test-record-no-delay' [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.002 MB /tmp/tmp4E1h7R/perf.data (1 samples) ] expected task=0, got 1 expected precise_ip=0, got 3 expected wakeup_events=1, got 0 FAILED './tests/attr/test-record-no-delay' - match failure test child finished with 0 ---- end ---- Setup struct perf_event_attr: Ok The reason for the wrong error reporting is the return value of the system() library call. It is called in run_dir() in file tests/attr.c and returns the exit status, in the above case 0xff00. This value is given as a parameter to the exit() function, which can only handle values 0-0xff. The child process therefore terminates with an exit value of 0 and the parent does not detect any error. This patch corrects the error reporting and prints the correct test result. Signed-off-by: Thomas-Mich Richter Acked-by: Jiri Olsa Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Martin Schwidefsky Cc: Thomas-Mich Richter LPU-Reference: 20170913081209.39570-2-tmricht@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-rdube6rfcjsr1nzue72c7lqn@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index c9aafed7da15..25ede4472465 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -166,7 +166,7 @@ static int run_dir(const char *d, const char *perf) snprintf(cmd, 3*PATH_MAX, PYTHON " %s/attr.py -d %s/attr/ -p %s %.*s", d, d, perf, vcnt, v); - return system(cmd); + return system(cmd) ? TEST_FAIL : TEST_OK; } int test__attr(struct test *test __maybe_unused, int subtest __maybe_unused) From b32ee9e522f7ba26339856a047cfe9efc0be0ff3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 29 Sep 2017 07:47:52 -0700 Subject: [PATCH 0243/1085] perf tools: Lock to protect namespaces and comm list Add two locks to protect namespaces_list and comm_list. The locks are only needed for multithreaded code, so the mutex wrappers provided by the perf tool are used. Not all comm_list/namespaces_list accesses are protected, e.g. thread__exec_comm, because the multithreaded code for perf top event synthesizing does not touch them; they don't need a lock.
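[Editor's note on one detail of the diff below: thread__comm_str() takes a const struct thread *, so its read side has to cast away constness to take the lock, as in down_read((struct rw_semaphore *)&thread->comm_lock); str = __thread__comm_str(thread); up_read((struct rw_semaphore *)&thread->comm_lock); while the writers (__thread__set_comm() and thread__delete()) take down_write() on the same rw_semaphore.]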
Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Andi Kleen Cc: He Kuang Cc: Lukasz Odzioba Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1506696477-146932-2-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread.c | 53 ++++++++++++++++++++++++++++++++++++---- tools/perf/util/thread.h | 3 +++ 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index c09bdb509d82..bf73117b4822 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -45,6 +45,8 @@ struct thread *thread__new(pid_t pid, pid_t tid) thread->cpu = -1; INIT_LIST_HEAD(&thread->namespaces_list); INIT_LIST_HEAD(&thread->comm_list); + init_rwsem(&thread->namespaces_lock); + init_rwsem(&thread->comm_lock); comm_str = malloc(32); if (!comm_str) @@ -83,18 +85,26 @@ void thread__delete(struct thread *thread) map_groups__put(thread->mg); thread->mg = NULL; } + down_write(&thread->namespaces_lock); list_for_each_entry_safe(namespaces, tmp_namespaces, &thread->namespaces_list, list) { list_del(&namespaces->list); namespaces__free(namespaces); } + up_write(&thread->namespaces_lock); + + down_write(&thread->comm_lock); list_for_each_entry_safe(comm, tmp_comm, &thread->comm_list, list) { list_del(&comm->list); comm__free(comm); } + up_write(&thread->comm_lock); + unwind__finish_access(thread); nsinfo__zput(thread->nsinfo); + exit_rwsem(&thread->namespaces_lock); + exit_rwsem(&thread->comm_lock); free(thread); } @@ -125,8 +135,8 @@ struct namespaces *thread__namespaces(const struct thread *thread) return list_first_entry(&thread->namespaces_list, struct namespaces, list); } -int thread__set_namespaces(struct thread *thread, u64 timestamp, - struct namespaces_event *event) +static int __thread__set_namespaces(struct thread *thread, u64 timestamp, + struct namespaces_event *event) { struct namespaces *new, *curr = thread__namespaces(thread); @@ -149,6 +159,17 @@ int thread__set_namespaces(struct thread *thread, u64 timestamp, return 0; } +int thread__set_namespaces(struct thread *thread, u64 timestamp, + struct namespaces_event *event) +{ + int ret; + + down_write(&thread->namespaces_lock); + ret = __thread__set_namespaces(thread, timestamp, event); + up_write(&thread->namespaces_lock); + return ret; +} + struct comm *thread__comm(const struct thread *thread) { if (list_empty(&thread->comm_list)) @@ -170,8 +191,8 @@ struct comm *thread__exec_comm(const struct thread *thread) return last; } -int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp, - bool exec) +static int ____thread__set_comm(struct thread *thread, const char *str, + u64 timestamp, bool exec) { struct comm *new, *curr = thread__comm(thread); @@ -195,6 +216,17 @@ int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp, return 0; } +int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp, + bool exec) +{ + int ret; + + down_write(&thread->comm_lock); + ret = ____thread__set_comm(thread, str, timestamp, exec); + up_write(&thread->comm_lock); + return ret; +} + int thread__set_comm_from_proc(struct thread *thread) { char path[64]; @@ -212,7 +244,7 @@ int thread__set_comm_from_proc(struct thread *thread) return err; } -const char *thread__comm_str(const struct thread *thread) +static const char *__thread__comm_str(const struct thread *thread) { const struct comm *comm = thread__comm(thread); @@ -222,6 +254,17 @@ const char 
*thread__comm_str(const struct thread *thread) return comm__str(comm); } +const char *thread__comm_str(const struct thread *thread) +{ + const char *str; + + down_read((struct rw_semaphore *)&thread->comm_lock); + str = __thread__comm_str(thread); + up_read((struct rw_semaphore *)&thread->comm_lock); + + return str; +} + /* CHECKME: it should probably better return the max comm len from its comm list */ int thread__comm_len(struct thread *thread) { diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index cb1a5dd5c2b9..10555d6a0b86 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -9,6 +9,7 @@ #include "symbol.h" #include #include +#include "rwsem.h" struct thread_stack; struct unwind_libunwind_ops; @@ -29,7 +30,9 @@ struct thread { int comm_len; bool dead; /* if set thread has exited */ struct list_head namespaces_list; + struct rw_semaphore namespaces_lock; struct list_head comm_list; + struct rw_semaphore comm_lock; u64 db_id; void *priv; From f988e71bc6220d8b404dbd43c0e0962e30305795 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 29 Sep 2017 07:47:53 -0700 Subject: [PATCH 0244/1085] perf tools: Lock to protect comm_str rb tree Add comm_str_lock to protect comm_str rb tree. The lock is only needed for multithreaded code, so using mutex wrappers provided by perf tool. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Andi Kleen Cc: He Kuang Cc: Lukasz Odzioba Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1506696477-146932-3-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/comm.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c index 7bc981b6bf29..756a9c14efbb 100644 --- a/tools/perf/util/comm.c +++ b/tools/perf/util/comm.c @@ -5,6 +5,7 @@ #include #include #include +#include "rwsem.h" struct comm_str { char *str; @@ -14,6 +15,7 @@ struct comm_str { /* Should perhaps be moved to struct machine */ static struct rb_root comm_str_root; +static struct rw_semaphore comm_str_lock = {.lock = PTHREAD_RWLOCK_INITIALIZER,}; static struct comm_str *comm_str__get(struct comm_str *cs) { @@ -25,7 +27,9 @@ static struct comm_str *comm_str__get(struct comm_str *cs) static void comm_str__put(struct comm_str *cs) { if (cs && refcount_dec_and_test(&cs->refcnt)) { + down_write(&comm_str_lock); rb_erase(&cs->rb_node, &comm_str_root); + up_write(&comm_str_lock); zfree(&cs->str); free(cs); } @@ -50,7 +54,8 @@ static struct comm_str *comm_str__alloc(const char *str) return cs; } -static struct comm_str *comm_str__findnew(const char *str, struct rb_root *root) +static +struct comm_str *__comm_str__findnew(const char *str, struct rb_root *root) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -81,6 +86,17 @@ static struct comm_str *comm_str__findnew(const char *str, struct rb_root *root) return new; } +static struct comm_str *comm_str__findnew(const char *str, struct rb_root *root) +{ + struct comm_str *cs; + + down_write(&comm_str_lock); + cs = __comm_str__findnew(str, root); + up_write(&comm_str_lock); + + return cs; +} + struct comm *comm__new(const char *str, u64 timestamp, bool exec) { struct comm *comm = zalloc(sizeof(*comm)); From 340b47f510bbe55a76b7309107276f02ea11f117 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 29 Sep 2017 07:47:54 -0700 Subject: [PATCH 0245/1085] perf top: Implement multithreading for perf_event__synthesize_threads 
The proc files, which are sorted in alphabetical order, are evenly assigned to several synthesize threads to be processed in parallel. For 'perf top', the thread number is hard-coded to the number of online CPUs. The following patch will introduce an option to set it. For other perf tools, the thread number is 1, because the process function is not ready for multithreading, e.g. process_synthesized_event. This patch series only supports event synthesize multithreading for 'perf top'. For other tools, it can be done separately later. With multithreading applied, the total processing time can get up to a 1.56x speedup on Knights Mill for 'perf top'. For specific single event processing, the processing time could increase because of lock contention, so proc_map_timeout may need to be increased; otherwise some proc maps will be truncated. Based on my test, increasing the proc_map_timeout has a small impact on the total processing time: it still gets a 1.49x speedup on Knights Mill after increasing the proc_map_timeout. The patch itself doesn't increase the proc_map_timeout. There is no need to implement multithreading for per-task monitoring (perf_event__synthesize_thread_map); it doesn't have a performance issue. Committer testing: # getconf _NPROCESSORS_ONLN 4 # perf trace --no-inherit -e clone -o /tmp/output perf top # tail -4 /tmp/output 0.124 ( 0.041 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eb3a8f30, parent_tidptr: 0x7fc3eb3a99d0, child_tidptr: 0x7fc3eb3a99d0, tls: 0x7fc3eb3a9700) = 9548 (perf) 0.246 ( 0.023 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eaba7f30, parent_tidptr: 0x7fc3eaba89d0, child_tidptr: 0x7fc3eaba89d0, tls: 0x7fc3eaba8700) = 9549 (perf) 0.286 ( 0.019 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9550 (perf) 246.540 ( 0.047 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9551 (perf) # Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Andi Kleen Cc: He Kuang Cc: Lukasz Odzioba Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1506696477-146932-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 3 +- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-top.c | 8 +- tools/perf/builtin-trace.c | 2 +- tools/perf/tests/mmap-thread-lookup.c | 2 +- tools/perf/util/event.c | 160 +++++++++++++++++++++----- tools/perf/util/event.h | 3 +- tools/perf/util/machine.c | 8 +- tools/perf/util/machine.h | 9 +- 9 files changed, 155 insertions(+), 42 deletions(-) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index c747a1af49fe..721f4f91291a 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1441,7 +1441,8 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, perf_session__set_id_hdr_size(kvm->session); ordered_events__set_copy_on_queue(&kvm->session->ordered_events, true); machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target, kvm->evlist->threads, false,
kvm->opts.proc_map_timeout, 1); err = kvm_live_open_events(kvm); if (err) goto out; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 9b379f3a3d99..234fdf4734f6 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -863,7 +863,7 @@ static int record__synthesize(struct record *rec, bool tail) err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads, process_synthesized_event, opts->sample_address, - opts->proc_map_timeout); + opts->proc_map_timeout, 1); out: return err; } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index ee954bde7e3e..bc31b93cc1d8 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -958,8 +958,14 @@ static int __cmd_top(struct perf_top *top) if (perf_session__register_idle_thread(top->session) < 0) goto out_delete; + perf_set_multithreaded(); + machine__synthesize_threads(&top->session->machines.host, &opts->target, - top->evlist->threads, false, opts->proc_map_timeout); + top->evlist->threads, false, + opts->proc_map_timeout, + (unsigned int)sysconf(_SC_NPROCESSORS_ONLN)); + + perf_set_singlethreaded(); if (perf_hpp_list.socket) { ret = perf_env__read_cpu_topology_map(&perf_env); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 967bd351b58d..afef6fe46c45 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1131,7 +1131,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, evlist->threads, trace__tool_process, false, - trace->opts.proc_map_timeout); + trace->opts.proc_map_timeout, 1); if (err) symbol__exit(); diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index f94a4196e7c9..2a0068afe3bf 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -131,7 +131,7 @@ static int synth_all(struct machine *machine) { return perf_event__synthesize_threads(NULL, perf_event__process, - machine, 0, 500); + machine, 0, 500, 1); } static int synth_process(struct machine *machine) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 10366b87d0b5..0e678dd6bdbe 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -678,23 +678,21 @@ out: return err; } -int perf_event__synthesize_threads(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine, - bool mmap_data, - unsigned int proc_map_timeout) +static int __perf_event__synthesize_threads(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine, + bool mmap_data, + unsigned int proc_map_timeout, + struct dirent **dirent, + int start, + int num) { union perf_event *comm_event, *mmap_event, *fork_event; union perf_event *namespaces_event; - char proc_path[PATH_MAX]; - struct dirent **dirent; int err = -1; char *end; pid_t pid; - int n, i; - - if (machine__is_default_guest(machine)) - return 0; + int i; comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); if (comm_event == NULL) @@ -714,34 +712,25 @@ int perf_event__synthesize_threads(struct perf_tool *tool, if (namespaces_event == NULL) goto out_free_fork; - snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir); - n = scandir(proc_path, &dirent, 0, alphasort); - - if (n < 0) - goto out_free_namespaces; - - for (i = 0; i < n; i++) { + for (i = start; i < start + num; i++) { if (!isdigit(dirent[i]->d_name[0])) 
continue; pid = (pid_t)strtol(dirent[i]->d_name, &end, 10); /* only interested in proper numerical dirents */ - if (!*end) { - /* - * We may race with exiting thread, so don't stop just because - * one thread couldn't be synthesized. - */ - __event__synthesize_thread(comm_event, mmap_event, fork_event, - namespaces_event, pid, 1, process, - tool, machine, mmap_data, - proc_map_timeout); - } - free(dirent[i]); + if (*end) + continue; + /* + * We may race with exiting thread, so don't stop just because + * one thread couldn't be synthesized. + */ + __event__synthesize_thread(comm_event, mmap_event, fork_event, + namespaces_event, pid, 1, process, + tool, machine, mmap_data, + proc_map_timeout); } - free(dirent); err = 0; -out_free_namespaces: free(namespaces_event); out_free_fork: free(fork_event); @@ -753,6 +742,115 @@ out: return err; } +struct synthesize_threads_arg { + struct perf_tool *tool; + perf_event__handler_t process; + struct machine *machine; + bool mmap_data; + unsigned int proc_map_timeout; + struct dirent **dirent; + int num; + int start; +}; + +static void *synthesize_threads_worker(void *arg) +{ + struct synthesize_threads_arg *args = arg; + + __perf_event__synthesize_threads(args->tool, args->process, + args->machine, args->mmap_data, + args->proc_map_timeout, args->dirent, + args->start, args->num); + return NULL; +} + +int perf_event__synthesize_threads(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine, + bool mmap_data, + unsigned int proc_map_timeout, + unsigned int nr_threads_synthesize) +{ + struct synthesize_threads_arg *args = NULL; + pthread_t *synthesize_threads = NULL; + char proc_path[PATH_MAX]; + struct dirent **dirent; + int num_per_thread; + int m, n, i, j; + int thread_nr; + int base = 0; + int err = -1; + + + if (machine__is_default_guest(machine)) + return 0; + + snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir); + n = scandir(proc_path, &dirent, 0, alphasort); + if (n < 0) + return err; + + thread_nr = nr_threads_synthesize; + + if (thread_nr <= 1) { + err = __perf_event__synthesize_threads(tool, process, + machine, mmap_data, + proc_map_timeout, + dirent, base, n); + goto free_dirent; + } + if (thread_nr > n) + thread_nr = n; + + synthesize_threads = calloc(sizeof(pthread_t), thread_nr); + if (synthesize_threads == NULL) + goto free_dirent; + + args = calloc(sizeof(*args), thread_nr); + if (args == NULL) + goto free_threads; + + num_per_thread = n / thread_nr; + m = n % thread_nr; + for (i = 0; i < thread_nr; i++) { + args[i].tool = tool; + args[i].process = process; + args[i].machine = machine; + args[i].mmap_data = mmap_data; + args[i].proc_map_timeout = proc_map_timeout; + args[i].dirent = dirent; + } + for (i = 0; i < m; i++) { + args[i].num = num_per_thread + 1; + args[i].start = i * args[i].num; + } + if (i != 0) + base = args[i-1].start + args[i-1].num; + for (j = i; j < thread_nr; j++) { + args[j].num = num_per_thread; + args[j].start = base + (j - i) * args[i].num; + } + + for (i = 0; i < thread_nr; i++) { + if (pthread_create(&synthesize_threads[i], NULL, + synthesize_threads_worker, &args[i])) + goto out_join; + } + err = 0; +out_join: + for (i = 0; i < thread_nr; i++) + pthread_join(synthesize_threads[i], NULL); + free(args); +free_threads: + free(synthesize_threads); +free_dirent: + for (i = 0; i < n; i++) + free(dirent[i]); + free(dirent); + + return err; +} + struct process_symbol_args { const char *name; u64 start; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 
ee7bcc898d35..d6cbb0a0d919 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -680,7 +680,8 @@ int perf_event__synthesize_cpu_map(struct perf_tool *tool, int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, - unsigned int proc_map_timeout); + unsigned int proc_map_timeout, + unsigned int nr_threads_synthesize); int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 585b4a3d64a4..7c3aa479201a 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2218,12 +2218,16 @@ int machines__for_each_thread(struct machines *machines, int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, struct target *target, struct thread_map *threads, perf_event__handler_t process, bool data_mmap, - unsigned int proc_map_timeout) + unsigned int proc_map_timeout, + unsigned int nr_threads_synthesize) { if (target__has_task(target)) return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap, proc_map_timeout); else if (target__has_cpu(target)) - return perf_event__synthesize_threads(tool, process, machine, data_mmap, proc_map_timeout); + return perf_event__synthesize_threads(tool, process, + machine, data_mmap, + proc_map_timeout, + nr_threads_synthesize); /* command specified */ return 0; } diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index b1cd516f2025..c6a299ea506c 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -257,15 +257,18 @@ int machines__for_each_thread(struct machines *machines, int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, struct target *target, struct thread_map *threads, perf_event__handler_t process, bool data_mmap, - unsigned int proc_map_timeout); + unsigned int proc_map_timeout, + unsigned int nr_threads_synthesize); static inline int machine__synthesize_threads(struct machine *machine, struct target *target, struct thread_map *threads, bool data_mmap, - unsigned int proc_map_timeout) + unsigned int proc_map_timeout, + unsigned int nr_threads_synthesize) { return __machine__synthesize_threads(machine, NULL, target, threads, perf_event__process, data_mmap, - proc_map_timeout); + proc_map_timeout, + nr_threads_synthesize); } pid_t machine__get_current_tid(struct machine *machine, int cpu); From 0c6b499495e928777c41ca2de4fbb58788269690 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 29 Sep 2017 07:47:55 -0700 Subject: [PATCH 0246/1085] perf top: Add option to set the number of thread for event synthesize Using UINT_MAX to indicate the default thread#, which is the maximum number of online CPUs. Committer testing: # perf trace --no-inherit -e clone -o /tmp/output perf top --num-thread-synthesize 9 # cat /tmp/output ? ( ? ): ...
[continued]: clone()) = 26651 (perf) 0.059 ( 0.010 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5bfac44f30, parent_tidptr: 0x7f5bfac459d0, child_tidptr: 0x7f5bfac459d0, tls: 0x7f5bfac45700) = 26652 (perf) 0.116 ( 0.014 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5bfa443f30, parent_tidptr: 0x7f5bfa4449d0, child_tidptr: 0x7f5bfa4449d0, tls: 0x7f5bfa444700) = 26653 (perf) 0.141 ( 0.009 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5bf9c42f30, parent_tidptr: 0x7f5bf9c439d0, child_tidptr: 0x7f5bf9c439d0, tls: 0x7f5bf9c43700) = 26654 (perf) 0.160 ( 0.012 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5bf9441f30, parent_tidptr: 0x7f5bf94429d0, child_tidptr: 0x7f5bf94429d0, tls: 0x7f5bf9442700) = 26655 (perf) 0.232 ( 0.013 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5bf8c40f30, parent_tidptr: 0x7f5bf8c419d0, child_tidptr: 0x7f5bf8c419d0, tls: 0x7f5bf8c41700) = 26656 (perf) 0.393 ( 0.011 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5be3ffef30, parent_tidptr: 0x7f5be3fff9d0, child_tidptr: 0x7f5be3fff9d0, tls: 0x7f5be3fff700) = 26657 (perf) 0.802 ( 0.012 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5be37fdf30, parent_tidptr: 0x7f5be37fe9d0, child_tidptr: 0x7f5be37fe9d0, tls: 0x7f5be37fe700) = 26658 (perf) 1.411 ( 0.022 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5be2ffcf30, parent_tidptr: 0x7f5be2ffd9d0, child_tidptr: 0x7f5be2ffd9d0, tls: 0x7f5be2ffd700) = 26659 (perf) 246.422 ( 0.042 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7f5be2ffcf30, parent_tidptr: 0x7f5be2ffd9d0, child_tidptr: 0x7f5be2ffd9d0, tls: 0x7f5be2ffd700) = 26660 (perf) # Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Andi Kleen Cc: He Kuang Cc: Lukasz Odzioba Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1506696477-146932-5-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-top.txt | 3 +++ tools/perf/builtin-top.c | 11 ++++++++--- tools/perf/util/event.c | 5 ++++- tools/perf/util/top.h | 1 + 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index d864ea6fd367..4353262bc462 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -240,6 +240,9 @@ Default is to monitor all CPUS. --force:: Don't do ownership validation. +--num-thread-synthesize:: + The number of threads to run when synthesizing events for existing processes. + By default, the number of threads equals to the number of online CPUs. 
INTERACTIVE PROMPTING KEYS -------------------------- diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index bc31b93cc1d8..477a8699f0b5 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -958,14 +958,16 @@ static int __cmd_top(struct perf_top *top) if (perf_session__register_idle_thread(top->session) < 0) goto out_delete; - perf_set_multithreaded(); + if (top->nr_threads_synthesize > 1) + perf_set_multithreaded(); machine__synthesize_threads(&top->session->machines.host, &opts->target, top->evlist->threads, false, opts->proc_map_timeout, - (unsigned int)sysconf(_SC_NPROCESSORS_ONLN)); + top->nr_threads_synthesize); - perf_set_singlethreaded(); + if (top->nr_threads_synthesize > 1) + perf_set_singlethreaded(); if (perf_hpp_list.socket) { ret = perf_env__read_cpu_topology_map(&perf_env); @@ -1118,6 +1120,7 @@ int cmd_top(int argc, const char **argv) }, .max_stack = sysctl_perf_event_max_stack, .sym_pcnt_filter = 5, + .nr_threads_synthesize = UINT_MAX, }; struct record_opts *opts = &top.record_opts; struct target *target = &opts->target; @@ -1227,6 +1230,8 @@ int cmd_top(int argc, const char **argv) OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy, "Show entries in a hierarchy"), OPT_BOOLEAN(0, "force", &symbol_conf.force, "don't complain, do it"), + OPT_UINTEGER(0, "num-thread-synthesize", &top.nr_threads_synthesize, + "number of thread to run event synthesize"), OPT_END() }; const char * const top_usage[] = { diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 0e678dd6bdbe..47eff4767edb 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -790,7 +790,10 @@ int perf_event__synthesize_threads(struct perf_tool *tool, if (n < 0) return err; - thread_nr = nr_threads_synthesize; + if (nr_threads_synthesize == UINT_MAX) + thread_nr = sysconf(_SC_NPROCESSORS_ONLN); + else + thread_nr = nr_threads_synthesize; if (thread_nr <= 1) { err = __perf_event__synthesize_threads(tool, process, diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 9bdfb78a9a35..f4296e1e3bb8 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -37,6 +37,7 @@ struct perf_top { int sym_pcnt_filter; const char *sym_filter; float min_percent; + unsigned int nr_threads_synthesize; }; #define CONSOLE_CLEAR "" From f6a9820d572bd8384d982357cbad214b3a6c04bb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 28 Sep 2017 18:06:33 +0200 Subject: [PATCH 0247/1085] perf tests attr: Fix group stat tests We started to use group read whenever it's possible: 82bf311e15d2 perf stat: Use group read for event groups That breaks some of the attr tests; this change adds the new possible read_format value.
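For context, a minimal sketch of where the two accepted values come from, using the read_format bits defined in include/uapi/linux/perf_event.h (the "3|15" syntax in the test files simply lists both acceptable values; this standalone program is an editorial illustration, not part of the patch):

#include <stdio.h>

/* read_format bits, as defined in include/uapi/linux/perf_event.h */
#define PERF_FORMAT_TOTAL_TIME_ENABLED	(1U << 0)
#define PERF_FORMAT_TOTAL_TIME_RUNNING	(1U << 1)
#define PERF_FORMAT_ID			(1U << 2)
#define PERF_FORMAT_GROUP		(1U << 3)

int main(void)
{
	/* Non-group read: time-enabled and time-running only. */
	unsigned int single = PERF_FORMAT_TOTAL_TIME_ENABLED |
			      PERF_FORMAT_TOTAL_TIME_RUNNING;
	/* Group read additionally carries the event ID and group layout. */
	unsigned int group = single | PERF_FORMAT_ID | PERF_FORMAT_GROUP;

	printf("read_format=%u|%u\n", single, group);	/* prints 3|15 */
	return 0;
}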
Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Martin Schwidefsky Cc: Thomas-Mich Richter LPU-Reference: 20170928160633.GA26973@krava Link: http://lkml.kernel.org/n/tip-1ko2zc4nph93d8lfwjyk9ivz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr/test-stat-group | 2 ++ tools/perf/tests/attr/test-stat-group1 | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tools/perf/tests/attr/test-stat-group b/tools/perf/tests/attr/test-stat-group index fdc1596a8862..e15d6946e9b3 100644 --- a/tools/perf/tests/attr/test-stat-group +++ b/tools/perf/tests/attr/test-stat-group @@ -6,6 +6,7 @@ ret = 1 [event-1:base-stat] fd=1 group_fd=-1 +read_format=3|15 [event-2:base-stat] fd=2 @@ -13,3 +14,4 @@ group_fd=1 config=1 disabled=0 enable_on_exec=0 +read_format=3|15 diff --git a/tools/perf/tests/attr/test-stat-group1 b/tools/perf/tests/attr/test-stat-group1 index 2a1f86e4a904..1746751123dc 100644 --- a/tools/perf/tests/attr/test-stat-group1 +++ b/tools/perf/tests/attr/test-stat-group1 @@ -6,6 +6,7 @@ ret = 1 [event-1:base-stat] fd=1 group_fd=-1 +read_format=3|15 [event-2:base-stat] fd=2 @@ -13,3 +14,4 @@ group_fd=1 config=1 disabled=0 enable_on_exec=0 +read_format=3|15 From a1652bb8a01c1a830cacbe958aec17f880cc1e47 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 11:47:27 +0200 Subject: [PATCH 0248/1085] x86/boot: Spell out "boot CPU" for BP It's not obvious to everybody that BP stands for boot processor. At least it was not for me. And BP is also a CPU register on x86, so it is ambiguous. Spell out "boot CPU" everywhere instead. Signed-off-by: Jean Delvare Cc: Alok Kataria Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpspec_def.h | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index b31f8c098271..298ce6284b56 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h @@ -58,7 +58,7 @@ struct mpc_table { #define MP_TRANSLATION 192 #define CPU_ENABLED 1 /* Processor is available */ -#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */ +#define CPU_BOOTPROCESSOR 2 /* Processor is the boot CPU */ #define CPU_STEPPING_MASK 0x000F #define CPU_MODEL_MASK 0x00F0 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d705c769f77d..0b7079f0fb9c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1462,7 +1462,7 @@ void setup_local_APIC(void) /* * Set up LVT0, LVT1: * - * set up through-local-APIC on the BP's LINT0. This is not + * set up through-local-APIC on the boot CPU's LINT0. This is not * strictly necessary in pure symmetric-IO mode, but sometimes * we delegate interrupts to the 8259A. */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9176bae7fd8..03f9a1a8a314 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -863,8 +863,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) * cache alignment. * The others are not touched to avoid unwanted side effects. * - * WARNING: this function is only called on the BP. Don't add code here - * that is supposed to run on all CPUs. + * WARNING: this function is only called on the boot CPU. 
Don't add code + * here that is supposed to run on all CPUs. */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0957dd73d127..cb71626d49da 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1045,7 +1045,7 @@ void __init setup_arch(char **cmdline_p) /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine, for the BP. + * needs to be done after dmi_scan_machine(), for the boot CPU. */ init_hypervisor_platform(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ad59edd84de7..a98253e183be 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -249,7 +249,7 @@ static void notrace start_secondary(void *unused) /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* - * Check TSC synchronization with the BP: + * Check TSC synchronization with the boot CPU: */ check_tsc_sync_target(); From d19cd4bb234383166801b5d46365282808d0c0e5 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Sat, 30 Sep 2017 05:57:39 +0200 Subject: [PATCH 0249/1085] Documentation/features/KASAN: mark KASAN as supported only on 64-bit on x86 Relevant part is: arch/x86/Kconfig: select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP Signed-off-by: Adam Borowski Signed-off-by: Jonathan Corbet --- Documentation/features/debug/KASAN/arch-support.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt index 76bbd7fe27b3..f377290fe48e 100644 --- a/Documentation/features/debug/KASAN/arch-support.txt +++ b/Documentation/features/debug/KASAN/arch-support.txt @@ -34,6 +34,6 @@ | tile: | TODO | | um: | TODO | | unicore32: | TODO | - | x86: | ok | + | x86: | ok | 64-bit only | xtensa: | TODO | ----------------------- From 3ce62385019fbf7d2aea4fd7c73fbd0ddad5c8fd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 3 Oct 2017 17:54:07 +0200 Subject: [PATCH 0250/1085] Documentation: Improve softlockup_panic= description text It should say what that range is and what that integer value means. I had to look at the code... Signed-off-by: Borislav Petkov [jc: changed non-null to nonzero] Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fcaa8558ed7e..f02961bda6f4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3885,6 +3885,12 @@ [KNL] Should the soft-lockup detector generate panics. Format: + A nonzero value instructs the soft-lockup detector + to panic the machine when a soft-lockup occurs. This + is also controlled by CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC + which is the respective build-time switch to that + functionality. + softlockup_all_cpu_backtrace= [KNL] Should the soft-lockup detector generate backtraces on all cpus. 
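To make the new softlockup_panic= description above concrete, here is a minimal kernel-context sketch of the mechanism it documents (a simplified paraphrase of kernel/watchdog.c; the helper name and its signature are made up for illustration and do not appear in the source):

/* Boot-time default comes from the build-time switch named above. */
unsigned int __read_mostly softlockup_panic =
			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;

/* Hypothetical wrapper for what the watchdog timer callback does
 * once it decides a CPU is stuck. */
static void report_softlockup(unsigned int duration, int cpu)
{
	pr_emerg("BUG: soft lockup - CPU#%d stuck for %us!\n",
		 cpu, duration);
	if (softlockup_panic)
		panic("softlockup: hung tasks");
}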
From 852f1a21ffae19a45d4944791ce574e92a2b31ab Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 3 Oct 2017 16:16:37 -0500 Subject: [PATCH 0251/1085] docs: Update binfmt_misc links Documentation/binfmt_misc.txt moved to Documentation/admin-guide/binfmt-misc.rst Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/sysctl/README | 2 +- Documentation/sysctl/fs.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/sysctl/README b/Documentation/sysctl/README index 91f54ffa0077..d5f24ab0ecc3 100644 --- a/Documentation/sysctl/README +++ b/Documentation/sysctl/README @@ -60,7 +60,7 @@ debug/ dev/ device specific information (eg dev/cdrom/info) fs/ specific filesystems filehandle, inode, dentry and quota tuning - binfmt_misc + binfmt_misc kernel/ global kernel info / tuning miscellaneous stuff net/ networking stuff, for documentation look in: diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 35e17f748ca7..6c00c1e2743f 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -277,7 +277,7 @@ in a mount namespace. ---------------------------------------------------------- Documentation for the files in /proc/sys/fs/binfmt_misc is -in Documentation/binfmt_misc.txt. +in Documentation/admin-guide/binfmt-misc.rst. 3. /proc/sys/fs/mqueue - POSIX message queues filesystem From 91a1fad759ffd616b836984ca0420c7ad5996eef Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 30 Sep 2017 10:54:31 +0200 Subject: [PATCH 0252/1085] s390: use generic rwsem implementation We never optimized our rwsem inline assemblies to make use of the new atomic instructions. The generic rwsem implementation implicitly makes use of the new instructions, since it implements the required rwsem primitives with atomic operations, which we did optimize. However even when compiling for old architectures the generic variant still generates better code. So it's time to simply remove our old code and switch to the generic implementation. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/Kbuild | 1 + arch/s390/include/asm/rwsem.h | 210 ---------------------------------- 2 files changed, 1 insertion(+), 210 deletions(-) delete mode 100644 arch/s390/include/asm/rwsem.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 6e2c9f7e47fa..41c211a4d8b1 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -15,6 +15,7 @@ generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += preempt.h +generic-y += rwsem.h generic-y += trace_clock.h generic-y += unaligned.h generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h deleted file mode 100644 index 597e7e96b59e..000000000000 --- a/arch/s390/include/asm/rwsem.h +++ /dev/null @@ -1,210 +0,0 @@ -#ifndef _S390_RWSEM_H -#define _S390_RWSEM_H - -/* - * S390 version - * Copyright IBM Corp. 2002 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * Based on asm-alpha/semaphore.h and asm-i386/rwsem.h - */ - -/* - * - * The MSW of the count is the negated number of active writers and waiting - * lockers, and the LSW is the total number of active locks - * - * The lock count is initialized to 0 (no active and no waiting lockers). - * - * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an - * uncontended lock. This can be determined because XADD returns the old value. 
- * Readers increment by 1 and see a positive value when uncontended, negative - * if there are writers (and maybe) readers waiting (in which case it goes to - * sleep). - * - * The value of WAITING_BIAS supports up to 32766 waiting processes. This can - * be extended to 65534 by manually checking the whole MSW rather than relying - * on the S flag. - * - * The value of ACTIVE_BIAS supports up to 65535 active processes. - * - * This should be totally fair - if anything is waiting, a process that wants a - * lock will go to the back of the queue. When the currently active lock is - * released, if there's a writer at the front of the queue, then that and only - * that will be woken up; if there's a bunch of consecutive readers at the - * front, then they'll all be woken up, but no other readers will be. - */ - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#define RWSEM_UNLOCKED_VALUE 0x0000000000000000L -#define RWSEM_ACTIVE_BIAS 0x0000000000000001L -#define RWSEM_ACTIVE_MASK 0x00000000ffffffffL -#define RWSEM_WAITING_BIAS (-0x0000000100000000L) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - signed long old, new; - - asm volatile( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " aghi %1,%4\n" - " csg %0,%1,%2\n" - " jl 0b" - : "=&d" (old), "=&d" (new), "=Q" (sem->count) - : "Q" (sem->count), "i" (RWSEM_ACTIVE_READ_BIAS) - : "cc", "memory"); - if (old < 0) - rwsem_down_read_failed(sem); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - signed long old, new; - - asm volatile( - " lg %0,%2\n" - "0: ltgr %1,%0\n" - " jm 1f\n" - " aghi %1,%4\n" - " csg %0,%1,%2\n" - " jl 0b\n" - "1:" - : "=&d" (old), "=&d" (new), "=Q" (sem->count) - : "Q" (sem->count), "i" (RWSEM_ACTIVE_READ_BIAS) - : "cc", "memory"); - return old >= 0 ? 1 : 0; -} - -/* - * lock for writing - */ -static inline long ___down_write(struct rw_semaphore *sem) -{ - signed long old, new, tmp; - - tmp = RWSEM_ACTIVE_WRITE_BIAS; - asm volatile( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " ag %1,%4\n" - " csg %0,%1,%2\n" - " jl 0b" - : "=&d" (old), "=&d" (new), "=Q" (sem->count) - : "Q" (sem->count), "m" (tmp) - : "cc", "memory"); - - return old; -} - -static inline void __down_write(struct rw_semaphore *sem) -{ - if (___down_write(sem)) - rwsem_down_write_failed(sem); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (___down_write(sem)) - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - signed long old; - - asm volatile( - " lg %0,%1\n" - "0: ltgr %0,%0\n" - " jnz 1f\n" - " csg %0,%3,%1\n" - " jl 0b\n" - "1:" - : "=&d" (old), "=Q" (sem->count) - : "Q" (sem->count), "d" (RWSEM_ACTIVE_WRITE_BIAS) - : "cc", "memory"); - return (old == RWSEM_UNLOCKED_VALUE) ? 
1 : 0; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - signed long old, new; - - asm volatile( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " aghi %1,%4\n" - " csg %0,%1,%2\n" - " jl 0b" - : "=&d" (old), "=&d" (new), "=Q" (sem->count) - : "Q" (sem->count), "i" (-RWSEM_ACTIVE_READ_BIAS) - : "cc", "memory"); - if (new < 0) - if ((new & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - signed long old, new, tmp; - - tmp = -RWSEM_ACTIVE_WRITE_BIAS; - asm volatile( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " ag %1,%4\n" - " csg %0,%1,%2\n" - " jl 0b" - : "=&d" (old), "=&d" (new), "=Q" (sem->count) - : "Q" (sem->count), "m" (tmp) - : "cc", "memory"); - if (new < 0) - if ((new & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - signed long old, new, tmp; - - tmp = -RWSEM_WAITING_BIAS; - asm volatile( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " ag %1,%4\n" - " csg %0,%1,%2\n" - " jl 0b" - : "=&d" (old), "=&d" (new), "=Q" (sem->count) - : "Q" (sem->count), "m" (tmp) - : "cc", "memory"); - if (new > 1) - rwsem_downgrade_wake(sem); -} - -#endif /* _S390_RWSEM_H */ From 667063acb81931e2f8fd0cb91df9fcccad131d9a Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 29 Sep 2017 16:23:01 +0800 Subject: [PATCH 0253/1085] regmap: add iopoll-like polling macro for regmap_field This patch adds a macro regmap_field_read_poll_timeout that works similar to the readx_poll_timeout defined in linux/iopoll.h, except that this can also return the error value returned by a failed regmap_field_read. Signed-off-by: Chen-Yu Tsai Signed-off-by: Mark Brown --- include/linux/regmap.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 978abfbac617..93a4663d7acb 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -139,6 +139,45 @@ struct reg_sequence { pollret ?: ((cond) ? 0 : -ETIMEDOUT); \ }) +/** + * regmap_field_read_poll_timeout - Poll until a condition is met or timeout + * + * @field: Regmap field to read from + * @val: Unsigned integer variable to read the value into + * @cond: Break condition (usually involving @val) + * @sleep_us: Maximum time to sleep between reads in us (0 + * tight-loops). Should be less than ~20ms since usleep_range + * is used (see Documentation/timers/timers-howto.txt). + * @timeout_us: Timeout in us, 0 means never timeout + * + * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read + * error return value in case of a error read. In the two former cases, + * the last read value at @addr is stored in @val. Must not be called + * from atomic context if sleep_us or timeout_us are used. + * + * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. + */ +#define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \ +({ \ + ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + int pollret; \ + might_sleep_if(sleep_us); \ + for (;;) { \ + pollret = regmap_field_read((field), &(val)); \ + if (pollret) \ + break; \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + pollret = regmap_field_read((field), &(val)); \ + break; \ + } \ + if (sleep_us) \ + usleep_range((sleep_us >> 2) + 1, sleep_us); \ + } \ + pollret ?: ((cond) ? 
0 : -ETIMEDOUT; \ +}) + #ifdef CONFIG_REGMAP enum regmap_endian { From d81851c1764b26b46670c0b3bd6701308ddaab98 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 29 Sep 2017 11:25:09 +0800 Subject: [PATCH 0254/1085] regulator: axp20x: Add support for AXP813 regulators The AXP813 PMIC has 7 DC-DC buck regulators, 16 LDOs (including the fixed RTC LDO and 2 GPIO LDOs), and 1 switchable output. The drive-vbus feature is also supported. All the hardware details are very similar to the AXP803, with the following exceptions: - Extra DCDC7 buck regulator, with the same range as DCDC6 - SWitch now has a separate supply pin, instead of being chained internally from DCDC1 - RTC LDO output voltage is now 1.8V - FLDO3 is an LDO with switchable supplies, but unconfigurable output voltage. The voltage is always half that of its supply. Support for FLDO3 is currently unimplemented, as it requires runtime switching of its supplies, something the regulator subsystem does not support. It is used neither in the reference designs nor in any boards actually produced. Signed-off-by: Chen-Yu Tsai Tested-by: Maxime Ripard Acked-by: Maxime Ripard Signed-off-by: Mark Brown --- drivers/regulator/axp20x-regulator.c | 102 +++++++++++++++++++++++++-- include/linux/mfd/axp20x.h | 3 + 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 376a99b7cf5d..e1761df4cbfd 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -244,6 +244,7 @@ static const struct regulator_desc axp22x_drivevbus_regulator = { .ops = &axp20x_ops_sw, }; +/* DCDC ranges shared with AXP813 */ static const struct regulator_linear_range axp803_dcdc234_ranges[] = { REGULATOR_LINEAR_RANGE(500000, 0x0, 0x46, 10000), REGULATOR_LINEAR_RANGE(1220000, 0x47, 0x4b, 20000), @@ -426,6 +427,69 @@ static const struct regulator_desc axp809_regulators[] = { AXP_DESC_SW(AXP809, SW, "sw", "swin", AXP22X_PWR_OUT_CTRL2, BIT(6)), }; +static const struct regulator_desc axp813_regulators[] = { + AXP_DESC(AXP813, DCDC1, "dcdc1", "vin1", 1600, 3400, 100, + AXP803_DCDC1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL1, BIT(0)), + AXP_DESC_RANGES(AXP813, DCDC2, "dcdc2", "vin2", axp803_dcdc234_ranges, + 76, AXP803_DCDC2_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(1)), + AXP_DESC_RANGES(AXP813, DCDC3, "dcdc3", "vin3", axp803_dcdc234_ranges, + 76, AXP803_DCDC3_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(2)), + AXP_DESC_RANGES(AXP813, DCDC4, "dcdc4", "vin4", axp803_dcdc234_ranges, + 76, AXP803_DCDC4_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(3)), + AXP_DESC_RANGES(AXP813, DCDC5, "dcdc5", "vin5", axp803_dcdc5_ranges, + 68, AXP803_DCDC5_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(4)), + AXP_DESC_RANGES(AXP813, DCDC6, "dcdc6", "vin6", axp803_dcdc6_ranges, + 72, AXP803_DCDC6_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(5)), + AXP_DESC_RANGES(AXP813, DCDC7, "dcdc7", "vin7", axp803_dcdc6_ranges, + 72, AXP813_DCDC7_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(6)), + AXP_DESC(AXP813, ALDO1, "aldo1", "aldoin", 700, 3300, 100, + AXP22X_ALDO1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL3, BIT(5)), + AXP_DESC(AXP813, ALDO2, "aldo2", "aldoin", 700, 3300, 100, + AXP22X_ALDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL3, BIT(6)), + AXP_DESC(AXP813, ALDO3, "aldo3", "aldoin", 700, 3300, 100, + AXP22X_ALDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL3, BIT(7)), + AXP_DESC(AXP813, DLDO1, "dldo1", "dldoin", 700, 3300, 100, + AXP22X_DLDO1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(3)), + AXP_DESC_RANGES(AXP813, DLDO2, "dldo2", "dldoin", 
axp803_dldo2_ranges, + 32, AXP22X_DLDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, + BIT(4)), + AXP_DESC(AXP813, DLDO3, "dldo3", "dldoin", 700, 3300, 100, + AXP22X_DLDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(5)), + AXP_DESC(AXP813, DLDO4, "dldo4", "dldoin", 700, 3300, 100, + AXP22X_DLDO4_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(6)), + AXP_DESC(AXP813, ELDO1, "eldo1", "eldoin", 700, 1900, 50, + AXP22X_ELDO1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(0)), + AXP_DESC(AXP813, ELDO2, "eldo2", "eldoin", 700, 1900, 50, + AXP22X_ELDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(1)), + AXP_DESC(AXP813, ELDO3, "eldo3", "eldoin", 700, 1900, 50, + AXP22X_ELDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(2)), + /* to do / check ... */ + AXP_DESC(AXP813, FLDO1, "fldo1", "fldoin", 700, 1450, 50, + AXP803_FLDO1_V_OUT, 0x0f, AXP22X_PWR_OUT_CTRL3, BIT(2)), + AXP_DESC(AXP813, FLDO2, "fldo2", "fldoin", 700, 1450, 50, + AXP803_FLDO2_V_OUT, 0x0f, AXP22X_PWR_OUT_CTRL3, BIT(3)), + /* + * TODO: FLDO3 = {DCDC5, FLDOIN} / 2 + * + * This means FLDO3 effectively switches supplies at runtime, + * something the regulator subsystem does not support. + */ + AXP_DESC_FIXED(AXP813, RTC_LDO, "rtc-ldo", "ips", 1800), + AXP_DESC_IO(AXP813, LDO_IO0, "ldo-io0", "ips", 700, 3300, 100, + AXP22X_LDO_IO0_V_OUT, 0x1f, AXP20X_GPIO0_CTRL, 0x07, + AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), + AXP_DESC_IO(AXP813, LDO_IO1, "ldo-io1", "ips", 700, 3300, 100, + AXP22X_LDO_IO1_V_OUT, 0x1f, AXP20X_GPIO1_CTRL, 0x07, + AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), + AXP_DESC_SW(AXP813, SW, "sw", "swin", AXP22X_PWR_OUT_CTRL2, BIT(7)), +}; + static int axp20x_set_dcdc_freq(struct platform_device *pdev, u32 dcdcfreq) { struct axp20x_dev *axp20x = dev_get_drvdata(pdev->dev.parent); @@ -441,9 +505,10 @@ static int axp20x_set_dcdc_freq(struct platform_device *pdev, u32 dcdcfreq) step = 75; break; case AXP803_ID: + case AXP813_ID: /* - * AXP803 DCDC work frequency setting has the same range and - * step as AXP22X, but at a different register. + * AXP803/AXP813 DCDC work frequency setting has the same + * range and step as AXP22X, but at a different register. * Fall through to the check below. * (See include/linux/mfd/axp20x.h) */ @@ -561,6 +626,14 @@ static int axp20x_set_dcdc_workmode(struct regulator_dev *rdev, int id, u32 work workmode <<= id - AXP803_DCDC1; break; + case AXP813_ID: + if (id < AXP813_DCDC1 || id > AXP813_DCDC7) + return -EINVAL; + + mask = AXP22X_WORKMODE_DCDCX_MASK(id - AXP813_DCDC1); + workmode <<= id - AXP813_DCDC1; + break; + default: /* should not happen */ WARN_ON(1); @@ -579,8 +652,8 @@ static bool axp20x_is_polyphase_slave(struct axp20x_dev *axp20x, int id) u32 reg = 0; /* - * Currently in our supported AXP variants, only AXP803 and AXP806 - * have polyphase regulators. + * Currently in our supported AXP variants, only AXP803, AXP806, + * and AXP813 have polyphase regulators. 
*/ switch (axp20x->variant) { case AXP803_ID: @@ -608,6 +681,17 @@ static bool axp20x_is_polyphase_slave(struct axp20x_dev *axp20x, int id) } break; + case AXP813_ID: + regmap_read(axp20x->regmap, AXP803_POLYPHASE_CTRL, ®); + + switch (id) { + case AXP803_DCDC3: + return !!(reg & BIT(6)); + case AXP803_DCDC6: + return !!(reg & BIT(5)); + } + break; + default: return false; } @@ -656,6 +740,12 @@ static int axp20x_regulator_probe(struct platform_device *pdev) regulators = axp809_regulators; nregulators = AXP809_REG_ID_MAX; break; + case AXP813_ID: + regulators = axp813_regulators; + nregulators = AXP813_REG_ID_MAX; + drivevbus = of_property_read_bool(pdev->dev.parent->of_node, + "x-powers,drive-vbus-en"); + break; default: dev_err(&pdev->dev, "Unsupported AXP variant: %ld\n", axp20x->variant); @@ -677,6 +767,10 @@ static int axp20x_regulator_probe(struct platform_device *pdev) if (axp20x_is_polyphase_slave(axp20x, i)) continue; + /* Support for AXP813's FLDO3 is not implemented */ + if (axp20x->variant == AXP813_ID && i == AXP813_FLDO3) + continue; + /* * Regulators DC1SW and DC5LDO are connected internally, * so we have to handle their supply names separately. diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index e9c908c4fba8..78dc85365c4f 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -131,6 +131,9 @@ enum axp20x_variants { #define AXP803_DCDC6_V_OUT 0x25 #define AXP803_DCDC_FREQ_CTRL 0x3b +/* Other DCDC regulator control registers are the same as AXP803 */ +#define AXP813_DCDC7_V_OUT 0x26 + /* Interrupt */ #define AXP152_IRQ1_EN 0x40 #define AXP152_IRQ2_EN 0x41 From 219a7bc577e6024cd6f84571d93d939b3517aafe Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 4 Oct 2017 14:19:53 +0200 Subject: [PATCH 0255/1085] spi: rspi: Use of_device_get_match_data() helper Use the of_device_get_match_data() helper instead of open coding. Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- drivers/spi/spi-rspi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c index 2a10b3f94ff7..2ce875764ca6 100644 --- a/drivers/spi/spi-rspi.c +++ b/drivers/spi/spi-rspi.c @@ -1221,7 +1221,6 @@ static int rspi_probe(struct platform_device *pdev) struct spi_master *master; struct rspi_data *rspi; int ret; - const struct of_device_id *of_id; const struct rspi_plat_data *rspi_pd; const struct spi_ops *ops; @@ -1229,9 +1228,8 @@ static int rspi_probe(struct platform_device *pdev) if (master == NULL) return -ENOMEM; - of_id = of_match_device(rspi_of_match, &pdev->dev); - if (of_id) { - ops = of_id->data; + ops = of_device_get_match_data(&pdev->dev); + if (ops) { ret = rspi_parse_dt(&pdev->dev, master); if (ret) goto error1; From ecb1596aa27856a256b0698a93d7be06ce041e73 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 4 Oct 2017 14:20:27 +0200 Subject: [PATCH 0256/1085] spi: sh-msiof: Use of_device_get_match_data() helper Use the of_device_get_match_data() helper instead of open coding. 
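The pattern replaced by this and the previous patch, as a generic sketch (hypothetical example_* names, not taken from either driver; of_device_get_match_data() returns the .data of the matching of_device_id, or NULL when there was no OF match):

#include <linux/of_device.h>
#include <linux/platform_device.h>

struct example_chipdata { int variant; };	/* hypothetical */

static int example_probe(struct platform_device *pdev)
{
	const struct example_chipdata *chipdata;

	/*
	 * Open-coded lookup the helper replaces:
	 *
	 *	const struct of_device_id *of_id =
	 *		of_match_device(example_of_match, &pdev->dev);
	 *	if (of_id)
	 *		chipdata = of_id->data;
	 */
	chipdata = of_device_get_match_data(&pdev->dev);
	if (!chipdata)
		return -ENODEV;	/* probed without an OF match */

	return 0;
}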
Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- drivers/spi/spi-sh-msiof.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index 00808448cf35..2c5ea4c57508 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -1211,15 +1211,13 @@ static int sh_msiof_spi_probe(struct platform_device *pdev) struct resource *r; struct spi_master *master; const struct sh_msiof_chipdata *chipdata; - const struct of_device_id *of_id; struct sh_msiof_spi_info *info; struct sh_msiof_spi_priv *p; int i; int ret; - of_id = of_match_device(sh_msiof_match, &pdev->dev); - if (of_id) { - chipdata = of_id->data; + chipdata = of_device_get_match_data(&pdev->dev); + if (chipdata) { info = sh_msiof_spi_parse_dt(&pdev->dev); } else { chipdata = (const void *)pdev->id_entry->driver_data; From 5fd88b60e11b7d81b2c944c1b45834c4a6aa0157 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Oct 2017 15:59:31 +0100 Subject: [PATCH 0257/1085] x86/intel_rdt/cqm: Make integer rmid_limbo_count static rmid_limbo_count is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'rmid_limbo_count' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: kernel-janitors@vger.kernel.org Link: https://lkml.kernel.org/r/20171002145931.27479-1-colin.king@canonical.com --- arch/x86/kernel/cpu/intel_rdt_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 30827510094b..681450eee428 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -51,7 +51,7 @@ static LIST_HEAD(rmid_free_lru); * may have a occupancy value > intel_cqm_threshold. User can change * the threshold occupancy value. */ -unsigned int rmid_limbo_count; +static unsigned int rmid_limbo_count; /** * @rmid_entry - The entry in the limbo and free lists. From 3916a4135c696fa226a1abe6d6a0ff7f5edd9a7c Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Wed, 4 Oct 2017 15:48:57 -0700 Subject: [PATCH 0258/1085] x86/intel_rdt: Remove redundant assignment The assignment to the 'files' variable is immediately overwritten in the following line. Remove the older assignment, which was meant specifically for creating control groups files.
Fixes: c7d9aac61311 ("x86/intel_rdt/cqm: Add mkdir support for RDT monitoring") Reported-by: Reinette Chatre Signed-off-by: Jithu Joseph Signed-off-by: Thomas Gleixner Acked-by: Fenghua Yu Cc: tony.luck@intel.com Cc: vikas.shivappa@intel.com Link: https://lkml.kernel.org/r/1507157337-18118-1-git-send-email-jithu.joseph@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index abd220bf6cd7..8a61b20c7e51 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1635,7 +1635,6 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - files = RFTYPE_BASE | RFTYPE_CTRL; files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); ret = rdtgroup_add_files(kn, files); if (ret) { From 92bb6cb1403015b1b9520332c8f2ad983c220f67 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 17:54:25 -0700 Subject: [PATCH 0259/1085] x86/mce: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Adjust sanity-check WARN to make sure the triggering timer matches the current CPU timer. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Tony Luck Cc: linux-edac@vger.kernel.org Link: https://lkml.kernel.org/r/20171005005425.GA23950@beast --- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3b413065c613..b1d616d08eee 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1367,13 +1367,12 @@ static void __start_timer(struct timer_list *t, unsigned long interval) local_irq_restore(flags); } -static void mce_timer_fn(unsigned long data) +static void mce_timer_fn(struct timer_list *t) { - struct timer_list *t = this_cpu_ptr(&mce_timer); - int cpu = smp_processor_id(); + struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); unsigned long iv; - WARN_ON(cpu != data); + WARN_ON(cpu_t != t); iv = __this_cpu_read(mce_next_interval); @@ -1763,17 +1762,15 @@ static void mce_start_timer(struct timer_list *t) static void __mcheck_cpu_setup_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned int cpu = smp_processor_id(); - setup_pinned_timer(t, mce_timer_fn, cpu); + timer_setup(t, mce_timer_fn, TIMER_PINNED); } static void __mcheck_cpu_init_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned int cpu = smp_processor_id(); - setup_pinned_timer(t, mce_timer_fn, cpu); + timer_setup(t, mce_timer_fn, TIMER_PINNED); mce_start_timer(t); } From 58e1177b4cd10b0d358faf7d7ebb3779f98bc3ea Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:55 -0700 Subject: [PATCH 0260/1085] timer: Convert schedule_timeout() to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new from_timer() helper and passing the timer pointer explicitly. Since this special timer is on the stack, it needs to have a wrapper structure to carry state once .data is eliminated. 
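For readers following this series, the conversion pattern in one self-contained sketch (hypothetical structure and function names; the real conversions follow in the diffs below). The timer is embedded in its owning structure, and from_timer(), a container_of() wrapper, recovers the owner in the callback, replacing the old unsigned long .data cookie:

#include <linux/jiffies.h>
#include <linux/timer.h>

struct example_state {
	struct timer_list timer;	/* embedded in its owner */
	int pending;
};

static void example_timeout(struct timer_list *t)
{
	/* Recover the owning structure from the timer pointer. */
	struct example_state *st = from_timer(st, t, timer);

	st->pending = 0;
}

static void example_init(struct example_state *st)
{
	/* No .data cookie: the callback takes the timer pointer itself. */
	timer_setup(&st->timer, example_timeout, 0);
	mod_timer(&st->timer, jiffies + HZ);
}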
Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: "Rafael J. Wysocki" Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: Stephen Boyd Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-2-git-send-email-keescook@chromium.org --- include/linux/timer.h | 8 ++++++++ kernel/time/timer.c | 26 +++++++++++++++++++------- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 6383c528b148..5ef5c9e41a09 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -179,6 +179,14 @@ static inline void timer_setup(struct timer_list *timer, (TIMER_DATA_TYPE)timer, flags); } +static inline void timer_setup_on_stack(struct timer_list *timer, + void (*callback)(struct timer_list *), + unsigned int flags) +{ + __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback, + (TIMER_DATA_TYPE)timer, flags); +} + #define from_timer(var, callback_timer, timer_fieldname) \ container_of(callback_timer, typeof(*var), timer_fieldname) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f2674a056c26..38613ced2324 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1668,9 +1668,20 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -static void process_timeout(unsigned long __data) +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. 
+ */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) { - wake_up_process((struct task_struct *)__data); + struct process_timer *timeout = from_timer(timeout, t, timer); + + wake_up_process(timeout->task); } /** @@ -1704,7 +1715,7 @@ static void process_timeout(unsigned long __data) */ signed long __sched schedule_timeout(signed long timeout) { - struct timer_list timer; + struct process_timer timer; unsigned long expire; switch (timeout) @@ -1738,13 +1749,14 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false); + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, false); schedule(); - del_singleshot_timer_sync(&timer); + del_singleshot_timer_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); + destroy_timer_on_stack(&timer.timer); timeout = expire - jiffies; From 1d1fe902afb380571105d05d0be3de61b81bc9a8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:56 -0700 Subject: [PATCH 0261/1085] timer: Remove init_timer_pinned_deferrable() in favor of timer_setup() This refactors the only user of init_timer_pinned_deferrable() to use the new timer_setup() and from_timer(). Adds a pointer back to the policy, and drops the definition of init_timer_pinned_deferrable(). Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-3-git-send-email-keescook@chromium.org --- drivers/cpufreq/powernv-cpufreq.c | 13 +++++++------ include/linux/timer.h | 2 -- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 3ff5160451b4..b6d7c4c98d0a 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -90,6 +90,7 @@ struct global_pstate_info { int last_gpstate_idx; spinlock_t gpstate_lock; struct timer_list timer; + struct cpufreq_policy *policy; }; static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; @@ -625,10 +626,10 @@ static inline void queue_gpstate_timer(struct global_pstate_info *gpstates) * according quadratic equation. 
Queues a new timer if it is still not equal * to local pstate */ -void gpstate_timer_handler(unsigned long data) +void gpstate_timer_handler(struct timer_list *t) { - struct cpufreq_policy *policy = (struct cpufreq_policy *)data; - struct global_pstate_info *gpstates = policy->driver_data; + struct global_pstate_info *gpstates = from_timer(gpstates, t, timer); + struct cpufreq_policy *policy = gpstates->policy; int gpstate_idx, lpstate_idx; unsigned long val; unsigned int time_diff = jiffies_to_msecs(jiffies) @@ -800,9 +801,9 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->driver_data = gpstates; /* initialize timer */ - init_timer_pinned_deferrable(&gpstates->timer); - gpstates->timer.data = (unsigned long)policy; - gpstates->timer.function = gpstate_timer_handler; + gpstates->policy = policy; + timer_setup(&gpstates->timer, gpstate_timer_handler, + TIMER_PINNED | TIMER_DEFERRABLE); gpstates->timer.expires = jiffies + msecs_to_jiffies(GPSTATE_TIMER_INTERVAL); spin_lock_init(&gpstates->gpstate_lock); diff --git a/include/linux/timer.h b/include/linux/timer.h index 5ef5c9e41a09..d11e819a86e2 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -132,8 +132,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) -#define init_timer_pinned_deferrable(timer) \ - __init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED) #define init_timer_on_stack(timer) \ __init_timer_on_stack((timer), 0) From 9c6c273aa4248c60569de6ef7e7e9c7bed3cd32e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:57 -0700 Subject: [PATCH 0262/1085] timer: Remove init_timer_on_stack() in favor of timer_setup_on_stack() Remove uses of init_timer_on_stack() with open-coded function and data assignments that could be expressed using timer_setup_on_stack(). Several were removed from the stack entirely since there was a one-to-one mapping of parent structure to timer, those are switched to using timer_setup() instead. All related callbacks were adjusted to use from_timer(). Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: Wim Van Sebroeck Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: linux-scsi@vger.kernel.org Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Lai Jiangshan Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-4-git-send-email-keescook@chromium.org --- drivers/base/power/main.c | 8 +++----- drivers/firewire/core-transaction.c | 10 +++++----- drivers/parport/ieee1284.c | 21 +++++++-------------- drivers/s390/char/tape.h | 1 + drivers/s390/char/tape_std.c | 18 ++++++------------ drivers/s390/net/lcs.c | 16 ++++++---------- drivers/s390/net/lcs.h | 1 + drivers/scsi/qla1280.c | 14 +++++--------- drivers/scsi/qla1280.h | 1 + include/linux/parport.h | 1 + include/linux/timer.h | 2 -- 11 files changed, 36 insertions(+), 57 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 770b1539a083..ae47b2ec84b4 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -478,9 +478,9 @@ struct dpm_watchdog { * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ -static void dpm_watchdog_handler(unsigned long data) +static void dpm_watchdog_handler(struct timer_list *t) { - struct dpm_watchdog *wd = (void *)data; + struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL); @@ -500,11 +500,9 @@ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) wd->dev = dev; wd->tsk = current; - init_timer_on_stack(timer); + timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT; - timer->function = dpm_watchdog_handler; - timer->data = (unsigned long)wd; add_timer(timer); } diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index d6a09b9cd8cc..4372f9e4b0da 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -137,9 +137,9 @@ int fw_cancel_transaction(struct fw_card *card, } EXPORT_SYMBOL(fw_cancel_transaction); -static void split_transaction_timeout_callback(unsigned long data) +static void split_transaction_timeout_callback(struct timer_list *timer) { - struct fw_transaction *t = (struct fw_transaction *)data; + struct fw_transaction *t = from_timer(t, timer, split_timeout_timer); struct fw_card *card = t->card; unsigned long flags; @@ -373,8 +373,8 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, t->tlabel = tlabel; t->card = card; t->is_split_transaction = false; - setup_timer(&t->split_timeout_timer, - split_transaction_timeout_callback, (unsigned long)t); + timer_setup(&t->split_timeout_timer, + split_transaction_timeout_callback, 0); t->callback = callback; t->callback_data = callback_data; @@ -423,7 +423,7 @@ int fw_run_transaction(struct fw_card *card, int tcode, int destination_id, struct transaction_callback_data d; struct fw_transaction t; - init_timer_on_stack(&t.split_timeout_timer); + timer_setup_on_stack(&t.split_timeout_timer, NULL, 0); init_completion(&d.done); d.payload = payload; fw_send_request(card, &t, tcode, destination_id, generation, speed, diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c index 74cc6dd982d2..2d1a5c737c6e 100644 --- a/drivers/parport/ieee1284.c +++ b/drivers/parport/ieee1284.c @@ -44,10 +44,11 @@ static void parport_ieee1284_wakeup (struct parport *port) up (&port->physport->ieee1284.irq); } -static struct parport 
*port_from_cookie[PARPORT_MAX]; -static void timeout_waiting_on_port (unsigned long cookie) +static void timeout_waiting_on_port (struct timer_list *t) { - parport_ieee1284_wakeup (port_from_cookie[cookie % PARPORT_MAX]); + struct parport *port = from_timer(port, t, timer); + + parport_ieee1284_wakeup (port); } /** @@ -69,27 +70,19 @@ static void timeout_waiting_on_port (unsigned long cookie) int parport_wait_event (struct parport *port, signed long timeout) { int ret; - struct timer_list timer; if (!port->physport->cad->timeout) /* Zero timeout is special, and we can't down() the semaphore. */ return 1; - init_timer_on_stack(&timer); - timer.expires = jiffies + timeout; - timer.function = timeout_waiting_on_port; - port_from_cookie[port->number % PARPORT_MAX] = port; - timer.data = port->number; - - add_timer (&timer); + timer_setup(&port->timer, timeout_waiting_on_port, 0); + mod_timer(&port->timer, jiffies + timeout); ret = down_interruptible (&port->physport->ieee1284.irq); - if (!del_timer_sync(&timer) && !ret) + if (!del_timer_sync(&port->timer) && !ret) /* Timed out. */ ret = 1; - destroy_timer_on_stack(&timer); - return ret; } diff --git a/drivers/s390/char/tape.h b/drivers/s390/char/tape.h index ea664dd4f56d..52fbcd9c3cf8 100644 --- a/drivers/s390/char/tape.h +++ b/drivers/s390/char/tape.h @@ -128,6 +128,7 @@ struct tape_request { int options; /* options for execution. */ int retries; /* retry counter for error recovery. */ int rescnt; /* residual count from devstat. */ + struct timer_list timer; /* timer for std_assign_timeout(). */ /* Callback for delivering final status. */ void (*callback)(struct tape_request *, void *); diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c index 3478e19ae194..cd204abdc0bc 100644 --- a/drivers/s390/char/tape_std.c +++ b/drivers/s390/char/tape_std.c @@ -32,14 +32,12 @@ * tape_std_assign */ static void -tape_std_assign_timeout(unsigned long data) +tape_std_assign_timeout(struct timer_list *t) { - struct tape_request * request; - struct tape_device * device; + struct tape_request * request = from_timer(request, t, timer); + struct tape_device * device = request->device; int rc; - request = (struct tape_request *) data; - device = request->device; BUG_ON(!device); DBF_EVENT(3, "%08x: Assignment timeout. Device busy.\n", @@ -70,16 +68,12 @@ tape_std_assign(struct tape_device *device) * to another host (actually this shouldn't happen but it does). * So we set up a timeout for this call. */ - init_timer_on_stack(&timeout); - timeout.function = tape_std_assign_timeout; - timeout.data = (unsigned long) request; - timeout.expires = jiffies + 2 * HZ; - add_timer(&timeout); + timer_setup(&request->timer, tape_std_assign_timeout, 0); + mod_timer(&timeout, jiffies + 2 * HZ); rc = tape_do_io_interruptible(device, request); - del_timer_sync(&timeout); - destroy_timer_on_stack(&timeout); + del_timer_sync(&request->timer); if (rc != 0) { DBF_EVENT(3, "%08x: assign failed - device might be busy\n", diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index d01b5c2a7760..21bba406d5be 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -834,9 +834,10 @@ lcs_notify_lancmd_waiters(struct lcs_card *card, struct lcs_cmd *cmd) * Emit buffer of a lan command. 
*/ static void -lcs_lancmd_timeout(unsigned long data) +lcs_lancmd_timeout(struct timer_list *t) { - struct lcs_reply *reply, *list_reply, *r; + struct lcs_reply *reply = from_timer(reply, t, timer); + struct lcs_reply *list_reply, *r; unsigned long flags; LCS_DBF_TEXT(4, trace, "timeout"); @@ -864,7 +865,6 @@ lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer, { struct lcs_reply *reply; struct lcs_cmd *cmd; - struct timer_list timer; unsigned long flags; int rc; @@ -885,14 +885,10 @@ lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer, rc = lcs_ready_buffer(&card->write, buffer); if (rc) return rc; - init_timer_on_stack(&timer); - timer.function = lcs_lancmd_timeout; - timer.data = (unsigned long) reply; - timer.expires = jiffies + HZ*card->lancmd_timeout; - add_timer(&timer); + timer_setup(&reply->timer, lcs_lancmd_timeout, 0); + mod_timer(&reply->timer, jiffies + HZ * card->lancmd_timeout); wait_event(reply->wait_q, reply->received); - del_timer_sync(&timer); - destroy_timer_on_stack(&timer); + del_timer_sync(&reply->timer); LCS_DBF_TEXT_(4, trace, "rc:%d",reply->rc); rc = reply->rc; lcs_put_reply(reply); diff --git a/drivers/s390/net/lcs.h b/drivers/s390/net/lcs.h index 150fcb4cebc3..d44fb8d9378f 100644 --- a/drivers/s390/net/lcs.h +++ b/drivers/s390/net/lcs.h @@ -275,6 +275,7 @@ struct lcs_reply { void (*callback)(struct lcs_card *, struct lcs_cmd *); wait_queue_head_t wait_q; struct lcs_card *card; + struct timer_list timer; int received; int rc; }; diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 8a29fb09db14..390775d5c918 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -758,9 +758,9 @@ enum action { }; -static void qla1280_mailbox_timeout(unsigned long __data) +static void qla1280_mailbox_timeout(struct timer_list *t) { - struct scsi_qla_host *ha = (struct scsi_qla_host *)__data; + struct scsi_qla_host *ha = from_timer(ha, t, mailbox_timer); struct device_reg __iomem *reg; reg = ha->iobase; @@ -2465,7 +2465,6 @@ qla1280_mailbox_command(struct scsi_qla_host *ha, uint8_t mr, uint16_t *mb) uint16_t __iomem *mptr; uint16_t data; DECLARE_COMPLETION_ONSTACK(wait); - struct timer_list timer; ENTER("qla1280_mailbox_command"); @@ -2494,18 +2493,15 @@ qla1280_mailbox_command(struct scsi_qla_host *ha, uint8_t mr, uint16_t *mb) /* Issue set host interrupt command. 
*/ /* set up a timer just in case we're really jammed */ - init_timer_on_stack(&timer); - timer.expires = jiffies + 20*HZ; - timer.data = (unsigned long)ha; - timer.function = qla1280_mailbox_timeout; - add_timer(&timer); + timer_setup(&ha->mailbox_timer, qla1280_mailbox_timeout, 0); + mod_timer(&ha->mailbox_timer, jiffies + 20 * HZ); spin_unlock_irq(ha->host->host_lock); WRT_REG_WORD(&reg->host_cmd, HC_SET_HOST_INT); data = qla1280_debounce_register(&reg->istatus); wait_for_completion(&wait); - del_timer_sync(&timer); + del_timer_sync(&ha->mailbox_timer); spin_lock_irq(ha->host->host_lock); diff --git a/drivers/scsi/qla1280.h b/drivers/scsi/qla1280.h index 834884b9eed5..1522aca2c8c8 100644 --- a/drivers/scsi/qla1280.h +++ b/drivers/scsi/qla1280.h @@ -1055,6 +1055,7 @@ struct scsi_qla_host { struct list_head done_q; /* Done queue */ struct completion *mailbox_wait; + struct timer_list mailbox_timer; volatile struct { uint32_t online:1; /* 0 */ diff --git a/include/linux/parport.h b/include/linux/parport.h index 58e3c64c6b49..397607a0c0eb 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -225,6 +225,7 @@ struct parport { struct pardevice *waittail; struct list_head list; + struct timer_list timer; unsigned int flags; void *sysctl_table; diff --git a/include/linux/timer.h b/include/linux/timer.h index d11e819a86e2..b10c4bdc6fbd 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -132,8 +132,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) -#define init_timer_on_stack(timer) \ - __init_timer_on_stack((timer), 0) #define __setup_timer(_timer, _fn, _data, _flags) \ do { \ From 185981d54a60ae90942c6ba9006b250f3348cef2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:58 -0700 Subject: [PATCH 0263/1085] timer: Remove init_timer_pinned() in favor of timer_setup() This refactors the only users of init_timer_pinned() to use the new timer_setup() and from_timer(). Drops the definition of init_timer_pinned(). Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: David S. Miller # for networking parts Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-5-git-send-email-keescook@chromium.org --- drivers/net/ethernet/tile/tilepro.c | 9 ++++----- include/linux/timer.h | 2 -- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/tile/tilepro.c b/drivers/net/ethernet/tile/tilepro.c index 49ccee4b9aec..56d06282fbde 100644 --- a/drivers/net/ethernet/tile/tilepro.c +++ b/drivers/net/ethernet/tile/tilepro.c @@ -608,9 +608,9 @@ static void tile_net_schedule_egress_timer(struct tile_net_cpu *info) * ISSUE: Maybe instead track number of expected completions, and free * only that many, resetting to zero if "pending" is ever false. */ -static void tile_net_handle_egress_timer(unsigned long arg) +static void tile_net_handle_egress_timer(struct timer_list *t) { - struct tile_net_cpu *info = (struct tile_net_cpu *)arg; + struct tile_net_cpu *info = from_timer(info, t, egress_timer); struct net_device *dev = info->napi.dev; /* The timer is no longer scheduled. */ @@ -1004,9 +1004,8 @@ static void tile_net_register(void *dev_ptr) BUG(); /* Initialize the egress timer. */ - init_timer_pinned(&info->egress_timer); - info->egress_timer.data = (long)info; - info->egress_timer.function = tile_net_handle_egress_timer; + timer_setup(&info->egress_timer, tile_net_handle_egress_timer, + TIMER_PINNED); u64_stats_init(&info->stats.syncp); diff --git a/include/linux/timer.h b/include/linux/timer.h index b10c4bdc6fbd..9da903562ed4 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -128,8 +128,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define init_timer(timer) \ __init_timer((timer), 0) -#define init_timer_pinned(timer) \ - __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) From df7e828c1b699792b2ff26ebcf0a6d1025b2b790 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:59 -0700 Subject: [PATCH 0264/1085] timer: Remove init_timer_deferrable() in favor of timer_setup() This refactors the only users of init_timer_deferrable() to use the new timer_setup() and from_timer(). Removes definition of init_timer_deferrable(). Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: David S. Miller # for networking parts Acked-by: Sebastian Reichel # for drivers/hsi parts Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Oleg Nesterov Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: "Rafael J. Wysocki" Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. 
Petersen" Cc: Greg Kroah-Hartman Cc: linux-wireless@vger.kernel.org Cc: Sebastian Reichel Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-6-git-send-email-keescook@chromium.org --- arch/powerpc/mm/numa.c | 12 +++----- drivers/hsi/clients/ssi_protocol.c | 32 +++++++++++--------- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 11 +++---- drivers/net/vxlan.c | 8 ++--- drivers/net/wireless/ath/ath6kl/recovery.c | 9 +++--- include/linux/timer.h | 2 -- 6 files changed, 34 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b95c584ce19d..f9b6107d6854 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1453,7 +1453,7 @@ static void topology_schedule_update(void) schedule_work(&topology_work); } -static void topology_timer_fn(unsigned long ignored) +static void topology_timer_fn(struct timer_list *unused) { if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask)) topology_schedule_update(); @@ -1463,14 +1463,11 @@ static void topology_timer_fn(unsigned long ignored) reset_topology_timer(); } } -static struct timer_list topology_timer = - TIMER_INITIALIZER(topology_timer_fn, 0, 0); +static struct timer_list topology_timer; static void reset_topology_timer(void) { - topology_timer.data = 0; - topology_timer.expires = jiffies + 60 * HZ; - mod_timer(&topology_timer, topology_timer.expires); + mod_timer(&topology_timer, jiffies + 60 * HZ); } #ifdef CONFIG_SMP @@ -1530,7 +1527,8 @@ int start_topology_update(void) prrn_enabled = 0; vphn_enabled = 1; setup_cpu_associativity_change_counters(); - init_timer_deferrable(&topology_timer); + timer_setup(&topology_timer, topology_timer_fn, + TIMER_DEFERRABLE); reset_topology_timer(); } } diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c index 93d28c0ec8bf..67af03d3aeb3 100644 --- a/drivers/hsi/clients/ssi_protocol.c +++ b/drivers/hsi/clients/ssi_protocol.c @@ -464,10 +464,10 @@ static void ssip_error(struct hsi_client *cl) hsi_async_read(cl, msg); } -static void ssip_keep_alive(unsigned long data) +static void ssip_keep_alive(struct timer_list *t) { - struct hsi_client *cl = (struct hsi_client *)data; - struct ssi_protocol *ssi = hsi_client_drvdata(cl); + struct ssi_protocol *ssi = from_timer(ssi, t, keep_alive); + struct hsi_client *cl = ssi->cl; dev_dbg(&cl->device, "Keep alive kick in: m(%d) r(%d) s(%d)\n", ssi->main_state, ssi->recv_state, ssi->send_state); @@ -490,9 +490,19 @@ static void ssip_keep_alive(unsigned long data) spin_unlock(&ssi->lock); } -static void ssip_wd(unsigned long data) +static void ssip_rx_wd(struct timer_list *t) { - struct hsi_client *cl = (struct hsi_client *)data; + struct ssi_protocol *ssi = from_timer(ssi, t, rx_wd); + struct hsi_client *cl = ssi->cl; + + dev_err(&cl->device, "Watchdog trigerred\n"); + ssip_error(cl); +} + +static void ssip_tx_wd(unsigned long data) +{ + struct ssi_protocol *ssi = from_timer(ssi, t, tx_wd); + struct hsi_client *cl = ssi->cl; dev_err(&cl->device, "Watchdog trigerred\n"); ssip_error(cl); @@ -1084,15 +1094,9 @@ static int ssi_protocol_probe(struct device *dev) } spin_lock_init(&ssi->lock); - init_timer_deferrable(&ssi->rx_wd); - init_timer_deferrable(&ssi->tx_wd); - init_timer(&ssi->keep_alive); - ssi->rx_wd.data = (unsigned long)cl; - ssi->rx_wd.function = ssip_wd; - ssi->tx_wd.data = (unsigned long)cl; - 
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 9feec7009443..29fea74bff2e 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -4725,9 +4725,9 @@ static const struct net_device_ops qlge_netdev_ops = { .ndo_vlan_rx_kill_vid = qlge_vlan_rx_kill_vid, }; -static void ql_timer(unsigned long data) +static void ql_timer(struct timer_list *t) { - struct ql_adapter *qdev = (struct ql_adapter *)data; + struct ql_adapter *qdev = from_timer(qdev, t, timer); u32 var = 0; var = ql_read32(qdev, STS); @@ -4806,11 +4806,8 @@ static int qlge_probe(struct pci_dev *pdev, /* Start up the timer to trigger EEH if * the bus goes dead */ - init_timer_deferrable(&qdev->timer); - qdev->timer.data = (unsigned long)qdev; - qdev->timer.function = ql_timer; - qdev->timer.expires = jiffies + (5*HZ); - add_timer(&qdev->timer); + timer_setup(&qdev->timer, ql_timer, TIMER_DEFERRABLE); + mod_timer(&qdev->timer, jiffies + (5*HZ)); ql_link_off(qdev); ql_display_dev_info(ndev); atomic_set(&qdev->lb_count, 0); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index d7c49cf1d5e9..3247d2feda07 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2325,9 +2325,9 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } /* Walk the forwarding table and purge stale entries */ -static void vxlan_cleanup(unsigned long arg) +static void vxlan_cleanup(struct timer_list *t) { - struct vxlan_dev *vxlan = (struct vxlan_dev *) arg; + struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; unsigned int h; @@ -2647,9 +2647,7 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); - init_timer_deferrable(&vxlan->age_timer); - vxlan->age_timer.function = vxlan_cleanup; - vxlan->age_timer.data = (unsigned long) vxlan; + timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; diff --git a/drivers/net/wireless/ath/ath6kl/recovery.c b/drivers/net/wireless/ath/ath6kl/recovery.c index 3a8d5e97dc8e..c09e40c9010f 100644 --- a/drivers/net/wireless/ath/ath6kl/recovery.c +++ b/drivers/net/wireless/ath/ath6kl/recovery.c @@ -60,9 +60,9 @@ void ath6kl_recovery_hb_event(struct ath6kl *ar, u32 cookie) ar->fw_recovery.hb_pending = false; } -static void ath6kl_recovery_hb_timer(unsigned long data) +static void ath6kl_recovery_hb_timer(struct timer_list *t) { - struct ath6kl *ar = (struct ath6kl *) data; + struct ath6kl *ar = from_timer(ar, t, fw_recovery.hb_timer); int err; if (test_bit(RECOVERY_CLEANUP, &ar->flag) || @@ -104,9 +104,8 @@ void ath6kl_recovery_init(struct ath6kl *ar) recovery->seq_num = 0; recovery->hb_misscnt = 0; ar->fw_recovery.hb_pending = false; - ar->fw_recovery.hb_timer.function = ath6kl_recovery_hb_timer; - ar->fw_recovery.hb_timer.data = (unsigned long) ar; - init_timer_deferrable(&ar->fw_recovery.hb_timer); + timer_setup(&ar->fw_recovery.hb_timer, ath6kl_recovery_hb_timer, + TIMER_DEFERRABLE); if (ar->fw_recovery.hb_poll) mod_timer(&ar->fw_recovery.hb_timer,
jiffies + diff --git a/include/linux/timer.h b/include/linux/timer.h index 9da903562ed4..10cc45ca5803 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -128,8 +128,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define init_timer(timer) \ __init_timer((timer), 0) -#define init_timer_deferrable(timer) \ - __init_timer((timer), TIMER_DEFERRABLE) #define __setup_timer(_timer, _fn, _data, _flags) \ do { \ From 5cd79d6abd2c142352dead0e3df04e86ee32f5d3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:00 -0700 Subject: [PATCH 0265/1085] timer: Remove users of TIMER_DEFERRED_INITIALIZER This removes uses of TIMER_DEFERRED_INITIALIZER and chooses a location to call timer_setup() from before add_timer() or mod_timer() is called. Adjusts callbacks to use from_timer() as needed. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-7-git-send-email-keescook@chromium.org --- arch/s390/kernel/lgr.c | 6 +++--- arch/s390/kernel/topology.c | 6 +++--- kernel/workqueue.c | 8 +++----- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index ae7dff110054..bf9622f0e6b1 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -153,14 +153,13 @@ static void lgr_timer_set(void); /* * LGR timer callback */ -static void lgr_timer_fn(unsigned long ignored) +static void lgr_timer_fn(struct timer_list *unused) { lgr_info_log(); lgr_timer_set(); } -static struct timer_list lgr_timer = - TIMER_DEFERRED_INITIALIZER(lgr_timer_fn, 0, 0); +static struct timer_list lgr_timer; /* * Setup next LGR timer @@ -181,6 +180,7 @@ static int __init lgr_init(void) debug_register_view(lgr_dbf, &debug_hex_ascii_view); lgr_info_get(&lgr_info_last); debug_event(lgr_dbf, 1, &lgr_info_last, sizeof(lgr_info_last)); + timer_setup(&lgr_timer, lgr_timer_fn, TIMER_DEFERRABLE); lgr_timer_set(); return 0; } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index ed0bdd220e1a..d7ece9888c29 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -320,15 +320,14 @@ static void topology_flush_work(void) flush_work(&topology_work); } -static void topology_timer_fn(unsigned long ignored) +static void topology_timer_fn(struct timer_list *unused) { if (ptf(PTF_CHECK)) topology_schedule_update(); set_topology_timer(); } -static struct timer_list topology_timer = - TIMER_DEFERRED_INITIALIZER(topology_timer_fn, 0, 0); +static struct timer_list topology_timer; static atomic_t topology_poll = 
ATOMIC_INIT(0); @@ -597,6 +596,7 @@ static struct ctl_table topology_dir_table[] = { static int __init topology_init(void) { + timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE); if (MACHINE_HAS_TOPOLOGY) set_topology_timer(); else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..a5361fc6215d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5390,11 +5390,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } */ #ifdef CONFIG_WQ_WATCHDOG -static void wq_watchdog_timer_fn(unsigned long data); - static unsigned long wq_watchdog_thresh = 30; -static struct timer_list wq_watchdog_timer = - TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); +static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; @@ -5408,7 +5405,7 @@ static void wq_watchdog_reset_touched(void) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; } -static void wq_watchdog_timer_fn(unsigned long data) +static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; @@ -5510,6 +5507,7 @@ module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, static void wq_watchdog_init(void) { + timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); wq_watchdog_set_thresh(wq_watchdog_thresh); } From 51487d9ed1e386f9f0863bbf385e5da8a586bff8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:01 -0700 Subject: [PATCH 0266/1085] timer: Remove last user of TIMER_INITIALIZER Drops the last user of TIMER_INITIALIZER and adapts timer.h to use the internal version. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: Mark Gross Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: "James E.J. Bottomley" Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-8-git-send-email-keescook@chromium.org --- drivers/char/tlclk.c | 12 +++++------- include/linux/timer.h | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/char/tlclk.c b/drivers/char/tlclk.c index 6210bff46341..8eeb4190207d 100644 --- a/drivers/char/tlclk.c +++ b/drivers/char/tlclk.c @@ -184,9 +184,8 @@ static unsigned int telclk_interrupt; static int int_events; /* Event that generate a interrupt */ static int got_event; /* if events processing have been done */ -static void switchover_timeout(unsigned long data); -static struct timer_list switchover_timer = - TIMER_INITIALIZER(switchover_timeout , 0, 0); +static void switchover_timeout(struct timer_list *t); +static struct timer_list switchover_timer; static unsigned long tlclk_timer_data; static struct tlclk_alarms *alarm_events; @@ -805,7 +804,7 @@ static int __init tlclk_init(void) goto out3; } - init_timer(&switchover_timer); + timer_setup(&switchover_timer, switchover_timeout, 0); ret = misc_register(&tlclk_miscdev); if (ret < 0) { @@ -855,9 +854,9 @@ static void __exit tlclk_cleanup(void) } -static void switchover_timeout(unsigned long data) +static void switchover_timeout(struct timer_list *unused) { - unsigned long flags = *(unsigned long *) data; + unsigned long flags = tlclk_timer_data; if ((flags & 1)) { if ((inb(TLCLK_REG1) & 0x08) != (flags & 0x08)) @@ -922,7 +921,6 @@ static irqreturn_t tlclk_interrupt(int irq, void *dev_id) /* TIMEOUT in ~10ms */ switchover_timer.expires = jiffies + msecs_to_jiffies(10); tlclk_timer_data = inb(TLCLK_REG1); - switchover_timer.data = (unsigned long) &tlclk_timer_data; mod_timer(&switchover_timer, switchover_timer.expires); } else { got_event = 1; diff --git a/include/linux/timer.h b/include/linux/timer.h index 10cc45ca5803..4f7476e4a727 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -87,7 +87,7 @@ struct timer_list { #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ - TIMER_INITIALIZER(_function, _expires, _data) + __TIMER_INITIALIZER(_function, _expires, _data, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); From fca7ce5b7c6d3eda8c2e011cff98d22e0fbd5097 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:02 -0700 Subject: [PATCH 0267/1085] timer: Remove unused static initializer macros This removes the now unused TIMER_*INITIALIZER macros: TIMER_INITIALIZER TIMER_PINNED_INITIALIZER TIMER_DEFERRED_INITIALIZER TIMER_PINNED_DEFERRED_INITIALIZER Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. 
Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-9-git-send-email-keescook@chromium.org --- include/linux/timer.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 4f7476e4a727..a33220311361 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -73,18 +73,6 @@ struct timer_list { __FILE__ ":" __stringify(__LINE__)) \ } -#define TIMER_INITIALIZER(_function, _expires, _data) \ - __TIMER_INITIALIZER((_function), (_expires), (_data), 0) - -#define TIMER_PINNED_INITIALIZER(_function, _expires, _data) \ - __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED) - -#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) \ - __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE) - -#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data) \ - __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED) - #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ __TIMER_INITIALIZER(_function, _expires, _data, 0) From 1ff97897454b9a59edc7cf2cf2d95586b1e7a2cf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:03 -0700 Subject: [PATCH 0268/1085] timer: Remove users of expire and data arguments to DEFINE_TIMER The expire and data arguments of DEFINE_TIMER are only used in two places and are ignored by the code (malta-display.c only uses mod_timer(), never add_timer(), so the preset expires value is ignored). Set both sets of arguments to zero. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: Guenter Roeck # for watchdog parts Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Oleg Nesterov Cc: Wim Van Sebroeck Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: linux-scsi@vger.kernel.org Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Chris Metcalf Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Sebastian Reichel Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Kalle Valo Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-10-git-send-email-keescook@chromium.org --- arch/mips/mti-malta/malta-display.c | 6 +++--- drivers/watchdog/alim7101_wdt.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/mips/mti-malta/malta-display.c b/arch/mips/mti-malta/malta-display.c index d4f807191ecd..ac813158b9b8 100644 --- a/arch/mips/mti-malta/malta-display.c +++ b/arch/mips/mti-malta/malta-display.c @@ -36,10 +36,10 @@ void mips_display_message(const char *str) } } -static void scroll_display_message(unsigned long data); -static DEFINE_TIMER(mips_scroll_timer, scroll_display_message, HZ, 0); +static void scroll_display_message(unsigned long unused); +static DEFINE_TIMER(mips_scroll_timer, scroll_display_message, 0, 0); -static void scroll_display_message(unsigned long data) +static void scroll_display_message(unsigned long unused) { mips_display_message(&display_string[display_count++]); if (display_count == max_display_count) diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c index 665e0e7dfe1e..3c1f6ac68ea9 100644 --- a/drivers/watchdog/alim7101_wdt.c +++ b/drivers/watchdog/alim7101_wdt.c @@ -71,7 +71,7 @@ MODULE_PARM_DESC(use_gpio, "Use the gpio watchdog (required by old cobalt boards)."); static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 1); +static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; @@ -87,7 +87,7 @@ MODULE_PARM_DESC(nowayout, * Whack the dog */ -static void wdt_timer_ping(unsigned long data) +static void wdt_timer_ping(unsigned long unused) { /* If we got a heartbeat pulse within the WDT_US_INTERVAL * we agree to ping the WDT From 1d27e3e2252ba9d949ca82fbdb73cde102cb2067 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:04 -0700 Subject: [PATCH 0269/1085] timer: Remove expires and data arguments from DEFINE_TIMER Drop the arguments from the macro and adjust all callers with the following script: perl -pi -e 's/DEFINE_TIMER\((.*), 0, 0\);/DEFINE_TIMER($1);/g;' \ $(git grep DEFINE_TIMER | cut -d: -f1 | sort -u | grep -v timer.h) Signed-off-by: Kees Cook Acked-by: Geert Uytterhoeven # for m68k parts Acked-by: Guenter Roeck # for watchdog parts Acked-by: David S. Miller # for networking parts Acked-by: Greg Kroah-Hartman Acked-by: Kalle Valo # for wireless parts Acked-by: Arnd Bergmann Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-11-git-send-email-keescook@chromium.org Signed-off-by: Thomas Gleixner --- arch/arm/mach-ixp4xx/dsmg600-setup.c | 2 +- arch/arm/mach-ixp4xx/nas100d-setup.c | 2 +- arch/m68k/amiga/amisound.c | 2 +- arch/m68k/mac/macboing.c | 2 +- arch/mips/mti-malta/malta-display.c | 2 +- arch/parisc/kernel/pdc_cons.c | 2 +- arch/s390/mm/cmm.c | 2 +- drivers/atm/idt77105.c | 4 ++-- drivers/atm/iphase.c | 2 +- drivers/block/ataflop.c | 8 ++++---- drivers/char/dtlk.c | 2 +- drivers/char/hangcheck-timer.c | 2 +- drivers/char/nwbutton.c | 2 +- drivers/char/rtc.c | 2 +- drivers/input/touchscreen/s3c2410_ts.c | 2 +- drivers/net/cris/eth_v10.c | 6 +++--- drivers/net/hamradio/yam.c | 2 +- drivers/net/wireless/atmel/at76c50x-usb.c | 2 +- drivers/staging/speakup/main.c | 2 +- drivers/staging/speakup/synth.c | 2 +- drivers/tty/cyclades.c | 2 +- drivers/tty/isicom.c | 2 +- drivers/tty/moxa.c | 2 +- drivers/tty/rocket.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- drivers/tty/vt/vt.c | 2 +- drivers/watchdog/alim7101_wdt.c | 2 +- drivers/watchdog/machzwd.c | 2 +- drivers/watchdog/mixcomwd.c | 2 +- drivers/watchdog/sbc60xxwdt.c | 2 +- drivers/watchdog/sc520_wdt.c | 2 +- drivers/watchdog/via_wdt.c | 2 +- drivers/watchdog/w83877f_wdt.c | 2 +- drivers/xen/grant-table.c | 2 +- fs/pstore/platform.c | 2 +- include/linux/timer.h | 4 ++-- kernel/irq/spurious.c | 2 +- lib/random32.c | 2 +- net/atm/mpc.c | 2 +- net/decnet/dn_route.c | 2 +- net/ipv6/ip6_flowlabel.c | 2 +- net/netrom/nr_loopback.c | 2 +- security/keys/gc.c | 2 +- sound/oss/midibuf.c | 2 +- sound/oss/soundcard.c | 2 +- sound/oss/sys_timer.c | 2 +- sound/oss/uart6850.c | 2 +- 47 files changed, 54 insertions(+), 54 deletions(-) diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c index b3bd0e137f6d..b3689a141ec6 100644 --- a/arch/arm/mach-ixp4xx/dsmg600-setup.c +++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c @@ -174,7 +174,7 @@ static int power_button_countdown; #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */ static void dsmg600_power_handler(unsigned long data); -static DEFINE_TIMER(dsmg600_power_timer, dsmg600_power_handler, 0, 0); +static DEFINE_TIMER(dsmg600_power_timer, dsmg600_power_handler); static void dsmg600_power_handler(unsigned long data) { diff --git a/arch/arm/mach-ixp4xx/nas100d-setup.c b/arch/arm/mach-ixp4xx/nas100d-setup.c index 4e0f762bc651..562d05f9888e 100644 --- a/arch/arm/mach-ixp4xx/nas100d-setup.c +++ b/arch/arm/mach-ixp4xx/nas100d-setup.c @@ -197,7 +197,7 @@ static int power_button_countdown; #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */ static void nas100d_power_handler(unsigned long data); -static DEFINE_TIMER(nas100d_power_timer, nas100d_power_handler, 0, 0); +static DEFINE_TIMER(nas100d_power_timer, nas100d_power_handler); static void nas100d_power_handler(unsigned long data) { diff --git a/arch/m68k/amiga/amisound.c b/arch/m68k/amiga/amisound.c index 90a60d758f8b..a23f48181fd6 100644 --- a/arch/m68k/amiga/amisound.c +++ b/arch/m68k/amiga/amisound.c @@ -66,7 +66,7 @@ void __init amiga_init_sound(void) } static void nosound( unsigned long ignored ); -static DEFINE_TIMER(sound_timer, nosound, 0, 0); +static DEFINE_TIMER(sound_timer, nosound); void amiga_mksound( unsigned int hz, unsigned int ticks ) { diff --git a/arch/m68k/mac/macboing.c b/arch/m68k/mac/macboing.c index 
ffaa1f6439ae..9a52aff183d0 100644 --- a/arch/m68k/mac/macboing.c +++ b/arch/m68k/mac/macboing.c @@ -56,7 +56,7 @@ static void ( *mac_special_bell )( unsigned int, unsigned int, unsigned int ); /* * our timer to start/continue/stop the bell */ -static DEFINE_TIMER(mac_sound_timer, mac_nosound, 0, 0); +static DEFINE_TIMER(mac_sound_timer, mac_nosound); /* * Sort of initialize the sound chip (called from mac_mksound on the first diff --git a/arch/mips/mti-malta/malta-display.c b/arch/mips/mti-malta/malta-display.c index ac813158b9b8..063de44675ce 100644 --- a/arch/mips/mti-malta/malta-display.c +++ b/arch/mips/mti-malta/malta-display.c @@ -37,7 +37,7 @@ void mips_display_message(const char *str) } static void scroll_display_message(unsigned long unused); -static DEFINE_TIMER(mips_scroll_timer, scroll_display_message, 0, 0); +static DEFINE_TIMER(mips_scroll_timer, scroll_display_message); static void scroll_display_message(unsigned long unused) { diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c index 10a5ae9553fd..27a2dd616a7d 100644 --- a/arch/parisc/kernel/pdc_cons.c +++ b/arch/parisc/kernel/pdc_cons.c @@ -92,7 +92,7 @@ static int pdc_console_setup(struct console *co, char *options) #define PDC_CONS_POLL_DELAY (30 * HZ / 1000) static void pdc_console_poll(unsigned long unused); -static DEFINE_TIMER(pdc_console_timer, pdc_console_poll, 0, 0); +static DEFINE_TIMER(pdc_console_timer, pdc_console_poll); static struct tty_port tty_port; static int pdc_console_tty_open(struct tty_struct *tty, struct file *filp) diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 829c63dbc81a..2dbdcd85b68f 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -56,7 +56,7 @@ static DEFINE_SPINLOCK(cmm_lock); static struct task_struct *cmm_thread_ptr; static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait); -static DEFINE_TIMER(cmm_timer, NULL, 0, 0); +static DEFINE_TIMER(cmm_timer, NULL); static void cmm_timer_fn(unsigned long); static void cmm_set_timer(void); diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c index 082aa02abc57..57af9fd198e4 100644 --- a/drivers/atm/idt77105.c +++ b/drivers/atm/idt77105.c @@ -49,8 +49,8 @@ static void idt77105_stats_timer_func(unsigned long); static void idt77105_restart_timer_func(unsigned long); -static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func, 0, 0); -static DEFINE_TIMER(restart_timer, idt77105_restart_timer_func, 0, 0); +static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func); +static DEFINE_TIMER(restart_timer, idt77105_restart_timer_func); static int start_timer = 1; static struct idt77105_priv *idt77105_all = NULL; diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index fc72b763fdd7..ad6b582c268e 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -76,7 +76,7 @@ static IADEV *ia_dev[8]; static struct atm_dev *_ia_dev[8]; static int iadev_count; static void ia_led_timer(unsigned long arg); -static DEFINE_TIMER(ia_timer, ia_led_timer, 0, 0); +static DEFINE_TIMER(ia_timer, ia_led_timer); static int IA_TX_BUF = DFL_TX_BUFFERS, IA_TX_BUF_SZ = DFL_TX_BUF_SZ; static int IA_RX_BUF = DFL_RX_BUFFERS, IA_RX_BUF_SZ = DFL_RX_BUF_SZ; static uint IADebugFlag = /* IF_IADBG_ERR | IF_IADBG_CBR| IF_IADBG_INIT_ADAPTER diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 92da886180aa..ae596e55bcb6 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -373,10 +373,10 @@ static void floppy_release(struct gendisk *disk, fmode_t mode); /************************* End of Prototypes 
**************************/ -static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer, 0, 0); -static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0); -static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0); -static DEFINE_TIMER(fd_timer, check_change, 0, 0); +static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer); +static DEFINE_TIMER(readtrack_timer, fd_readtrack_check); +static DEFINE_TIMER(timeout_timer, fd_times_out); +static DEFINE_TIMER(fd_timer, check_change); static void fd_end_request_cur(blk_status_t err) { diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c index 58471394beb9..1a0385ed6417 100644 --- a/drivers/char/dtlk.c +++ b/drivers/char/dtlk.c @@ -84,7 +84,7 @@ static int dtlk_has_indexing; static unsigned int dtlk_portlist[] = {0x25e, 0x29e, 0x2de, 0x31e, 0x35e, 0x39e, 0}; static wait_queue_head_t dtlk_process_list; -static DEFINE_TIMER(dtlk_timer, dtlk_timer_tick, 0, 0); +static DEFINE_TIMER(dtlk_timer, dtlk_timer_tick); /* prototypes for file_operations struct */ static ssize_t dtlk_read(struct file *, char __user *, diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index 5406b90bf626..5b8db2ed844d 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -124,7 +124,7 @@ static unsigned long long hangcheck_tsc, hangcheck_tsc_margin; static void hangcheck_fire(unsigned long); -static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire, 0, 0); +static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); static void hangcheck_fire(unsigned long data) { diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c index e6d0d271c58c..44006ed9558f 100644 --- a/drivers/char/nwbutton.c +++ b/drivers/char/nwbutton.c @@ -27,7 +27,7 @@ static void button_sequence_finished (unsigned long parameters); static int button_press_count; /* The count of button presses */ /* Times for the end of a sequence */ -static DEFINE_TIMER(button_timer, button_sequence_finished, 0, 0); +static DEFINE_TIMER(button_timer, button_sequence_finished); static DECLARE_WAIT_QUEUE_HEAD(button_wait_queue); /* Used for blocking read */ static char button_output_buffer[32]; /* Stores data to write out of device */ static int bcount; /* The number of bytes in the buffer */ diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index 974d48927b07..616871e68e09 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -137,7 +137,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rtc_wait); #ifdef RTC_IRQ static void rtc_dropped_irq(unsigned long data); -static DEFINE_TIMER(rtc_irq_timer, rtc_dropped_irq, 0, 0); +static DEFINE_TIMER(rtc_irq_timer, rtc_dropped_irq); #endif static ssize_t rtc_read(struct file *file, char __user *buf, diff --git a/drivers/input/touchscreen/s3c2410_ts.c b/drivers/input/touchscreen/s3c2410_ts.c index 3b3db8c868e0..d3265b6b58b8 100644 --- a/drivers/input/touchscreen/s3c2410_ts.c +++ b/drivers/input/touchscreen/s3c2410_ts.c @@ -145,7 +145,7 @@ static void touch_timer_fire(unsigned long data) } } -static DEFINE_TIMER(touch_timer, touch_timer_fire, 0, 0); +static DEFINE_TIMER(touch_timer, touch_timer_fire); /** * stylus_irq - touchscreen stylus event interrupt diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c index 017f48cdcab9..1fcc86fa4e05 100644 --- a/drivers/net/cris/eth_v10.c +++ b/drivers/net/cris/eth_v10.c @@ -165,8 +165,8 @@ static unsigned int network_rec_config_shadow = 0; static unsigned int network_tr_ctrl_shadow = 0; /* Network speed indication. 
*/ -static DEFINE_TIMER(speed_timer, NULL, 0, 0); -static DEFINE_TIMER(clear_led_timer, NULL, 0, 0); +static DEFINE_TIMER(speed_timer, NULL); +static DEFINE_TIMER(clear_led_timer, NULL); static int current_speed; /* Speed read from transceiver */ static int current_speed_selection; /* Speed selected by user */ static unsigned long led_next_time; @@ -174,7 +174,7 @@ static int led_active; static int rx_queue_len; /* Duplex */ -static DEFINE_TIMER(duplex_timer, NULL, 0, 0); +static DEFINE_TIMER(duplex_timer, NULL); static int full_duplex; static enum duplex current_duplex; diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 7a7c5224a336..104f71fa9c5e 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -157,7 +157,7 @@ static struct net_device *yam_devs[NR_PORTS]; static struct yam_mcs *yam_data; -static DEFINE_TIMER(yam_timer, NULL, 0, 0); +static DEFINE_TIMER(yam_timer, NULL); /* --------------------------------------------------------------------- */ diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c index 94bf01f8b2a8..ede89d4ffc88 100644 --- a/drivers/net/wireless/atmel/at76c50x-usb.c +++ b/drivers/net/wireless/atmel/at76c50x-usb.c @@ -519,7 +519,7 @@ exit: /* LED trigger */ static int tx_activity; static void at76_ledtrig_tx_timerfunc(unsigned long data); -static DEFINE_TIMER(ledtrig_tx_timer, at76_ledtrig_tx_timerfunc, 0, 0); +static DEFINE_TIMER(ledtrig_tx_timer, at76_ledtrig_tx_timerfunc); DEFINE_LED_TRIGGER(ledtrig_tx); static void at76_ledtrig_tx_timerfunc(unsigned long data) diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c index 56f7be6af1f6..585925bb49a4 100644 --- a/drivers/staging/speakup/main.c +++ b/drivers/staging/speakup/main.c @@ -1165,7 +1165,7 @@ static const int NUM_CTL_LABELS = (MSG_CTL_END - MSG_CTL_START + 1); static void read_all_doc(struct vc_data *vc); static void cursor_done(u_long data); -static DEFINE_TIMER(cursor_timer, cursor_done, 0, 0); +static DEFINE_TIMER(cursor_timer, cursor_done); static void do_handle_shift(struct vc_data *vc, u_char value, char up_flag) { diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c index a1ca68c76579..6ddd3fc3f08d 100644 --- a/drivers/staging/speakup/synth.c +++ b/drivers/staging/speakup/synth.c @@ -158,7 +158,7 @@ static void thread_wake_up(u_long data) wake_up_interruptible_all(&speakup_event); } -static DEFINE_TIMER(thread_timer, thread_wake_up, 0, 0); +static DEFINE_TIMER(thread_timer, thread_wake_up); void synth_start(void) { diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index d272bc4e7fb5..dac8a1a8e4ac 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -283,7 +283,7 @@ static void cyz_poll(unsigned long); /* The Cyclades-Z polling cycle is defined by this variable */ static long cyz_polling_cycle = CZ_DEF_POLL; -static DEFINE_TIMER(cyz_timerlist, cyz_poll, 0, 0); +static DEFINE_TIMER(cyz_timerlist, cyz_poll); #else /* CONFIG_CYZ_INTR */ static void cyz_rx_restart(unsigned long); diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c index 61ecdd6b2fc2..40af32108ff5 100644 --- a/drivers/tty/isicom.c +++ b/drivers/tty/isicom.c @@ -177,7 +177,7 @@ static struct tty_driver *isicom_normal; static void isicom_tx(unsigned long _data); static void isicom_start(struct tty_struct *tty); -static DEFINE_TIMER(tx, isicom_tx, 0, 0); +static DEFINE_TIMER(tx, isicom_tx); /* baud index mappings from linux defns to isi */ diff --git a/drivers/tty/moxa.c 
b/drivers/tty/moxa.c index 7f3d4cb0341b..93d37655d928 100644 --- a/drivers/tty/moxa.c +++ b/drivers/tty/moxa.c @@ -428,7 +428,7 @@ static const struct tty_port_operations moxa_port_ops = { }; static struct tty_driver *moxaDriver; -static DEFINE_TIMER(moxaTimer, moxa_poll, 0, 0); +static DEFINE_TIMER(moxaTimer, moxa_poll); /* * HW init diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index 20d79a6007d5..aa695fda1084 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -111,7 +111,7 @@ static struct r_port *rp_table[MAX_RP_PORTS]; /* The main repository of static unsigned int xmit_flags[NUM_BOARDS]; /* Bit significant, indicates port had data to transmit. */ /* eg. Bit 0 indicates port 0 has xmit data, ... */ static atomic_t rp_num_ports_open; /* Number of serial ports open */ -static DEFINE_TIMER(rocket_timer, rp_do_poll, 0, 0); +static DEFINE_TIMER(rocket_timer, rp_do_poll); static unsigned long board1; /* ISA addresses, retrieved from rocketport.conf */ static unsigned long board2; diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index f4166263bb3a..f974d6340d04 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -250,7 +250,7 @@ static void kd_nosound(unsigned long ignored) input_handler_for_each_handle(&kbd_handler, &zero, kd_sound_helper); } -static DEFINE_TIMER(kd_mksound_timer, kd_nosound, 0, 0); +static DEFINE_TIMER(kd_mksound_timer, kd_nosound); void kd_mksound(unsigned int hz, unsigned int ticks) { diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 2ebaba16f785..602d71630952 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -228,7 +228,7 @@ static int scrollback_delta; */ int (*console_blank_hook)(int); -static DEFINE_TIMER(console_timer, blank_screen_t, 0, 0); +static DEFINE_TIMER(console_timer, blank_screen_t); static int blank_state; static int blank_timer_expired; enum { diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c index 3c1f6ac68ea9..18e896eeca62 100644 --- a/drivers/watchdog/alim7101_wdt.c +++ b/drivers/watchdog/alim7101_wdt.c @@ -71,7 +71,7 @@ MODULE_PARM_DESC(use_gpio, "Use the gpio watchdog (required by old cobalt boards)."); static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_ping); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c index 9826b59ef734..8a616a57bb90 100644 --- a/drivers/watchdog/machzwd.c +++ b/drivers/watchdog/machzwd.c @@ -127,7 +127,7 @@ static int zf_action = GEN_RESET; static unsigned long zf_is_open; static char zf_expect_close; static DEFINE_SPINLOCK(zf_port_lock); -static DEFINE_TIMER(zf_timer, zf_ping, 0, 0); +static DEFINE_TIMER(zf_timer, zf_ping); static unsigned long next_heartbeat; diff --git a/drivers/watchdog/mixcomwd.c b/drivers/watchdog/mixcomwd.c index be86ea359eee..c9e38096ea91 100644 --- a/drivers/watchdog/mixcomwd.c +++ b/drivers/watchdog/mixcomwd.c @@ -105,7 +105,7 @@ static unsigned long mixcomwd_opened; /* long req'd for setbit --RR */ static int watchdog_port; static int mixcomwd_timer_alive; -static DEFINE_TIMER(mixcomwd_timer, mixcomwd_timerfun, 0, 0); +static DEFINE_TIMER(mixcomwd_timer, mixcomwd_timerfun); static char expect_close; static bool nowayout = WATCHDOG_NOWAYOUT; diff --git a/drivers/watchdog/sbc60xxwdt.c b/drivers/watchdog/sbc60xxwdt.c index 2eef58a0cf05..8d589939bc84 100644 --- 
a/drivers/watchdog/sbc60xxwdt.c +++ b/drivers/watchdog/sbc60xxwdt.c @@ -113,7 +113,7 @@ MODULE_PARM_DESC(nowayout, __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_ping); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; diff --git a/drivers/watchdog/sc520_wdt.c b/drivers/watchdog/sc520_wdt.c index 1cfd3f6a13d5..3e9bbaa37bf4 100644 --- a/drivers/watchdog/sc520_wdt.c +++ b/drivers/watchdog/sc520_wdt.c @@ -124,7 +124,7 @@ MODULE_PARM_DESC(nowayout, static __u16 __iomem *wdtmrctl; static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_ping); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; diff --git a/drivers/watchdog/via_wdt.c b/drivers/watchdog/via_wdt.c index 5f9cbc37520d..ad3c3be13b40 100644 --- a/drivers/watchdog/via_wdt.c +++ b/drivers/watchdog/via_wdt.c @@ -68,7 +68,7 @@ static struct resource wdt_res; static void __iomem *wdt_mem; static unsigned int mmio; static void wdt_timer_tick(unsigned long data); -static DEFINE_TIMER(timer, wdt_timer_tick, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_tick); /* The timer that pings the watchdog */ static unsigned long next_heartbeat; /* the next_heartbeat for the timer */ diff --git a/drivers/watchdog/w83877f_wdt.c b/drivers/watchdog/w83877f_wdt.c index f0483c75ed32..ba6b680af100 100644 --- a/drivers/watchdog/w83877f_wdt.c +++ b/drivers/watchdog/w83877f_wdt.c @@ -98,7 +98,7 @@ MODULE_PARM_DESC(nowayout, __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_ping); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 2c6a9114d332..a8721d718186 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -305,7 +305,7 @@ struct deferred_entry { }; static LIST_HEAD(deferred_list); static void gnttab_handle_deferred(unsigned long); -static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred); static void gnttab_handle_deferred(unsigned long unused) { diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 2b21d180157c..ec7199e859d2 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -62,7 +62,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " static int pstore_new_entry; static void pstore_timefunc(unsigned long); -static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); +static DEFINE_TIMER(pstore_timer, pstore_timefunc); static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); diff --git a/include/linux/timer.h b/include/linux/timer.h index a33220311361..91e5a2cc81b5 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -73,9 +73,9 @@ struct timer_list { __FILE__ ":" __stringify(__LINE__)) \ } -#define DEFINE_TIMER(_name, _function, _expires, _data) \ +#define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ - __TIMER_INITIALIZER(_function, _expires, _data, 0) + __TIMER_INITIALIZER(_function, 0, 0, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); diff 
--git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 061ba7eed4ed..c805e8691c22 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -20,7 +20,7 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); -static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); static int irq_poll_cpu; static atomic_t irq_poll_active; diff --git a/lib/random32.c b/lib/random32.c index fa594b1140e6..6e91b75c113f 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -214,7 +214,7 @@ core_initcall(prandom_init); static void __prandom_timer(unsigned long dontcare); -static DEFINE_TIMER(seed_timer, __prandom_timer, 0, 0); +static DEFINE_TIMER(seed_timer, __prandom_timer); static void __prandom_timer(unsigned long dontcare) { diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 5677147209e8..63138c8c2269 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -121,7 +121,7 @@ static struct notifier_block mpoa_notifier = { struct mpoa_client *mpcs = NULL; /* FIXME */ static struct atm_mpoa_qos *qos_head = NULL; -static DEFINE_TIMER(mpc_timer, NULL, 0, 0); +static DEFINE_TIMER(mpc_timer, NULL); static struct mpoa_client *find_mpc_by_itfnum(int itf) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 0bd3afd01dd2..6538632fbd03 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -131,7 +131,7 @@ static struct dn_rt_hash_bucket *dn_rt_hash_table; static unsigned int dn_rt_hash_mask; static struct timer_list dn_route_timer; -static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0); +static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush); int decnet_dst_gc_interval = 2; static struct dst_ops dn_dst_ops = { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 8081bafe441b..b39d0908be2e 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -47,7 +47,7 @@ static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(unsigned long dummy); -static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); +static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index 94d4e922af53..989ae647825e 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -18,7 +18,7 @@ static void nr_loopback_timer(unsigned long); static struct sk_buff_head loopback_queue; -static DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0); +static DEFINE_TIMER(loopback_timer, nr_loopback_timer); void __init nr_loopback_init(void) { diff --git a/security/keys/gc.c b/security/keys/gc.c index 87cb260e4890..8673f7f58ead 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -30,7 +30,7 @@ DECLARE_WORK(key_gc_work, key_garbage_collector); * Reaper for links from keyrings to dead keys. 
*/ static void key_gc_timer_func(unsigned long); -static DEFINE_TIMER(key_gc_timer, key_gc_timer_func, 0, 0); +static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); static time_t key_gc_next_run = LONG_MAX; static struct key_type *key_gc_dead_keytype; diff --git a/sound/oss/midibuf.c b/sound/oss/midibuf.c index 701c7625c971..1277df815d5b 100644 --- a/sound/oss/midibuf.c +++ b/sound/oss/midibuf.c @@ -52,7 +52,7 @@ static struct midi_parms parms[MAX_MIDI_DEV]; static void midi_poll(unsigned long dummy); -static DEFINE_TIMER(poll_timer, midi_poll, 0, 0); +static DEFINE_TIMER(poll_timer, midi_poll); static volatile int open_devs; static DEFINE_SPINLOCK(lock); diff --git a/sound/oss/soundcard.c b/sound/oss/soundcard.c index b70c7c8f9c5d..4391062e5cfd 100644 --- a/sound/oss/soundcard.c +++ b/sound/oss/soundcard.c @@ -662,7 +662,7 @@ static void do_sequencer_timer(unsigned long dummy) } -static DEFINE_TIMER(seq_timer, do_sequencer_timer, 0, 0); +static DEFINE_TIMER(seq_timer, do_sequencer_timer); void request_sound_timer(int count) { diff --git a/sound/oss/sys_timer.c b/sound/oss/sys_timer.c index d17019d25b99..8a4b5625dba6 100644 --- a/sound/oss/sys_timer.c +++ b/sound/oss/sys_timer.c @@ -28,7 +28,7 @@ static unsigned long prev_event_time; static void poll_def_tmr(unsigned long dummy); static DEFINE_SPINLOCK(lock); -static DEFINE_TIMER(def_tmr, poll_def_tmr, 0, 0); +static DEFINE_TIMER(def_tmr, poll_def_tmr); static unsigned long tmr2ticks(int tmr_value) diff --git a/sound/oss/uart6850.c b/sound/oss/uart6850.c index eda32d7eddbd..a9d3f7568525 100644 --- a/sound/oss/uart6850.c +++ b/sound/oss/uart6850.c @@ -78,7 +78,7 @@ static void (*midi_input_intr) (int dev, unsigned char data); static void poll_uart6850(unsigned long dummy); -static DEFINE_TIMER(uart6850_timer, poll_uart6850, 0, 0); +static DEFINE_TIMER(uart6850_timer, poll_uart6850); static void uart6850_input_loop(void) { From 8ede369b2cccfa585e2969bbed18edc0e2a18c50 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:05 -0700 Subject: [PATCH 0270/1085] timer: Remove expires argument from __TIMER_INITIALIZER() The expires field is normally initialized during the first mod_timer() call. It was unused by all callers, so remove it from the macro. Signed-off-by: Kees Cook Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-12-git-send-email-keescook@chromium.org Signed-off-by: Thomas Gleixner --- include/linux/kthread.h | 2 +- include/linux/timer.h | 5 ++--- include/linux/workqueue.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 82e197eeac91..0d622b350d3f 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -117,7 +117,7 @@ struct kthread_delayed_work { #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn, \ - 0, (unsigned long)&(dwork), \ + (unsigned long)&(dwork), \ TIMER_IRQSAFE), \ } diff --git a/include/linux/timer.h b/include/linux/timer.h index 91e5a2cc81b5..10685c33e679 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -63,10 +63,9 @@ struct timer_list { #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) -#define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ +#define __TIMER_INITIALIZER(_function, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ - .expires = (_expires), \ .data = (_data), \ .flags = (_flags), \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ @@ -75,7 +74,7 @@ struct timer_list { #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ - __TIMER_INITIALIZER(_function, 0, 0, 0) + __TIMER_INITIALIZER(_function, 0, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1c49431f3121..f4960260feaf 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -176,7 +176,7 @@ struct execute_work { #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn, \ - 0, (unsigned long)&(n), \ + (unsigned long)&(n), \ (tflags) | TIMER_IRQSAFE), \ } From fe5c3b69b540e3387223a696f327c1bb8880d1ac Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:06 -0700 Subject: [PATCH 0271/1085] kthread: Convert callback to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch kthread to use from_timer() and pass the timer pointer explicitly. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Petr Mladek Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-13-git-send-email-keescook@chromium.org --- include/linux/kthread.h | 10 +++++----- kernel/kthread.c | 10 ++++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0d622b350d3f..35cbe3b0ce5b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -75,7 +75,7 @@ extern int tsk_fork_get_node(struct task_struct *tsk); */ struct kthread_work; typedef void (*kthread_work_func_t)(struct kthread_work *work); -void kthread_delayed_work_timer_fn(unsigned long __data); +void kthread_delayed_work_timer_fn(struct timer_list *t); enum { KTW_FREEZABLE = 1 << 0, /* freeze during suspend */ @@ -116,8 +116,8 @@ struct kthread_delayed_work { #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ - .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn, \ - (unsigned long)&(dwork), \ + .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\ + (TIMER_DATA_TYPE)&(dwork.timer), \ TIMER_IRQSAFE), \ } @@ -164,8 +164,8 @@ extern void __kthread_init_worker(struct kthread_worker *worker, do { \ kthread_init_work(&(dwork)->work, (fn)); \ __setup_timer(&(dwork)->timer, \ - kthread_delayed_work_timer_fn, \ - (unsigned long)(dwork), \ + (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\ + (TIMER_DATA_TYPE)&(dwork)->timer, \ TIMER_IRQSAFE); \ } while (0) diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c19edf82427..ba3992c8c375 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -798,15 +798,14 @@ EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. - * @__data: pointer to the data associated with the timer + * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ -void kthread_delayed_work_timer_fn(unsigned long __data) +void kthread_delayed_work_timer_fn(struct timer_list *t) { - struct kthread_delayed_work *dwork = - (struct kthread_delayed_work *)__data; + struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; @@ -837,8 +836,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for From 8c20feb60604d91a29cd7fef8ac758bd92d9fd2c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:07 -0700 Subject: [PATCH 0272/1085] workqueue: Convert callback to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch workqueue to use from_timer() and pass the timer pointer explicitly. 
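For readers following the series, the conversion pattern these kthread/workqueue patches apply can be reduced to a minimal before/after sketch; struct my_dev, its "timer" member and my_timeout() are hypothetical names, not taken from the patches:

    /* Old scheme: the callback receives an opaque data value. */
    static void my_timeout(unsigned long data)
    {
            struct my_dev *dev = (struct my_dev *)data;
            /* ... handle the expiry ... */
    }

    /* New scheme: the callback receives the timer pointer and recovers
     * the enclosing structure with from_timer(), a container_of()
     * wrapper keyed on the named timer member.
     */
    static void my_timeout(struct timer_list *t)
    {
            struct my_dev *dev = from_timer(dev, t, timer);
            /* ... handle the expiry ... */
    }

The (TIMER_FUNC_TYPE) and (TIMER_DATA_TYPE) casts seen in these diffs are transitional glue that lets old- and new-style callbacks coexist until every user is converted.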
Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-14-git-send-email-keescook@chromium.org --- include/linux/workqueue.h | 15 ++++++++------- kernel/workqueue.c | 7 +++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f4960260feaf..f3c47a05fd06 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -17,7 +17,7 @@ struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); -void delayed_work_timer_fn(unsigned long __data); +void delayed_work_timer_fn(struct timer_list *t); /* * The first word is the work queue pointer and the flags rolled into @@ -175,8 +175,8 @@ struct execute_work { #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ - .timer = __TIMER_INITIALIZER(delayed_work_timer_fn, \ - (unsigned long)&(n), \ + .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)delayed_work_timer_fn,\ + (TIMER_DATA_TYPE)&(n.timer), \ (tflags) | TIMER_IRQSAFE), \ } @@ -241,8 +241,9 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ - __setup_timer(&(_work)->timer, delayed_work_timer_fn, \ - (unsigned long)(_work), \ + __setup_timer(&(_work)->timer, \ + (TIMER_FUNC_TYPE)delayed_work_timer_fn, \ + (TIMER_DATA_TYPE)&(_work)->timer, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) @@ -250,8 +251,8 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __setup_timer_on_stack(&(_work)->timer, \ - delayed_work_timer_fn, \ - (unsigned long)(_work), \ + (TIMER_FUNC_TYPE)delayed_work_timer_fn,\ + (TIMER_DATA_TYPE)&(_work)->timer,\ (_tflags) | TIMER_IRQSAFE); \ } while (0) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a5361fc6215d..c77fdf6bf24f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1492,9 +1492,9 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); -void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(struct timer_list *t) { - struct delayed_work *dwork = (struct delayed_work *)__data; + struct delayed_work *dwork = from_timer(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); @@ -1508,8 +1508,7 @@ static void __queue_delayed_work(int cpu, struct 
workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); From 6ac35264513e43634f127a92057f04a4b3c5cdbb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 5 Oct 2017 10:10:35 -0700 Subject: [PATCH 0273/1085] timer: Fix two mistakes in callback conversions Two errors found their way into the timer callback conversions that weren't noticed with x86 allmodconfig. Reported-by: kernel test robot Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171005171035.GA34831@beast --- drivers/hsi/clients/ssi_protocol.c | 2 +- drivers/s390/net/lcs.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c index 67af03d3aeb3..9b167bc6eee4 100644 --- a/drivers/hsi/clients/ssi_protocol.c +++ b/drivers/hsi/clients/ssi_protocol.c @@ -499,7 +499,7 @@ static void ssip_rx_wd(struct timer_list *t) ssip_error(cl); } -static void ssip_tx_wd(unsigned long data) +static void ssip_tx_wd(struct timer_list *t) { struct ssi_protocol *ssi = from_timer(ssi, t, tx_wd); struct hsi_client *cl = ssi->cl; diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index 21bba406d5be..b855c6f08e96 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -841,7 +841,6 @@ lcs_lancmd_timeout(struct timer_list *t) unsigned long flags; LCS_DBF_TEXT(4, trace, "timeout"); - reply = (struct lcs_reply *) data; spin_lock_irqsave(&reply->card->lock, flags); list_for_each_entry_safe(list_reply, r, &reply->card->lancmd_waiters,list) { From e83f374247c310bc558c8626fbfcc03f22f9bf02 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 5 Oct 2017 22:39:37 +0200 Subject: [PATCH 0274/1085] spi: spreadtrum adi: add hwspinlock dependency With CONFIG_HWSPINLOCK=m, the new driver fails to link as a built-in driver: drivers/spi/spi-sprd-adi.o: In function `sprd_adi_remove': spi-sprd-adi.c:(.text+0x18): undefined reference to `hwspin_lock_free' drivers/spi/spi-sprd-adi.o: In function `sprd_adi_probe': spi-sprd-adi.c:(.text+0xfc): undefined reference to `of_hwspin_lock_get_id' spi-sprd-adi.c:(.text+0x108): undefined reference to `hwspin_lock_request_specific' spi-sprd-adi.c:(.text+0x268): undefined reference to `hwspin_lock_free' This adds a hard Kconfig dependency on HWSPINLOCK for the !COMPILE_TEST case, and allows compile-testing with HWSPINLOCK completely disabled, which will then rely on the existing stub API. Fixes: 7e2903cb91df ("spi: Add ADI driver for Spreadtrum platform") Signed-off-by: Arnd Bergmann Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 0c38a5bfcd74..2c96f744352b 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -629,6 +629,7 @@ config SPI_SIRF config SPI_SPRD_ADI tristate "Spreadtrum ADI controller" depends on ARCH_SPRD || COMPILE_TEST + depends on HWSPINLOCK || (COMPILE_TEST && !HWSPINLOCK) help ADI driver based on SPI for Spreadtrum SoCs. 
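The "|| (COMPILE_TEST && !HWSPINLOCK)" leg works because a disabled hwspinlock subsystem exposes inline stubs instead of extern declarations, so nothing is left for the linker to resolve. A simplified sketch of that header idiom (not the literal contents of linux/hwspinlock.h):

    #ifdef CONFIG_HWSPINLOCK
    struct hwspinlock *hwspin_lock_request_specific(unsigned int id);
    #else
    static inline struct hwspinlock *
    hwspin_lock_request_specific(unsigned int id)
    {
            return NULL;    /* stub: compiles fine, needs no symbol */
    }
    #endif

The broken combination is HWSPINLOCK=m with the driver built in: the declarations are extern, but the symbols live in a module, hence the link errors quoted above.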
From 13277782dd4bcef71341a1016bee1d09f8f95ae0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 6 Oct 2017 11:10:38 +0200 Subject: [PATCH 0275/1085] Documentation: add Kernel Driver Statement to the kernel Way back in 2008 we didn't have "robust" in-kernel documentation system, so the idea of putting something like the kernel driver statement in the kernel tree wasn't even imagined. But now that has changed, so add the old document to the kernel source itself to allow for us to properly reference it in one canonical place (as the LF wiki keeps moving things around.) This also will allow people to add their names to it, as I seem to have lost the ability to do that by not knowing how to edit things on the original document. Signed-off-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/process/index.rst | 1 + .../process/kernel-driver-statement.rst | 199 ++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 Documentation/process/kernel-driver-statement.rst diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst index 82fc399fcd33..68454319f006 100644 --- a/Documentation/process/index.rst +++ b/Documentation/process/index.rst @@ -25,6 +25,7 @@ Below are the essential guides that every developer should read. submitting-patches coding-style email-clients + kernel-driver-statement Other guides to the community that are of interest to most developers are: diff --git a/Documentation/process/kernel-driver-statement.rst b/Documentation/process/kernel-driver-statement.rst new file mode 100644 index 000000000000..60d9d868f300 --- /dev/null +++ b/Documentation/process/kernel-driver-statement.rst @@ -0,0 +1,199 @@ +Kernel Driver Statement +----------------------- + +Position Statement on Linux Kernel Modules +========================================== + + +We, the undersigned Linux kernel developers, consider any closed-source +Linux kernel module or driver to be harmful and undesirable. We have +repeatedly found them to be detrimental to Linux users, businesses, and +the greater Linux ecosystem. Such modules negate the openness, +stability, flexibility, and maintainability of the Linux development +model and shut their users off from the expertise of the Linux +community. Vendors that provide closed-source kernel modules force their +customers to give up key Linux advantages or choose new vendors. +Therefore, in order to take full advantage of the cost savings and +shared support benefits open source has to offer, we urge vendors to +adopt a policy of supporting their customers on Linux with open-source +kernel code. + +We speak only for ourselves, and not for any company we might work for +today, have in the past, or will in the future. + + - Dave Airlie + - Nick Andrew + - Jens Axboe + - Ralf Baechle + - Felipe Balbi + - Ohad Ben-Cohen + - Muli Ben-Yehuda + - Jiri Benc + - Arnd Bergmann + - Thomas Bogendoerfer + - Vitaly Bordug + - James Bottomley + - Josh Boyer + - Neil Brown + - Mark Brown + - David Brownell + - Michael Buesch + - Franck Bui-Huu + - Adrian Bunk + - François Cami + - Ralph Campbell + - Luiz Fernando N. Capitulino + - Mauro Carvalho Chehab + - Denis Cheng + - Jonathan Corbet + - Glauber Costa + - Alan Cox + - Magnus Damm + - Ahmed S. Darwish + - Robert P. J. 
Day + - Hans de Goede + - Arnaldo Carvalho de Melo + - Helge Deller + - Jean Delvare + - Mathieu Desnoyers + - Sven-Thorsten Dietrich + - Alexey Dobriyan + - Daniel Drake + - Alex Dubov + - Randy Dunlap + - Michael Ellerman + - Pekka Enberg + - Jan Engelhardt + - Mark Fasheh + - J. Bruce Fields + - Larry Finger + - Jeremy Fitzhardinge + - Mike Frysinger + - Kumar Gala + - Robin Getz + - Liam Girdwood + - Jan-Benedict Glaw + - Thomas Gleixner + - Brice Goglin + - Cyrill Gorcunov + - Andy Gospodarek + - Thomas Graf + - Krzysztof Halasa + - Harvey Harrison + - Stephen Hemminger + - Michael Hennerich + - Tejun Heo + - Benjamin Herrenschmidt + - Kristian Høgsberg + - Henrique de Moraes Holschuh + - Marcel Holtmann + - Mike Isely + - Takashi Iwai + - Olof Johansson + - Dave Jones + - Jesper Juhl + - Matthias Kaehlcke + - Kenji Kaneshige + - Jan Kara + - Jeremy Kerr + - Russell King + - Olaf Kirch + - Roel Kluin + - Hans-Jürgen Koch + - Auke Kok + - Peter Korsgaard + - Jiri Kosina + - Mariusz Kozlowski + - Greg Kroah-Hartman + - Michael Krufky + - Aneesh Kumar + - Clemens Ladisch + - Christoph Lameter + - Gunnar Larisch + - Anders Larsen + - Grant Likely + - John W. Linville + - Yinghai Lu + - Tony Luck + - Pavel Machek + - Matt Mackall + - Paul Mackerras + - Roland McGrath + - Patrick McHardy + - Kyle McMartin + - Paul Menage + - Thierry Merle + - Eric Miao + - Akinobu Mita + - Ingo Molnar + - James Morris + - Andrew Morton + - Paul Mundt + - Oleg Nesterov + - Luca Olivetti + - S.Çağlar Onur + - Pierre Ossman + - Keith Owens + - Venkatesh Pallipadi + - Nick Piggin + - Nicolas Pitre + - Evgeniy Polyakov + - Richard Purdie + - Mike Rapoport + - Sam Ravnborg + - Gerrit Renker + - Stefan Richter + - David Rientjes + - Luis R. Rodriguez + - Stefan Roese + - Francois Romieu + - Rami Rosen + - Stephen Rothwell + - Maciej W. Rozycki + - Mark Salyzyn + - Yoshinori Sato + - Deepak Saxena + - Holger Schurig + - Amit Shah + - Yoshihiro Shimoda + - Sergei Shtylyov + - Kay Sievers + - Sebastian Siewior + - Rik Snel + - Jes Sorensen + - Alexey Starikovskiy + - Alan Stern + - Timur Tabi + - Hirokazu Takata + - Eliezer Tamir + - Eugene Teo + - Doug Thompson + - FUJITA Tomonori + - Dmitry Torokhov + - Marcelo Tosatti + - Steven Toth + - Theodore Tso + - Matthias Urlichs + - Geert Uytterhoeven + - Arjan van de Ven + - Ivo van Doorn + - Rik van Riel + - Wim Van Sebroeck + - Hans Verkuil + - Horst H. von Brand + - Dmitri Vorobiev + - Anton Vorontsov + - Daniel Walker + - Johannes Weiner + - Harald Welte + - Matthew Wilcox + - Dan J. Williams + - Darrick J. Wong + - David Woodhouse + - Chris Wright + - Bryan Wu + - Rafael J. Wysocki + - Herbert Xu + - Vlad Yasevich + - Peter Zijlstra + - Bartlomiej Zolnierkiewicz From a1c4d24e02d093efd84cbde417051d2e767fa8fa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Sep 2017 08:43:38 -0700 Subject: [PATCH 0276/1085] linux/log2.h: fix kernel-doc notation Fix kernel-doc: - Add kernel-doc notation to some functions. - Fix kernel-doc notation in function parameters. 
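For context, the kernel-doc form the file is converted to looks as follows; my_func() is a placeholder:

    /**
     * my_func() - short one-line description
     * @n: parameter description (note "@n:", not "@n - ...")
     *
     * Optional longer description.
     *
     * Return: description of the return value.
     */

Comments opened with a plain /* are ignored by the kernel-doc tooling, which is why the openers above change to /** and the "@n - parameter" lines gain colons.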
Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- include/linux/log2.h | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/include/linux/log2.h b/include/linux/log2.h index c373295f359f..41a1ae010993 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -37,19 +37,23 @@ int __ilog2_u64(u64 n) } #endif -/* - * Determine whether some value is a power of two, where zero is +/** + * is_power_of_2() - check if a value is a power of two + * @n: the value to check + * + * Determine whether some value is a power of two, where zero is * *not* considered a power of two. + * Return: true if @n is a power of 2, otherwise false. */ - static inline __attribute__((const)) bool is_power_of_2(unsigned long n) { return (n != 0 && ((n & (n - 1)) == 0)); } -/* - * round up to nearest power of two +/** + * __roundup_pow_of_two() - round up to nearest power of two + * @n: value to round up */ static inline __attribute__((const)) unsigned long __roundup_pow_of_two(unsigned long n) @@ -57,8 +61,9 @@ unsigned long __roundup_pow_of_two(unsigned long n) return 1UL << fls_long(n - 1); } -/* - * round down to nearest power of two +/** + * __rounddown_pow_of_two() - round down to nearest power of two + * @n: value to round down */ static inline __attribute__((const)) unsigned long __rounddown_pow_of_two(unsigned long n) @@ -67,12 +72,12 @@ unsigned long __rounddown_pow_of_two(unsigned long n) } /** - * ilog2 - log of base 2 of 32-bit or a 64-bit unsigned value - * @n - parameter + * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value + * @n: parameter * * constant-capable log of base 2 calculation * - this can be used to initialise global variables from constant data, hence - * the massive ternary operator construction + * the massive ternary operator construction * * selects the appropriately-sized optimised version depending on sizeof(n) */ @@ -150,7 +155,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n) /** * roundup_pow_of_two - round the given value up to nearest power of two - * @n - parameter + * @n: parameter * * round the given value up to the nearest power of two * - the result is undefined when n == 0 @@ -167,7 +172,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n) /** * rounddown_pow_of_two - round the given value down to nearest power of two - * @n - parameter + * @n: parameter * * round the given value down to the nearest power of two * - the result is undefined when n == 0 @@ -180,6 +185,12 @@ unsigned long __rounddown_pow_of_two(unsigned long n) __rounddown_pow_of_two(n) \ ) +static inline __attribute_const__ +int __order_base_2(unsigned long n) +{ + return n > 1 ? ilog2(n - 1) + 1 : 0; +} + /** * order_base_2 - calculate the (rounded up) base 2 order of the argument * @n: parameter @@ -193,13 +204,6 @@ unsigned long __rounddown_pow_of_two(unsigned long n) * ob2(5) = 3 * ... and so on. */ - -static inline __attribute_const__ -int __order_base_2(unsigned long n) -{ - return n > 1 ? ilog2(n - 1) + 1 : 0; -} - #define order_base_2(n) \ ( \ __builtin_constant_p(n) ? ( \ From 6ec72e61cb09482fe08524480f7f3f18a6d23cfa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Sep 2017 08:43:42 -0700 Subject: [PATCH 0277/1085] div64: add missing kernel-doc Add missing kernel-doc notation for 2 div() functions. 
Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- include/asm-generic/div64.h | 14 ++++++++++++++ lib/div64.c | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h index 163f77999ea4..3de84f64423f 100644 --- a/include/asm-generic/div64.h +++ b/include/asm-generic/div64.h @@ -25,6 +25,20 @@ #if BITS_PER_LONG == 64 +/** + * do_div - returns 2 values: calculate remainder and update new dividend + * @n: pointer to uint64_t dividend (will be updated) + * @base: uint32_t divisor + * + * Summary: + * ``uint32_t remainder = *n % base;`` + * ``*n = *n / base;`` + * + * Return: (uint32_t)remainder + * + * NOTE: macro parameter @n is evaluated multiple times, + * beware of side effects! + */ # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ diff --git a/lib/div64.c b/lib/div64.c index 7f345259c32f..5660e8233293 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -60,6 +60,12 @@ uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base) EXPORT_SYMBOL(__div64_32); #endif +/** + * div_s64_rem - signed 64bit divide with 64bit divisor and remainder + * @dividend: 64bit dividend + * @divisor: 64bit divisor + * @remainder: 64bit remainder + */ #ifndef div_s64_rem s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) { From 078843f75d234123e654b992d97e8ae8a744ba23 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Sep 2017 08:43:45 -0700 Subject: [PATCH 0278/1085] math64: add missing kernel-doc notation Add missing kernel-doc notation (function parameters) for several div() functions. Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- include/linux/math64.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/include/linux/math64.h b/include/linux/math64.h index 80690c96c734..8dd2488bf289 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -11,6 +11,11 @@ /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 32bit divisor + * @remainder: pointer to unsigned 32bit remainder + * + * Return: sets ``*remainder``, then returns dividend / divisor * * This is commonly provided by 32bit archs to provide an optimized 64bit * divide. 
@@ -23,6 +28,11 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) /** * div_s64_rem - signed 64bit divide with 32bit divisor with remainder + * @dividend: signed 64bit dividend + * @divisor: signed 32bit divisor + * @remainder: pointer to signed 32bit remainder + * + * Return: sets ``*remainder``, then returns dividend / divisor */ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) { @@ -32,6 +42,11 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) /** * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 64bit divisor + * @remainder: pointer to unsigned 64bit remainder + * + * Return: sets ``*remainder``, then returns dividend / divisor */ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { @@ -41,6 +56,10 @@ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) /** * div64_u64 - unsigned 64bit divide with 64bit divisor + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 64bit divisor + * + * Return: dividend / divisor */ static inline u64 div64_u64(u64 dividend, u64 divisor) { @@ -49,6 +68,10 @@ static inline u64 div64_u64(u64 dividend, u64 divisor) /** * div64_s64 - signed 64bit divide with 64bit divisor + * @dividend: signed 64bit dividend + * @divisor: signed 64bit divisor + * + * Return: dividend / divisor */ static inline s64 div64_s64(s64 dividend, s64 divisor) { @@ -88,6 +111,8 @@ extern s64 div64_s64(s64 dividend, s64 divisor); /** * div_u64 - unsigned 64bit divide with 32bit divisor + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 32bit divisor * * This is the most common 64bit divide and should be used if possible, * as many 32bit archs can optimize this variant better than a full 64bit @@ -103,6 +128,8 @@ static inline u64 div_u64(u64 dividend, u32 divisor) /** * div_s64 - signed 64bit divide with 32bit divisor + * @dividend: signed 64bit dividend + * @divisor: signed 32bit divisor */ #ifndef div_s64 static inline s64 div_s64(s64 dividend, s32 divisor) From 341e9a323af95ac141b4df41e2697d626da14e3f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Sep 2017 08:43:49 -0700 Subject: [PATCH 0279/1085] lib/gcd: add kernel-doc notation Add kernel-doc notation for the gcd() function (so that it can be added to the kernel-api documentation). Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- lib/gcd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/gcd.c b/lib/gcd.c index 135ee6407a5e..227dea924425 100644 --- a/lib/gcd.c +++ b/lib/gcd.c @@ -13,6 +13,12 @@ #if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS) /* If __ffs is available, the even/odd algorithm benchmarks slower. */ + +/** + * gcd - calculate and return the greatest common divisor of 2 unsigned longs + * @a: first value + * @b: second value + */ unsigned long gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; From 68e51252224d06e14457de98960a4622993350ab Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Sep 2017 08:43:53 -0700 Subject: [PATCH 0280/1085] Documentation: add kernel-api section on Math functions Add a kernel-api section on Math Functions. 
Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 657a89a219b2..9c6902ba6e67 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -103,6 +103,30 @@ idr/ida Functions .. kernel-doc:: lib/idr.c :export: +Math Functions in Linux +======================= + +Base 2 log and power Functions +------------------------------ + +.. kernel-doc:: include/linux/log2.h + :internal: + +Division Functions +------------------ + +.. kernel-doc:: include/asm-generic/div64.h + :functions: do_div + +.. kernel-doc:: include/linux/math64.h + :internal: + +.. kernel-doc:: lib/div64.c + :functions: div_s64_rem div64_u64_rem div64_u64 div64_s64 + +.. kernel-doc:: lib/gcd.c + :export: + Memory Management in Linux ========================== From 58e7cb9e935d53c64414361fba3f95ed261f9110 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Oct 2017 00:38:49 +0200 Subject: [PATCH 0281/1085] PM: docs: Fix stale reference in kernel-parameters.txt Commit 7aa7a0360a66 (PM: docs: Delete the obsolete states.txt document) forgot to update kernel-parameters.txt with a reference to the new sleep-states.rst document and it still points to states.txt that was dropped, so fix it now. Fixes: 7aa7a0360a66 (PM: docs: Delete the obsolete states.txt document) Signed-off-by: Rafael J. Wysocki Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f02961bda6f4..a3427399fd07 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2248,7 +2248,7 @@ s2idle - Suspend-To-Idle shallow - Power-On Suspend or equivalent (if supported) deep - Suspend-To-RAM or equivalent (if supported) - See Documentation/power/states.txt. + See Documentation/admin-guide/pm/sleep-states.rst. meye.*= [HW] Set MotionEye Camera parameters See Documentation/video4linux/meye.txt. From b7c92f1a4e131e459bcf53a570e7265e5ce64455 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Fri, 29 Sep 2017 12:41:50 +0200 Subject: [PATCH 0282/1085] s390/sthyi: reorganize sthyi implementation As we need to support sthyi instruction on LPAR too, move the common code to kernel part and kvm related code to intercept.c for better reuse. 
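With the code split out, a kernel-side caller outside KVM can obtain the data through the new interface; a rough sketch based on the prototype above, error handling elided:

    #include <asm/sysinfo.h>

    void *dst = (void *)get_zeroed_page(GFP_KERNEL);
    u64 rc;
    int cc;

    cc = sthyi_fill(dst, &rc);  /* cc: condition code, rc: R2 + 1 value */

This is the shape the s390_sthyi system call added later in the series builds on.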
Signed-off-by: QingFeng Hao Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/sysinfo.h | 1 + arch/s390/kernel/Makefile | 2 +- arch/s390/{kvm => kernel}/sthyi.c | 95 +++++++++---------------------- arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/intercept.c | 66 +++++++++++++++++++++ arch/s390/kvm/kvm-s390.h | 5 +- 6 files changed, 98 insertions(+), 73 deletions(-) rename arch/s390/{kvm => kernel}/sthyi.c (84%) diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 2b498e58b914..e4a28307bc5d 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -198,4 +198,5 @@ struct service_level { int register_service_level(struct service_level *); int unregister_service_level(struct service_level *); +int sthyi_fill(void *dst, u64 *rc); #endif /* __ASM_S390_SYSINFO_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index adb3fe2e3d42..1fefb7f9216f 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -55,7 +55,7 @@ obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o als.o obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o -obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o +obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o extra-y += head.o head64.o vmlinux.lds diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kernel/sthyi.c similarity index 84% rename from arch/s390/kvm/sthyi.c rename to arch/s390/kernel/sthyi.c index 395926b8c1ed..3d51f86f9dec 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -8,22 +8,17 @@ * Copyright IBM Corp. 2016 * Author(s): Janosch Frank */ -#include #include #include #include #include -#include #include #include #include #include #include - -#include "kvm-s390.h" -#include "gaccess.h" -#include "trace.h" +#include #define DED_WEIGHT 0xffff /* @@ -382,88 +377,52 @@ out: vfree(diag204_buf); } -static int sthyi(u64 vaddr) +static int sthyi(u64 vaddr, u64 *rc) { register u64 code asm("0") = 0; register u64 addr asm("2") = vaddr; + register u64 rcode asm("3"); int cc; asm volatile( ".insn rre,0xB2560000,%[code],%[addr]\n" "ipm %[cc]\n" "srl %[cc],28\n" - : [cc] "=d" (cc) + : [cc] "=d" (cc), "=d" (rcode) : [code] "d" (code), [addr] "a" (addr) - : "3", "memory", "cc"); + : "memory", "cc"); + *rc = rcode; return cc; } -int handle_sthyi(struct kvm_vcpu *vcpu) +/* + * sthyi_fill - Fill page with data returned by the STHYI instruction + * + * @dst: Pointer to zeroed page + * @rc: Pointer for storing the return code of the instruction + * + * Fills the destination with system information returned by the STHYI + * instruction. The data is generated by emulation or execution of STHYI, + * if available. The return value is the condition code that would be + * returned, the rc parameter is the return code which is passed in + * register R2 + 1. + */ +int sthyi_fill(void *dst, u64 *rc) { - int reg1, reg2, r = 0; - u64 code, addr, cc = 0; - struct sthyi_sctns *sctns = NULL; - - if (!test_kvm_facility(vcpu->kvm, 74)) - return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; /* - * STHYI requires extensive locking in the higher hypervisors - * and is very computational/memory expensive. 
Therefore we - * ratelimit the executions per VM. + * If the facility is on, we don't want to emulate the instruction. + * We ask the hypervisor to provide the data. */ - if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { - kvm_s390_retry_instr(vcpu); - return 0; - } - - kvm_s390_get_regs_rre(vcpu, ®1, ®2); - code = vcpu->run->s.regs.gprs[reg1]; - addr = vcpu->run->s.regs.gprs[reg2]; - - vcpu->stat.instruction_sthyi++; - VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); - trace_kvm_s390_handle_sthyi(vcpu, code, addr); - - if (reg1 == reg2 || reg1 & 1 || reg2 & 1) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - - if (code & 0xffff) { - cc = 3; - goto out; - } - - if (addr & ~PAGE_MASK) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - - sctns = (void *)get_zeroed_page(GFP_KERNEL); - if (!sctns) - return -ENOMEM; - - /* - * If we are a guest, we don't want to emulate an emulated - * instruction. We ask the hypervisor to provide the data. - */ - if (test_facility(74)) { - cc = sthyi((u64)sctns); - goto out; - } + if (test_facility(74)) + return sthyi((u64)dst, rc); fill_hdr(sctns); fill_stsi(sctns); fill_diag(sctns); -out: - if (!cc) { - r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); - if (r) { - free_page((unsigned long)sctns); - return kvm_s390_inject_prog_cond(vcpu, r); - } - } - - free_page((unsigned long)sctns); - vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0; - kvm_s390_set_psw_cc(vcpu, cc); - return r; + *rc = 0; + return 0; } +EXPORT_SYMBOL_GPL(sthyi_fill); diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 09a9e6dfc09f..6048b1c6e580 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o +kvm-objs += diag.o gaccess.o guestdbg.o vsie.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index a4752bf6b526..46adda5e2b2c 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" @@ -360,6 +361,71 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +/* + * Handle the sthyi instruction that provides the guest with system + * information, like current CPU resources available at each level of + * the machine. + */ +int handle_sthyi(struct kvm_vcpu *vcpu) +{ + int reg1, reg2, r = 0; + u64 code, addr, cc = 0, rc = 0; + struct sthyi_sctns *sctns = NULL; + + if (!test_kvm_facility(vcpu->kvm, 74)) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + + /* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * ratelimit the executions per VM. 
+ */ + if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { + kvm_s390_retry_instr(vcpu); + return 0; + } + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + code = vcpu->run->s.regs.gprs[reg1]; + addr = vcpu->run->s.regs.gprs[reg2]; + + vcpu->stat.instruction_sthyi++; + VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); + trace_kvm_s390_handle_sthyi(vcpu, code, addr); + + if (reg1 == reg2 || reg1 & 1 || reg2 & 1) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (code & 0xffff) { + cc = 3; + rc = 4; + goto out; + } + + if (addr & ~PAGE_MASK) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + sctns = (void *)get_zeroed_page(GFP_KERNEL); + if (!sctns) + return -ENOMEM; + + cc = sthyi_fill(sctns, &rc); + +out: + if (!cc) { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } + } + + free_page((unsigned long)sctns); + vcpu->run->s.regs.gprs[reg2 + 1] = rc; + kvm_s390_set_psw_cc(vcpu, cc); + return r; +} + static int handle_operexc(struct kvm_vcpu *vcpu) { psw_t oldpsw, newpsw; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9f8fdd7b2311..10d65dfbc306 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -242,6 +242,8 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu)); } +int handle_sthyi(struct kvm_vcpu *vcpu); + /* implemented in priv.c */ int is_valid_psw(psw_t *psw); int kvm_s390_handle_aa(struct kvm_vcpu *vcpu); @@ -268,9 +270,6 @@ void kvm_s390_vsie_destroy(struct kvm *kvm); int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); -/* implemented in sthyi.c */ -int handle_sthyi(struct kvm_vcpu *vcpu); - /* implemented in kvm-s390.c */ void kvm_s390_set_tod_clock_ext(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); From 9fb6c9b3fea1b1d1c6f14178373e8f7235f3b681 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Fri, 29 Sep 2017 12:41:51 +0200 Subject: [PATCH 0283/1085] s390/sthyi: add cache to store hypervisor info STHYI requires extensive locking in the higher hypervisors and is very computational/memory expensive. Therefore we cache the retrieved hypervisor info whose valid period is 1s with mutex to allow concurrent access. rw semaphore can't benefit here due to cache line bounce. 
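The expiry test uses the standard jiffies idiom; reduced to its core (sketch):

    if (time_is_before_jiffies(sthyi_cache.end)) {  /* deadline passed */
            /* refresh the page, then push the deadline out by one second */
            sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES;  /* HZ */
    }

Seeding the cache with end = jiffies - 1 marks it expired, so the first lookup always fills it.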
Signed-off-by: QingFeng Hao Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kernel/sthyi.c | 84 +++++++++++++++++++++++++++----- arch/s390/kvm/intercept.c | 10 ---- arch/s390/kvm/kvm-s390.c | 2 - 4 files changed, 71 insertions(+), 26 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 51375e766e90..fd006a272024 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -736,7 +736,6 @@ struct kvm_arch{ wait_queue_head_t ipte_wq; int ipte_lock_count; struct mutex ipte_mutex; - struct ratelimit_state sthyi_limit; spinlock_t start_stop_lock; struct sie_page2 *sie_page2; struct kvm_s390_cpu_model model; diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 3d51f86f9dec..27e3c3d87379 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -139,6 +138,21 @@ struct lpar_cpu_inf { struct cpu_inf ifl; }; +/* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * cache the retrieved data whose valid period is 1s. + */ +#define CACHE_VALID_JIFFIES HZ + +struct sthyi_info { + void *info; + unsigned long end; +}; + +static DEFINE_MUTEX(sthyi_mutex); +static struct sthyi_info sthyi_cache; + static inline u64 cpu_id(u8 ctidx, void *diag224_buf) { return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); @@ -395,6 +409,47 @@ static int sthyi(u64 vaddr, u64 *rc) return cc; } +static int fill_dst(void *dst, u64 *rc) +{ + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; + + /* + * If the facility is on, we don't want to emulate the instruction. + * We ask the hypervisor to provide the data. + */ + if (test_facility(74)) + return sthyi((u64)dst, rc); + + fill_hdr(sctns); + fill_stsi(sctns); + fill_diag(sctns); + *rc = 0; + return 0; +} + +static int sthyi_init_cache(void) +{ + if (sthyi_cache.info) + return 0; + sthyi_cache.info = (void *)get_zeroed_page(GFP_KERNEL); + if (!sthyi_cache.info) + return -ENOMEM; + sthyi_cache.end = jiffies - 1; /* expired */ + return 0; +} + +static int sthyi_update_cache(u64 *rc) +{ + int r; + + memset(sthyi_cache.info, 0, PAGE_SIZE); + r = fill_dst(sthyi_cache.info, rc); + if (r) + return r; + sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + return r; +} + /* * sthyi_fill - Fill page with data returned by the STHYI instruction * @@ -409,20 +464,23 @@ static int sthyi(u64 vaddr, u64 *rc) */ int sthyi_fill(void *dst, u64 *rc) { - struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; + int r; - /* - * If the facility is on, we don't want to emulate the instruction. - * We ask the hypervisor to provide the data. 
- */ - if (test_facility(74)) - return sthyi((u64)dst, rc); - - fill_hdr(sctns); - fill_stsi(sctns); - fill_diag(sctns); + mutex_lock(&sthyi_mutex); + r = sthyi_init_cache(); + if (r) + goto out; + if (time_is_before_jiffies(sthyi_cache.end)) { + /* cache expired */ + r = sthyi_update_cache(rc); + if (r) + goto out; + } *rc = 0; - return 0; + memcpy(dst, sthyi_cache.info, PAGE_SIZE); +out: + mutex_unlock(&sthyi_mutex); + return r; } EXPORT_SYMBOL_GPL(sthyi_fill); diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 46adda5e2b2c..8fe034beb623 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -375,16 +375,6 @@ int handle_sthyi(struct kvm_vcpu *vcpu) if (!test_kvm_facility(vcpu->kvm, 74)) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); - /* - * STHYI requires extensive locking in the higher hypervisors - * and is very computational/memory expensive. Therefore we - * ratelimit the executions per VM. - */ - if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { - kvm_s390_retry_instr(vcpu); - return 0; - } - kvm_s390_get_regs_rre(vcpu, ®1, ®2); code = vcpu->run->s.regs.gprs[reg1]; addr = vcpu->run->s.regs.gprs[reg2]; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 40d0a1a97889..de6a5b790da0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1884,8 +1884,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) rc = -ENOMEM; - ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500); - kvm->arch.use_esca = 0; /* start with basic SCA */ if (!sclp.has_64bscao) alloc_flags |= GFP_DMA; From 3d8757b87d7fc15a87928bc970f060bc9c6dc618 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Fri, 29 Sep 2017 12:41:52 +0200 Subject: [PATCH 0284/1085] s390/sthyi: add s390_sthyi system call Add a syscall of s390_sthyi to implement STHYI instruction in LPAR which reuses the implementation for KVM by Janosch Frank - commit 95ca2cb57985 ("KVM: s390: Add sthyi emulation"). STHYI(Store Hypervisor Information) is an emulated z/VM instruction that provides a guest with basic information about the layers it is running on. This includes information about the cpu configuration of both the machine and the lpar, as well as their names, machine model and machine type. This information enables an application to determine the maximum capacity of CPs and IFLs available to software. For the arguments of s390_sthyi, code shall be 0 and flags is reserved for future use, info is the output argument to store the required hypervisor info. 
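From user space the new call would be exercised roughly as below; this is an illustrative sketch, not part of the patch. 380 is the syscall number assigned above, and the buffer must be one (4K) page:

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #define STHYI_FC_CP_IFL_CAP 0

    char info[4096];
    uint64_t rc;
    long r = syscall(380, STHYI_FC_CP_IFL_CAP, info, &rc, 0UL);
    /* r >= 0: STHYI condition code; info holds the response, rc the R2 + 1 code */

A failing call returns -1 with errno set to EINVAL (nonzero flags), EOPNOTSUPP (unknown function code), ENOMEM or EFAULT, matching the implementation above.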
Signed-off-by: QingFeng Hao Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/uapi/asm/sthyi.h | 6 ++++++ arch/s390/include/uapi/asm/unistd.h | 3 ++- arch/s390/kernel/compat_wrapper.c | 1 + arch/s390/kernel/entry.h | 1 + arch/s390/kernel/sthyi.c | 33 ++++++++++++++++++++++++++++- arch/s390/kernel/syscalls.S | 1 + 6 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 arch/s390/include/uapi/asm/sthyi.h diff --git a/arch/s390/include/uapi/asm/sthyi.h b/arch/s390/include/uapi/asm/sthyi.h new file mode 100644 index 000000000000..ec113db4eb7e --- /dev/null +++ b/arch/s390/include/uapi/asm/sthyi.h @@ -0,0 +1,6 @@ +#ifndef _UAPI_ASM_STHYI_H +#define _UAPI_ASM_STHYI_H + +#define STHYI_FC_CP_IFL_CAP 0 + +#endif /* _UAPI_ASM_STHYI_H */ diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index ea42290e7d51..61c64f543769 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -315,7 +315,8 @@ #define __NR_pwritev2 377 #define __NR_s390_guarded_storage 378 #define __NR_statx 379 -#define NR_syscalls 380 +#define __NR_s390_sthyi 380 +#define NR_syscalls 381 /* * There are some system calls that are not present on 64 bit, some diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c index 986642a3543b..eb0b17ed95b6 100644 --- a/arch/s390/kernel/compat_wrapper.c +++ b/arch/s390/kernel/compat_wrapper.c @@ -180,3 +180,4 @@ COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags); COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags); COMPAT_SYSCALL_WRAP2(s390_guarded_storage, int, command, struct gs_cb *, gs_cb); COMPAT_SYSCALL_WRAP5(statx, int, dfd, const char __user *, path, unsigned, flags, unsigned, mask, struct statx __user *, buffer); +COMPAT_SYSCALL_WRAP4(s390_sthyi, unsigned long, code, void __user *, info, u64 __user *, rc, unsigned long, flags); diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index dbf5f7e18246..bb5301eeb4f4 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -77,6 +77,7 @@ long sys_s390_runtime_instr(int command, int signum); long sys_s390_guarded_storage(int command, struct gs_cb __user *); long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); +long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user *return_code, unsigned long flags); DECLARE_PER_CPU(u64, mt_cycles[8]); diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 27e3c3d87379..12981e197f01 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -11,13 +11,16 @@ #include #include #include - +#include +#include #include #include #include #include #include #include +#include +#include "entry.h" #define DED_WEIGHT 0xffff /* @@ -484,3 +487,31 @@ out: return r; } EXPORT_SYMBOL_GPL(sthyi_fill); + +SYSCALL_DEFINE4(s390_sthyi, unsigned long, function_code, void __user *, buffer, + u64 __user *, return_code, unsigned long, flags) +{ + u64 sthyi_rc; + void *info; + int r; + + if (flags) + return -EINVAL; + if (function_code != STHYI_FC_CP_IFL_CAP) + return -EOPNOTSUPP; + info = (void *)get_zeroed_page(GFP_KERNEL); + if (!info) + return -ENOMEM; + r = sthyi_fill(info, &sthyi_rc); + if (r < 0) + goto out; + if (return_code && put_user(sthyi_rc, return_code)) { + r = -EFAULT; + goto out; + } + if 
(copy_to_user(buffer, info, PAGE_SIZE)) + r = -EFAULT; +out: + free_page((unsigned long)info); + return r; +} diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 54fce7b065de..0fb407ebbf46 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -388,3 +388,4 @@ SYSCALL(sys_preadv2,compat_sys_preadv2) SYSCALL(sys_pwritev2,compat_sys_pwritev2) SYSCALL(sys_s390_guarded_storage,compat_sys_s390_guarded_storage) /* 378 */ SYSCALL(sys_statx,compat_sys_statx) +SYSCALL(sys_s390_sthyi,compat_sys_s390_sthyi) From 979a9afe399f7c75e1be9f235fa8bf1a90d2e77d Mon Sep 17 00:00:00 2001 From: Ralf Ramsauer Date: Thu, 5 Oct 2017 13:22:36 +0200 Subject: [PATCH 0285/1085] spi: tegra114: correct register name in definition According to "Tegra K1 Processor Technical Reference Manual" (p. 2448), bit 20 of SPI_COMMAND1 is called CS_SW_VAL and not CS_SS_VAL. Signed-off-by: Ralf Ramsauer Signed-off-by: Mark Brown --- drivers/spi/spi-tegra114.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-tegra114.c b/drivers/spi/spi-tegra114.c index 44550182a4a3..a76acedd7e2f 100644 --- a/drivers/spi/spi-tegra114.c +++ b/drivers/spi/spi-tegra114.c @@ -50,7 +50,7 @@ #define SPI_IDLE_SDA_PULL_LOW (2 << 18) #define SPI_IDLE_SDA_PULL_HIGH (3 << 18) #define SPI_IDLE_SDA_MASK (3 << 18) -#define SPI_CS_SS_VAL (1 << 20) +#define SPI_CS_SW_VAL (1 << 20) #define SPI_CS_SW_HW (1 << 21) /* SPI_CS_POL_INACTIVE bits are default high */ /* n from 0 to 3 */ @@ -705,9 +705,9 @@ static u32 tegra_spi_setup_transfer_one(struct spi_device *spi, command1 |= SPI_CS_SW_HW; if (spi->mode & SPI_CS_HIGH) - command1 |= SPI_CS_SS_VAL; + command1 |= SPI_CS_SW_VAL; else - command1 &= ~SPI_CS_SS_VAL; + command1 &= ~SPI_CS_SW_VAL; tegra_spi_writel(tspi, 0, SPI_COMMAND2); } else { From 0b77d6701cf8d4eb343a83fa8d7eca81a863bb7c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 4 Oct 2017 19:27:05 +0200 Subject: [PATCH 0286/1085] s390: implement memset16, memset32 & memset64 Provide fast versions of the new memset variants. E.g. the generic memset64 is ten times slower than the optimized version if used on a whole page. 
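Note the count convention: the inline wrappers take a count in elements and convert to bytes for the assembler routines. A small usage sketch (fill_page() is hypothetical):

    static void fill_page(u64 *page)
    {
            /* 512 u64 elements cover a 4K page; the wrapper expands to
             * __memset64(page, ~0ULL, 512 * sizeof(u64)), i.e. 4096 bytes.
             */
            memset64(page, ~0ULL, 512);
    }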
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/string.h | 22 +++++++++++++++++ arch/s390/lib/mem.S | 44 ++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 8fb43319693d..aa9c3a0f59ff 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -17,6 +17,9 @@ #define __HAVE_ARCH_MEMMOVE /* gcc builtin & arch function */ #define __HAVE_ARCH_MEMSCAN /* inline & arch function */ #define __HAVE_ARCH_MEMSET /* gcc builtin & arch function */ +#define __HAVE_ARCH_MEMSET16 /* arch function */ +#define __HAVE_ARCH_MEMSET32 /* arch function */ +#define __HAVE_ARCH_MEMSET64 /* arch function */ #define __HAVE_ARCH_STRCAT /* inline & arch function */ #define __HAVE_ARCH_STRCMP /* arch function */ #define __HAVE_ARCH_STRCPY /* inline & arch function */ @@ -49,6 +52,25 @@ extern char *strstr(const char *, const char *); #undef __HAVE_ARCH_STRSEP #undef __HAVE_ARCH_STRSPN +void *__memset16(uint16_t *s, uint16_t v, size_t count); +void *__memset32(uint32_t *s, uint32_t v, size_t count); +void *__memset64(uint64_t *s, uint64_t v, size_t count); + +static inline void *memset16(uint16_t *s, uint16_t v, size_t count) +{ + return __memset16(s, v, count * sizeof(v)); +} + +static inline void *memset32(uint32_t *s, uint32_t v, size_t count) +{ + return __memset32(s, v, count * sizeof(v)); +} + +static inline void *memset64(uint64_t *s, uint64_t v, size_t count) +{ + return __memset64(s, v, count * sizeof(v)); +} + #if !defined(IN_ARCH_STRING_C) && (!defined(CONFIG_FORTIFY_SOURCE) || defined(__NO_FORTIFY)) static inline void *memchr(const void * s, int c, size_t n) diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index 7ff79a4ff00c..f88cf6983849 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -126,3 +126,47 @@ ENTRY(memcpy) .Lmemcpy_mvc: mvc 0(1,%r1),0(%r3) EXPORT_SYMBOL(memcpy) + +/* + * __memset16/32/64 + * + * void *__memset16(uint16_t *s, uint16_t v, size_t count) + * void *__memset32(uint32_t *s, uint32_t v, size_t count) + * void *__memset64(uint64_t *s, uint64_t v, size_t count) + */ +.macro __MEMSET bits,bytes,insn +ENTRY(__memset\bits) + ltgr %r4,%r4 + bzr %r14 + cghi %r4,\bytes + je .L__memset_exit\bits + aghi %r4,-(\bytes+1) + srlg %r5,%r4,8 + ltgr %r5,%r5 + lgr %r1,%r2 + jz .L__memset_remainder\bits +.L__memset_loop\bits: + \insn %r3,0(%r1) + mvc \bytes(256-\bytes,%r1),0(%r1) + la %r1,256(%r1) + brctg %r5,.L__memset_loop\bits +.L__memset_remainder\bits: + \insn %r3,0(%r1) + larl %r5,.L__memset_mvc\bits + ex %r4,0(%r5) + br %r14 +.L__memset_exit\bits: + \insn %r3,0(%r2) + br %r14 +.L__memset_mvc\bits: + mvc \bytes(1,%r1),0(%r1) +.endm + +__MEMSET 16,2,sth +EXPORT_SYMBOL(__memset16) + +__MEMSET 32,4,st +EXPORT_SYMBOL(__memset32) + +__MEMSET 64,8,stg +EXPORT_SYMBOL(__memset64) From 41879ff65d8b025eace44610be0b07f678fb3224 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 4 Oct 2017 19:27:07 +0200 Subject: [PATCH 0287/1085] s390/mm: use memset64 instead of clear_table Use memset64 instead of the (now) open-coded variant clear_table. Performance wise there is no difference. 
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgalloc.h | 18 ++---------------- arch/s390/kernel/vdso.c | 6 ++---- arch/s390/mm/pgalloc.c | 14 +++++++------- arch/s390/mm/vmem.c | 2 +- 4 files changed, 12 insertions(+), 28 deletions(-) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index a0d9167519b1..6b341126bebb 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -12,6 +12,7 @@ #define _S390_PGALLOC_H #include +#include #include #include @@ -27,24 +28,9 @@ void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; -static inline void clear_table(unsigned long *s, unsigned long val, size_t n) -{ - struct addrtype { char _[256]; }; - int i; - - for (i = 0; i < n; i += 256) { - *s = val; - asm volatile( - "mvc 8(248,%[s]),0(%[s])\n" - : "+m" (*(struct addrtype *) s) - : [s] "a" (s)); - s += 256 / sizeof(long); - } -} - static inline void crst_table_init(unsigned long *crst, unsigned long entry) { - clear_table(crst, entry, _CRST_TABLE_SIZE); + memset64((u64 *)crst, entry, _CRST_ENTRIES); } static inline unsigned long pgd_entry_type(struct mm_struct *mm) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index eacda05b45d7..aaf8f77a636e 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -166,10 +166,8 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) vd->node_id = cpu_to_node(vd->cpu_nr); /* Set up access register mode page table */ - clear_table((unsigned long *) segment_table, _SEGMENT_ENTRY_EMPTY, - PAGE_SIZE << SEGMENT_ORDER); - clear_table((unsigned long *) page_table, _PAGE_INVALID, - 256*sizeof(unsigned long)); + memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES); + memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE); *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table; *(unsigned long *) page_table = _PAGE_PROTECT + page_frame; diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 05f1f27e6708..ffd87628a637 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -158,13 +158,13 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) struct page *page_table_alloc_pgste(struct mm_struct *mm) { struct page *page; - unsigned long *table; + u64 *table; page = alloc_page(GFP_KERNEL); if (page) { - table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); - clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + table = (u64 *)page_to_phys(page); + memset64(table, _PAGE_INVALID, PTRS_PER_PTE); + memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } return page; } @@ -221,12 +221,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm) if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ atomic_set(&page->_mapcount, 3); - clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); - clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); - clear_table(table, _PAGE_INVALID, PAGE_SIZE); + memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); list_add(&page->lru, &mm->context.pgtable_list); spin_unlock_bh(&mm->context.lock); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index c0af0d7b6e5f..2e84977796a2 
100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -59,7 +59,7 @@ pte_t __ref *vmem_pte_alloc(void) pte = (pte_t *) memblock_alloc(size, size); if (!pte) return NULL; - clear_table((unsigned long *) pte, _PAGE_INVALID, size); + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); return pte; } From 993fef95b9c1858894d14b221e04f1161e4f4ed9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 4 Oct 2017 19:27:08 +0200 Subject: [PATCH 0288/1085] s390: optimize memset implementation Like for the memset16/32/64 variants avoid that subsequent mvc instructions depend on each other since that might have negative performance impacts. This patch is currently hardly relevant since at least gcc 7.1 generates only inline memset code and not a single memset call. However there is no reason to not provide an optimized version just in case gcc generates memset calls again, like it did in the past. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/lib/mem.S | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index f88cf6983849..9255a087fa96 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -78,21 +78,25 @@ ENTRY(memset) ex %r4,0(%r3) br %r14 .Lmemset_fill: - stc %r3,0(%r2) cghi %r4,1 lgr %r1,%r2 - ber %r14 + je .Lmemset_fill_exit aghi %r4,-2 - srlg %r3,%r4,8 - ltgr %r3,%r3 + srlg %r5,%r4,8 + ltgr %r5,%r5 jz .Lmemset_fill_remainder .Lmemset_fill_loop: - mvc 1(256,%r1),0(%r1) + stc %r3,0(%r1) + mvc 1(255,%r1),0(%r1) la %r1,256(%r1) - brctg %r3,.Lmemset_fill_loop + brctg %r5,.Lmemset_fill_loop .Lmemset_fill_remainder: - larl %r3,.Lmemset_mvc - ex %r4,0(%r3) + stc %r3,0(%r1) + larl %r5,.Lmemset_mvc + ex %r4,0(%r5) + br %r14 +.Lmemset_fill_exit: + stc %r3,0(%r1) br %r14 .Lmemset_xc: xc 0(1,%r1),0(%r1) From 49913f1fd0e5ff5a484214501e84d1c73e739285 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 4 Oct 2017 19:27:09 +0200 Subject: [PATCH 0289/1085] s390: cleanup string ops prototypes Just some trivial changes like removing the extern keyword from the header file, renaming arguments to match the man pages, and whitespace removal. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/string.h | 22 +++++++++++----------- arch/s390/lib/string.c | 28 ++++++++++++++-------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index aa9c3a0f59ff..478767b262f3 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -33,17 +33,17 @@ #define __HAVE_ARCH_STRSTR /* arch function */ /* Prototypes for non-inlined arch strings functions. 
*/ -extern int memcmp(const void *, const void *, size_t); -extern void *memcpy(void *, const void *, size_t); -extern void *memset(void *, int, size_t); -extern void *memmove(void *, const void *, size_t); -extern int strcmp(const char *,const char *); -extern size_t strlcat(char *, const char *, size_t); -extern size_t strlcpy(char *, const char *, size_t); -extern char *strncat(char *, const char *, size_t); -extern char *strncpy(char *, const char *, size_t); -extern char *strrchr(const char *, int); -extern char *strstr(const char *, const char *); +int memcmp(const void *s1, const void *s2, size_t n); +void *memcpy(void *dest, const void *src, size_t n); +void *memset(void *s, int c, size_t n); +void *memmove(void *dest, const void *src, size_t n); +int strcmp(const char *s1, const char *s2); +size_t strlcat(char *dest, const char *src, size_t n); +size_t strlcpy(char *dest, const char *src, size_t size); +char *strncat(char *dest, const char *src, size_t n); +char *strncpy(char *dest, const char *src, size_t n); +char *strrchr(const char *s, int c); +char *strstr(const char *s1, const char *s2); #undef __HAVE_ARCH_STRCHR #undef __HAVE_ARCH_STRNCHR diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 4ee27339c792..4ff588e3928d 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL(strlen); * * returns the minimum of the length of @s and @n */ -size_t strnlen(const char * s, size_t n) +size_t strnlen(const char *s, size_t n) { return __strnend(s, n) - s; } @@ -194,14 +194,14 @@ EXPORT_SYMBOL(strncat); /** * strcmp - Compare two strings - * @cs: One string - * @ct: Another string + * @s1: One string + * @s2: Another string * - * returns 0 if @cs and @ct are equal, - * < 0 if @cs is less than @ct - * > 0 if @cs is greater than @ct + * returns 0 if @s1 and @s2 are equal, + * < 0 if @s1 is less than @s2 + * > 0 if @s1 is greater than @s2 */ -int strcmp(const char *cs, const char *ct) +int strcmp(const char *s1, const char *s2) { register int r0 asm("0") = 0; int ret = 0; @@ -213,7 +213,7 @@ int strcmp(const char *cs, const char *ct) " ic %1,0(%3)\n" " sr %0,%1\n" "1:" - : "+d" (ret), "+d" (r0), "+a" (cs), "+a" (ct) + : "+d" (ret), "+d" (r0), "+a" (s1), "+a" (s2) : : "cc", "memory"); return ret; } @@ -224,7 +224,7 @@ EXPORT_SYMBOL(strcmp); * @s: The string to be searched * @c: The character to search for */ -char * strrchr(const char * s, int c) +char *strrchr(const char *s, int c) { size_t len = __strend(s) - s; @@ -260,7 +260,7 @@ static inline int clcle(const char *s1, unsigned long l1, * @s1: The string to be searched * @s2: The string to search for */ -char * strstr(const char * s1,const char * s2) +char *strstr(const char *s1, const char *s2) { int l1, l2; @@ -306,15 +306,15 @@ EXPORT_SYMBOL(memchr); /** * memcmp - Compare two areas of memory - * @cs: One area of memory - * @ct: Another area of memory + * @s1: One area of memory + * @s2: Another area of memory * @count: The size of the area. */ -int memcmp(const void *cs, const void *ct, size_t n) +int memcmp(const void *s1, const void *s2, size_t n) { int ret; - ret = clcle(cs, n, ct, n); + ret = clcle(s1, n, s2, n); if (ret) ret = ret == 1 ? -1 : 1; return ret; From bb7e5ce7dde6f42e7793bd6cf4b1eb71a20145aa Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 7 Aug 2017 17:23:20 -0700 Subject: [PATCH 0290/1085] documentation: RCU grace-period memory ordering guarantees This commit provides text and diagrams showing how Tree RCU implements its grace-period memory ordering guarantees. Signed-off-by: Paul E. McKenney Cc: Jonathan Corbet Cc: Linus Torvalds --- .../Memory-Ordering/Tree-RCU-Diagram.html | 9 + .../Tree-RCU-Memory-Ordering.html | 707 +++ .../TreeRCU-callback-invocation.svg | 486 ++ .../TreeRCU-callback-registry.svg | 655 +++ .../Memory-Ordering/TreeRCU-dyntick.svg | 700 +++ .../Memory-Ordering/TreeRCU-gp-cleanup.svg | 1126 ++++ .../Design/Memory-Ordering/TreeRCU-gp-fqs.svg | 1309 +++++ .../Memory-Ordering/TreeRCU-gp-init-1.svg | 656 +++ .../Memory-Ordering/TreeRCU-gp-init-2.svg | 656 +++ .../Memory-Ordering/TreeRCU-gp-init-3.svg | 632 ++ .../RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 5135 +++++++++++++++++ .../Memory-Ordering/TreeRCU-hotplug.svg | 775 +++ .../RCU/Design/Memory-Ordering/TreeRCU-qs.svg | 1095 ++++ .../Design/Memory-Ordering/rcu_node-lock.svg | 229 + 14 files changed, 14170 insertions(+) create mode 100644 Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Diagram.html create mode 100644 Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-invocation.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-cleanup.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-1.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-2.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-3.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg create mode 100644 Documentation/RCU/Design/Memory-Ordering/rcu_node-lock.svg diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Diagram.html b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Diagram.html new file mode 100644 index 000000000000..e5b42a798ff3 --- /dev/null +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Diagram.html @@ -0,0 +1,9 @@ + + + A Diagram of TREE_RCU's Grace-Period Memory Ordering + + +

TreeRCU-gp.svg + + diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html new file mode 100644 index 000000000000..8651b0b4fd79 --- /dev/null +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html @@ -0,0 +1,707 @@ + + + A Tour Through TREE_RCU's Grace-Period Memory Ordering + + +

August 8, 2017

+

This article was contributed by Paul E. McKenney

+ +

Introduction

+ +

This document gives a rough visual overview of how Tree RCU's +grace-period memory ordering guarantee is provided. + +

    +
  1. + What Is Tree RCU's Grace Period Memory Ordering Guarantee? +
  2. + Tree RCU Grace Period Memory Ordering Building Blocks +
  3. + Tree RCU Grace Period Memory Ordering Components +
  4. Putting It All Together +
+ +

+What Is Tree RCU's Grace Period Memory Ordering Guarantee?

+ +

RCU grace periods provide extremely strong memory-ordering guarantees +for non-idle non-offline code. +Any code that happens after the end of a given RCU grace period is guaranteed +to see the effects of all accesses prior to the beginning of that grace +period that are within RCU read-side critical sections. +Similarly, any code that happens before the beginning of a given RCU grace +period is guaranteed to not see the effects of all accesses following the end +of that grace period that are within RCU read-side critical sections. + 

This guarantee is particularly pervasive for synchronize_sched(), +for which RCU-sched read-side critical sections include any region +of code for which preemption is disabled. +Given that each individual machine instruction can be thought of as +an extremely small region of preemption-disabled code, one can think of +synchronize_sched() as smp_mb() on steroids. + +

RCU updaters use this guarantee by splitting their updates into +two phases, one of which is executed before the grace period and +the other of which is executed after the grace period. +In the most common use case, phase one removes an element from +a linked RCU-protected data structure, and phase two frees that element. +For this to work, any readers that have witnessed state prior to the +phase-one update (in the common case, removal) must not witness state +following the phase-two update (in the common case, freeing). + +
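As a concrete illustration, here is a minimal sketch of the two-phase pattern (the list and RCU primitives are the standard kernel APIs; the structure, list head, lock, and function names are hypothetical):

	struct foo {
		struct list_head list;
		int data;
	};

	static LIST_HEAD(foo_head);
	static DEFINE_SPINLOCK(foo_lock);

	/* Updater: phase one unlinks, phase two frees. */
	static void remove_foo(struct foo *p)
	{
		spin_lock(&foo_lock);		/* exclude other updaters */
		list_del_rcu(&p->list);		/* phase one: removal */
		spin_unlock(&foo_lock);
		synchronize_rcu();		/* wait for pre-existing readers */
		kfree(p);			/* phase two: freeing */
	}

	/* Reader: any reader that can still see the element must have
	 * started before synchronize_rcu() returned above. */
	static int read_foos(void)
	{
		struct foo *p;
		int sum = 0;

		rcu_read_lock();
		list_for_each_entry_rcu(p, &foo_head, list)
			sum += p->data;
		rcu_read_unlock();
		return sum;
	}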

The RCU implementation provides this guarantee using a network +of lock-based critical sections, memory barriers, and per-CPU +processing, as is described in the following sections. + +

+Tree RCU Grace Period Memory Ordering Building Blocks

+ +

The workhorse for RCU's grace-period memory ordering is the +critical section for the rcu_node structure's +->lock. +These critical sections use helper functions for lock acquisition, including +raw_spin_lock_rcu_node(), +raw_spin_lock_irq_rcu_node(), and +raw_spin_lock_irqsave_rcu_node(). +Their lock-release counterparts are +raw_spin_unlock_rcu_node(), +raw_spin_unlock_irq_rcu_node(), and +raw_spin_unlock_irqrestore_rcu_node(), +respectively. +For completeness, a +raw_spin_trylock_rcu_node() +is also provided. +The key point is that the lock-acquisition functions, including +raw_spin_trylock_rcu_node(), all invoke +smp_mb__after_unlock_lock() immediately after successful +acquisition of the lock. + +
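A sketch of how such a helper can be constructed (simplified; the in-tree versions access the lock field through a wrapper):

	/* Acquire an rcu_node structure's ->lock, then execute
	 * smp_mb__after_unlock_lock() so that this acquisition is
	 * fully ordered, as seen by all CPUs, against the previous
	 * critical section for this lock. */
	#define raw_spin_lock_rcu_node(rnp)				\
	do {								\
		raw_spin_lock(&(rnp)->lock);				\
		smp_mb__after_unlock_lock();				\
	} while (0)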

Therefore, for any given rcu_node structure, any access +happening before one of the above lock-release functions will be seen +by all CPUs as happening before any access happening after a later +one of the above lock-acquisition functions. +Furthermore, any access happening before one of the +above lock-release functions on any given CPU will be seen by all +CPUs as happening before any access happening after a later one +of the above lock-acquisition functions executing on that same CPU, +even if the lock-release and lock-acquisition functions are operating +on different rcu_node structures. +Tree RCU uses these two ordering guarantees to form an ordering +network among all CPUs that were in any way involved in the grace +period, including any CPUs that came online or went offline during +the grace period in question. + 

The following litmus test exhibits the ordering effects of these +lock-acquisition and lock-release functions: + +

+ 1 int x, y, z;
+ 2
+ 3 void task0(void)
+ 4 {
+ 5   raw_spin_lock_rcu_node(rnp);
+ 6   WRITE_ONCE(x, 1);
+ 7   r1 = READ_ONCE(y);
+ 8   raw_spin_unlock_rcu_node(rnp);
+ 9 }
+10
+11 void task1(void)
+12 {
+13   raw_spin_lock_rcu_node(rnp);
+14   WRITE_ONCE(y, 1);
+15   r2 = READ_ONCE(z);
+16   raw_spin_unlock_rcu_node(rnp);
+17 }
+18
+19 void task2(void)
+20 {
+21   WRITE_ONCE(z, 1);
+22   smp_mb();
+23   r3 = READ_ONCE(x);
+24 }
+25
+26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0);
+
+ +

The WARN_ON() is evaluated at “the end of time”, +after all changes have propagated throughout the system. +Without the smp_mb__after_unlock_lock() provided by the +acquisition functions, this WARN_ON() could trigger, for example +on PowerPC. +The smp_mb__after_unlock_lock() invocations prevent this +WARN_ON() from triggering. + +

This approach must be extended to include idle CPUs, which need +RCU's grace-period memory ordering guarantee to extend to any +RCU read-side critical sections preceding and following the current +idle sojourn. +This case is handled by calls to the strongly ordered +atomic_add_return() read-modify-write atomic operation that +is invoked within rcu_dynticks_eqs_enter() at idle-entry +time and within rcu_dynticks_eqs_exit() at idle-exit time. +The grace-period kthread invokes rcu_dynticks_snap() and +rcu_dynticks_in_eqs_since() (both of which invoke +an atomic_add_return() of zero) to detect idle CPUs. + + + + + + + + +
 
Quick Quiz:
+ But what about CPUs that remain offline for the entire + grace period? +
Answer:
+ Such CPUs will be offline at the beginning of the grace period, + so the grace period won't expect quiescent states from them. + Races between grace-period start and CPU-hotplug operations + are mediated by the CPU's leaf rcu_node structure's + ->lock as described above. +
 
+ +
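Returning to the idle-entry and idle-exit atomics described above, the following toy model (an illustrative assumption, not the kernel's actual data structures) shows why a fully ordered value-returning atomic suffices:

	/* Toy model: a counter that is even while the CPU is idle and
	 * odd otherwise.  Because atomic_add_return() is fully ordered,
	 * accesses preceding idle entry are seen by all CPUs as happening
	 * before the increment, and accesses following idle exit are seen
	 * as happening after it. */
	static atomic_t toy_dynticks = ATOMIC_INIT(1);	/* odd: not idle */

	static void toy_eqs_enter(void)		/* idle entry */
	{
		int seq = atomic_add_return(1, &toy_dynticks);

		WARN_ON_ONCE(seq & 1);		/* must now be even */
	}

	static void toy_eqs_exit(void)		/* idle exit */
	{
		int seq = atomic_add_return(1, &toy_dynticks);

		WARN_ON_ONCE(!(seq & 1));	/* must now be odd */
	}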

The approach must be extended to handle one final case, that +of waking a task blocked in synchronize_rcu(). +This task might be affinitied to a CPU that is not yet aware that +the grace period has ended, and thus might not yet be subject to +the grace period's memory ordering. +Therefore, there is an smp_mb() after the return from +wait_for_completion() in the synchronize_rcu() +code path. + + + + + + + + +
 
Quick Quiz:
+ What? Where??? + I don't see any smp_mb() after the return from + wait_for_completion()!!! +
Answer:
+ That would be because I spotted the need for that + smp_mb() during the creation of this documentation, + and it is therefore unlikely to hit mainline before v4.14. + Kudos to Lance Roy, Will Deacon, Peter Zijlstra, and + Jonathan Cameron for asking questions that sensitized me + to the rather elaborate sequence of events that demonstrate + the need for this memory barrier. +
 
+ +

Tree RCU's grace-period memory-ordering guarantees rely most +heavily on the rcu_node structure's ->lock +field, so much so that it is necessary to abbreviate this pattern +in the diagrams in the next section. +For example, consider the rcu_prepare_for_idle() function +shown below, which is one of several functions that enforce ordering +of newly arrived RCU callbacks against future grace periods: + 

+ 1 static void rcu_prepare_for_idle(void)
+ 2 {
+ 3   bool needwake;
+ 4   struct rcu_data *rdp;
+ 5   struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ 6   struct rcu_node *rnp;
+ 7   struct rcu_state *rsp;
+ 8   int tne;
+ 9
+10   if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+11       rcu_is_nocb_cpu(smp_processor_id()))
+12     return;
+13   tne = READ_ONCE(tick_nohz_active);
+14   if (tne != rdtp->tick_nohz_enabled_snap) {
+15     if (rcu_cpu_has_callbacks(NULL))
+16       invoke_rcu_core();
+17     rdtp->tick_nohz_enabled_snap = tne;
+18     return;
+19   }
+20   if (!tne)
+21     return;
+22   if (rdtp->all_lazy &&
+23       rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
+24     rdtp->all_lazy = false;
+25     rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
+26     invoke_rcu_core();
+27     return;
+28   }
+29   if (rdtp->last_accelerate == jiffies)
+30     return;
+31   rdtp->last_accelerate = jiffies;
+32   for_each_rcu_flavor(rsp) {
+33     rdp = this_cpu_ptr(rsp->rda);
+34     if (rcu_segcblist_pend_cbs(&rdp->cblist))
+35       continue;
+36     rnp = rdp->mynode;
+37     raw_spin_lock_rcu_node(rnp);
+38     needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+39     raw_spin_unlock_rcu_node(rnp);
+40     if (needwake)
+41       rcu_gp_kthread_wake(rsp);
+42   }
+43 }
+
+ +

But the only part of rcu_prepare_for_idle() that really +matters for this discussion is lines 37–39. +We will therefore abbreviate this function as follows: + 

rcu_node-lock.svg + +

The box represents the rcu_node structure's ->lock +critical section, with the double line on top representing the additional +smp_mb__after_unlock_lock(). + +

+Tree RCU Grace Period Memory Ordering Components

+ +

Tree RCU's grace-period memory-ordering guarantee is provided by +a number of RCU components: + +

    +
  1. Callback Registry +
  2. Grace-Period Initialization +
  3. + Self-Reported Quiescent States +
  4. Dynamic Tick Interface +
  5. CPU-Hotplug Interface +
  6. Forcing Quiescent States +
  7. Grace-Period Cleanup +
  8. Callback Invocation +
+ +

Each of the following sections looks at the corresponding component +in detail. + 

Callback Registry

+ +

If RCU's grace-period guarantee is to mean anything at all, any +access that happens before a given invocation of call_rcu() +must also happen before the corresponding grace period. +The implementation of this portion of RCU's grace period guarantee +is shown in the following figure: + +

TreeRCU-callback-registry.svg + +

Because call_rcu() normally acts only on CPU-local state, +it provides no ordering guarantees, either for itself or for +phase one of the update (which again will usually be removal of +an element from an RCU-protected data structure). +It simply enqueues the rcu_head structure on a per-CPU list, +which cannot become associated with a grace period until a later +call to rcu_accelerate_cbs(), as shown in the diagram above. + +

One set of code paths shown on the left invokes +rcu_accelerate_cbs() via +note_gp_changes(), either directly from call_rcu() (if +the current CPU is inundated with queued rcu_head structures) +or more likely from an RCU_SOFTIRQ handler. +Another code path in the middle is taken only in kernels built with +CONFIG_RCU_FAST_NO_HZ=y, which invokes +rcu_accelerate_cbs() via rcu_prepare_for_idle(). +The final code path on the right is taken only in kernels built with +CONFIG_HOTPLUG_CPU=y, which invokes +rcu_accelerate_cbs() via +rcu_advance_cbs(), rcu_migrate_callbacks(), +rcutree_migrate_callbacks(), and takedown_cpu(), +which in turn is invoked on a surviving CPU after the outgoing +CPU has been completely offlined. + 

There are a few other code paths within grace-period processing +that opportunistically invoke rcu_accelerate_cbs(). +However, either way, all of the CPU's recently queued rcu_head +structures are associated with a future grace-period number under +the protection of the CPU's leaf rcu_node structure's +->lock. +In all cases, there is full ordering against any prior critical section +for that same rcu_node structure's ->lock, and +also full ordering against any of the current task's or CPU's prior critical +sections for any rcu_node structure's ->lock. + 

The next section will show how this ordering ensures that any +accesses prior to the call_rcu() (particularly including phase +one of the update) +happen before the start of the corresponding grace period. + + + + + + + + +
 
Quick Quiz:
+ But what about synchronize_rcu()? +
Answer:
+ The synchronize_rcu() passes call_rcu() + to wait_rcu_gp(), which invokes it. + So either way, it eventually comes down to call_rcu(). +
 
+ +
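Before moving on, note what registration looks like from the updater's side: the asynchronous counterpart of the earlier synchronize_rcu() sketch uses call_rcu(), which does nothing more than enqueue the callback on the current CPU's list, exactly as described above (hypothetical names again, and struct foo is assumed to have gained a struct rcu_head field named rh):

	static void free_foo_rcu(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct foo, rh));	/* phase two */
	}

	static void remove_foo_async(struct foo *p)
	{
		spin_lock(&foo_lock);
		list_del_rcu(&p->list);			/* phase one */
		spin_unlock(&foo_lock);
		call_rcu(&p->rh, free_foo_rcu);		/* register callback */
	}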

Grace-Period Initialization

+ +

Grace-period initialization is carried out by +the grace-period kernel thread, which makes several passes over the +rcu_node tree within the rcu_gp_init() function. +This means that showing the full flow of ordering through the +grace-period computation will require duplicating this tree. +If you find this confusing, please note that the state of the +rcu_node structures changes over time, just like Heraclitus's river. +However, to keep the rcu_node river tractable, the +grace-period kernel thread's traversals are presented in multiple +parts, starting in this section with the various phases of +grace-period initialization. + 

The first ordering-related grace-period initialization action is to +increment the rcu_state structure's ->gpnum +grace-period-number counter, as shown below: + +

TreeRCU-gp-init-1.svg + +

The actual increment is carried out using smp_store_release(), +which helps reject false-positive RCU CPU stall detection. +Note that only the root rcu_node structure is touched. + +
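The ordering-relevant step is tiny (a sketch, simplified from rcu_gp_init()):

	/* Advance to a new grace period.  The release store orders the
	 * prior recording of the grace-period start time before the
	 * counter update, which helps reject false-positive RCU CPU
	 * stall detection. */
	smp_store_release(&rsp->gpnum, rsp->gpnum + 1);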

The first pass through the rcu_node tree updates bitmasks +based on CPUs having come online or gone offline since the start of +the previous grace period. +In the common case where the number of online CPUs for this rcu_node +structure has not transitioned to or from zero, +this pass will scan only the leaf rcu_node structures. +However, if the number of online CPUs for a given leaf rcu_node +structure has transitioned from zero, +rcu_init_new_rnp() will be invoked for the first incoming CPU. +Similarly, if the number of online CPUs for a given leaf rcu_node +structure has transitioned to zero, +rcu_cleanup_dead_rnp() will be invoked for the last outgoing CPU. +The diagram below shows the path of ordering if the leftmost +rcu_node structure onlines its first CPU and if the next +rcu_node structure has no online CPUs +(or, alternatively if the leftmost rcu_node structure offlines +its last CPU and if the next rcu_node structure has no online CPUs). + +

TreeRCU-gp-init-2.svg + 

The final rcu_gp_init() pass through the rcu_node +tree traverses breadth-first, setting each rcu_node structure's +->gpnum field to the newly incremented value from the +rcu_state structure, as shown in the following diagram. + +

TreeRCU-gp-init-3.svg + 

This change will also cause each CPU's next call to +__note_gp_changes() +to notice that a new grace period has started, as described in the next +section. +But because the grace-period kthread started the grace period at the +root (with the increment of the rcu_state structure's +->gpnum field) before setting each leaf rcu_node +structure's ->gpnum field, each CPU's observation of +the start of the grace period will happen after the actual start +of the grace period. + + + + + + + + +
 
Quick Quiz:
+ But what about the CPU that started the grace period? + Why wouldn't it see the start of the grace period right when + it started that grace period? +
Answer:
+ In some deep philosophical and overly anthropomorphized + sense, yes, the CPU starting the grace period is immediately + aware of having done so. + However, if we instead assume that RCU is not self-aware, + then even the CPU starting the grace period does not really + become aware of the start of this grace period until its + first call to __note_gp_changes(). + On the other hand, this CPU potentially gets early notification + because it invokes __note_gp_changes() during its + last rcu_gp_init() pass through its leaf + rcu_node structure. +
 
+ +

+Self-Reported Quiescent States

+ +

When all entities that might block the grace period have reported +quiescent states (or as described in a later section, had quiescent +states reported on their behalf), the grace period can end. +Online non-idle CPUs report their own quiescent states, as shown +in the following diagram: + +

TreeRCU-qs.svg + +

This is for the last CPU to report a quiescent state, which signals +the end of the grace period. +Earlier quiescent states would push up the rcu_node tree +only until they encountered an rcu_node structure that +is waiting for additional quiescent states. +However, ordering is nevertheless preserved because some later quiescent +state will acquire that rcu_node structure's ->lock. + +

Any number of events can lead up to a CPU invoking +note_gp_changes() (or alternatively, directly invoking +__note_gp_changes()), at which point that CPU will notice +the start of a new grace period while holding its leaf +rcu_node lock. +Therefore, all execution shown in this diagram happens after the +start of the grace period. +In addition, this CPU will consider any RCU read-side critical +section that started before the invocation of __note_gp_changes() +to have started before the grace period, and thus a critical +section that the grace period must wait on. + 
 
Quick Quiz:
+ But an RCU read-side critical section might have started + after the beginning of the grace period + (the ->gpnum++ from earlier), so why should + the grace period wait on such a critical section? +
Answer:
+ It is indeed not necessary for the grace period to wait on such + a critical section. + However, it is permissible to wait on it. + And it is furthermore important to wait on it, as this + lazy approach is far more scalable than a “big bang” + all-at-once grace-period start could possibly be. +
 
+ +

If the CPU does a context switch, a quiescent state will be +noted by rcu_note_context_switch() on the left. +On the other hand, if the CPU takes a scheduler-clock interrupt +while executing in usermode, a quiescent state will be noted by +rcu_check_callbacks() on the right. +Either way, the passage through a quiescent state will be noted +in a per-CPU variable. + 

The next time an RCU_SOFTIRQ handler executes on +this CPU (for example, after the next scheduler-clock +interrupt), __rcu_process_callbacks() will invoke +rcu_check_quiescent_state(), which will notice the +recorded quiescent state, and invoke +rcu_report_qs_rdp(). +If rcu_report_qs_rdp() verifies that the quiescent state +really does apply to the current grace period, it invokes +rcu_report_qs_rnp(), which traverses up the rcu_node +tree as shown at the bottom of the diagram, clearing bits from +each rcu_node structure's ->qsmask field, +and propagating up the tree when the result is zero. + 

Note that traversal passes upwards out of a given rcu_node +structure only if the current CPU is reporting the last quiescent +state for the subtree headed by that rcu_node structure. +A key point is that if a CPU's traversal stops at a given rcu_node +structure, then there will be a later traversal by another CPU +(or perhaps the same one) that proceeds upwards +from that point, and the rcu_node ->lock +guarantees that the first CPU's quiescent state happens before the +remainder of the second CPU's traversal. +Applying this line of thought repeatedly shows that all CPUs' +quiescent states happen before the last CPU traverses through +the root rcu_node structure, the “last CPU” +being the one that clears the last bit in the root rcu_node +structure's ->qsmask field. + +
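In outline, the upward pass looks like this (a simplified sketch of rcu_report_qs_rnp(); the real function also performs grace-period-number checks and the final report to the rcu_state structure):

	unsigned long flags;
	unsigned long mask = rdp->grpmask;	/* CPU's bit in the leaf */
	struct rcu_node *rnp = rdp->mynode;

	for (;;) {
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		rnp->qsmask &= ~mask;
		if (rnp->qsmask != 0 || rnp->parent == NULL) {
			/* Other QSes still outstanding, or at the root. */
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
			break;
		}
		mask = rnp->grpmask;		/* our bit in the parent */
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		rnp = rnp->parent;
	}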

Dynamic Tick Interface

+ +

Due to energy-efficiency considerations, RCU is forbidden from +disturbing idle CPUs. +CPUs are therefore required to notify RCU when entering or leaving idle +state, which they do via fully ordered value-returning atomic operations +on a per-CPU variable. +The ordering effects are as shown below: + +

TreeRCU-dyntick.svg + +

The RCU grace-period kernel thread samples the per-CPU idleness +variable while holding the corresponding CPU's leaf rcu_node +structure's ->lock. +This means that any RCU read-side critical sections that precede the +idle period (the oval near the top of the diagram above) will happen +before the end of the current grace period. +Similarly, the beginning of the current grace period will happen before +any RCU read-side critical sections that follow the +idle period (the oval near the bottom of the diagram above). + +

Plumbing this into the full grace-period execution is described +below. + +

CPU-Hotplug Interface

+ +

RCU is also forbidden from disturbing offline CPUs, which might well +be powered off and removed from the system completely. +CPUs are therefore required to notify RCU of their comings and goings +as part of the corresponding CPU hotplug operations. +The ordering effects are shown below: + +

TreeRCU-hotplug.svg + +

Because CPU hotplug operations are much less frequent than idle transitions, +they are heavier weight, and thus acquire the CPU's leaf rcu_node +structure's ->lock and update this structure's +->qsmaskinitnext. +The RCU grace-period kernel thread samples this mask to detect CPUs +having gone offline since the beginning of this grace period. + +
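For example, the online path reduces to roughly the following (a sketch based on rcu_cpu_starting(); the offline path clears the bit instead):

	struct rcu_node *rnp = rdp->mynode;	/* incoming CPU's leaf */
	unsigned long mask = rdp->grpmask;	/* its bit in that leaf */
	unsigned long flags;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rnp->qsmaskinitnext |= mask;		/* CPU is now online */
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);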

Plumbing this into the full grace-period execution is described +below. + +

Forcing Quiescent States

+ +

As noted above, idle and offline CPUs cannot report their own +quiescent states, and therefore the grace-period kernel thread +must do the reporting on their behalf. +This process is called “forcing quiescent states”; it is +repeated every few jiffies, and its ordering effects are shown below: + 

TreeRCU-gp-fqs.svg + +

Each pass of quiescent state forcing is guaranteed to traverse the +leaf rcu_node structures, and if there are no new quiescent +states due to recently idled and/or offlined CPUs, then only the +leaves are traversed. +However, if there is a newly offlined CPU as illustrated on the left +or a newly idled CPU as illustrated on the right, the corresponding +quiescent state will be driven up towards the root. +As with self-reported quiescent states, the upwards driving stops +once it reaches an rcu_node structure that has quiescent +states outstanding from other CPUs. + + + + + + + + +
 
Quick Quiz:
+ The leftmost drive to root stopped before it reached + the root rcu_node structure, which means that + there are still CPUs subordinate to that structure on + which the current grace period is waiting. + Given that, how is it possible that the rightmost drive + to root ended the grace period? +
Answer:
+ Good analysis! + It is in fact impossible in the absence of bugs in RCU. + But this diagram is complex enough as it is, so simplicity + overrode accuracy. + You can think of it as poetic license, or you can think of + it as misdirection that is resolved in the + stitched-together diagram. +
 
+ +
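In terms of the toy counter introduced earlier, the grace-period kthread's idle detection amounts to the following sketch (again an illustrative assumption, not the kernel's exact code):

	static int toy_snap;			/* per-CPU in reality */

	static void toy_save_progress_counter(void)	/* first scan */
	{
		toy_snap = atomic_read(&toy_dynticks);
	}

	static bool toy_in_eqs_since(void)		/* later scans */
	{
		/* Even snapshot: the CPU was idle at the first scan.
		 * Changed value: it passed through idle since then.
		 * Either way, it cannot be within a pre-existing RCU
		 * read-side critical section. */
		return !(toy_snap & 1) ||
		       atomic_read(&toy_dynticks) != toy_snap;
	}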

Grace-Period Cleanup

+ +

Grace-period cleanup first scans the rcu_node tree +breadth-first, setting all the ->completed fields equal +to the number of the newly completed grace period, then it sets +the rcu_state structure's ->completed field, +again to the number of the newly completed grace period. +The ordering effects are shown below: + 

TreeRCU-gp-cleanup.svg + +

As indicated by the oval at the bottom of the diagram, once +grace-period cleanup is complete, the next grace period can begin. + + + + + + + + +
 
Quick Quiz:
+ But when precisely does the grace period end? +
Answer:
+ There is no useful single point at which the grace period + can be said to end. + The earliest reasonable candidate is as soon as the last + CPU has reported its quiescent state, but it may be some + milliseconds before RCU becomes aware of this. + The latest reasonable candidate is once the rcu_state + structure's ->completed field has been updated, + but it is quite possible that some CPUs have already completed + phase two of their updates by that time. + In short, if you are going to work with RCU, you need to + learn to embrace uncertainty. +
 
+ + +
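The ordering-relevant steps of cleanup reduce to roughly the following (a sketch of rcu_gp_cleanup(); the real function interleaves considerably more bookkeeping):

	rcu_for_each_node_breadth_first(rsp, rnp) {
		raw_spin_lock_irq_rcu_node(rnp);
		WRITE_ONCE(rnp->completed, rsp->gpnum);	/* per-node first... */
		raw_spin_unlock_irq_rcu_node(rnp);
	}
	WRITE_ONCE(rsp->completed, rsp->gpnum);		/* ...then globally. */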

Callback Invocation

+ +

Once a given CPU's leaf rcu_node structure's +->completed field has been updated, that CPU can begin +invoking its RCU callbacks that were waiting for this grace period +to end. +These callbacks are identified by rcu_advance_cbs(), +which is usually invoked by __note_gp_changes(). +As shown in the diagram below, this invocation can be triggered by +the scheduling-clock interrupt (rcu_check_callbacks() on +the left) or by idle entry (rcu_cleanup_after_idle() on +the right, but only for kernels built with +CONFIG_RCU_FAST_NO_HZ=y). +Either way, RCU_SOFTIRQ is raised, which results in +rcu_do_batch() invoking the callbacks, which in turn +allows those callbacks to carry out (either directly or indirectly +via wakeup) the needed phase-two processing for each update. + 

TreeRCU-callback-invocation.svg + +

Please note that callback invocation can also be prompted by any +number of corner-case code paths, for example, when a CPU notes that +it has excessive numbers of callbacks queued. +In all cases, the CPU acquires its leaf rcu_node structure's +->lock before invoking callbacks, which preserves the +required ordering against the newly completed grace period. + +

However, if the callback function communicates to other CPUs, +for example, doing a wakeup, then it is that function's responsibility +to maintain ordering. +For example, if the callback function wakes up a task that runs on +some other CPU, proper ordering must be in place in both the callback +function and the task being awakened. +To see why this is important, consider the top half of the +grace-period cleanup diagram. +The callback might be running on a CPU corresponding to the leftmost +leaf rcu_node structure, and awaken a task that is to run on +a CPU corresponding to the rightmost leaf rcu_node structure, +and the grace-period kernel thread might not yet have reached the +rightmost leaf. +In this case, the grace period's memory ordering might not yet have +reached that CPU, so again the callback function and the awakened +task must supply proper ordering. + 

Putting It All Together

+ +

A stitched-together diagram is +here. + +

+Legal Statement

+ +

This work represents the view of the author and does not necessarily +represent the view of IBM. + +

Linux is a registered trademark of Linus Torvalds. + +

Other company, product, and service names may be trademarks or +service marks of others. + +

[The remaining twelve diffs in this patch create the SVG diagrams listed in the diffstat above: TreeRCU-callback-invocation.svg, TreeRCU-callback-registry.svg, TreeRCU-dyntick.svg, TreeRCU-gp-cleanup.svg, TreeRCU-gp-fqs.svg, TreeRCU-gp-init-1.svg, TreeRCU-gp-init-2.svg, TreeRCU-gp-init-3.svg, TreeRCU-gp.svg, TreeRCU-hotplug.svg, TreeRCU-qs.svg, and rcu_node-lock.svg. The SVG markup did not survive extraction; only the diagrams' text labels remain, naming the functions and fields discussed above, for example rcu_gp_init(), rcu_gp_fqs(), rcu_gp_cleanup(), rcu_report_qs_rnp(), dyntick_save_progress_counter(), rcu_implicit_dynticks_qs(), rcu_cpu_starting(), rcu_report_dead(), ->qsmask &= ~->grpmask, ->qsmaskinit, ->qsmaskinitnext, ->gpnum = rsp->gpnum, and ->completed = ->gpnum.]

From dfa0ee48ef86f79430d2be9d1e2e1b509abb3cce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Aug 2017 10:16:29 -0700 Subject: [PATCH 0291/1085] documentation: Long-running irq handlers can stall RCU grace periods If a periodic interrupt's handler takes longer to execute than the period between successive interrupts, RCU's kthreads and softirq handlers can be prevented from executing, resulting in otherwise inexplicable RCU CPU stall warnings. This commit therefore calls out this possibility in Documentation/RCU/stallwarn.txt. Reported-by: Daniel Lezcano Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 96a3d81837e1..21b8913acbdf 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -40,7 +40,9 @@ o Booting Linux using a console connection that is too slow to o Anything that prevents RCU's grace-period kthreads from running. This can result in the "All QSes seen" console-log message. This message will include information on when the kthread last - ran and how often it should be expected to run. + ran and how often it should be expected to run. It can also + result in the "rcu_.*kthread starved for" console-log message, + which will include additional debugging information. o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might happen to preempt a low-priority task in the middle of an RCU @@ -60,6 +62,14 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that CONFIG_PREEMPT_RCU case, you might see stall-warning messages. +o A periodic interrupt whose handler takes longer than the time + interval between successive pairs of interrupts. This can + prevent RCU's kthreads and softirq handlers from running. + Note that certain high-overhead debugging options, for example + the function_graph tracer, can result in interrupt handlers taking + considerably longer than normal, which can in turn result in + RCU CPU stall warnings. + o A hardware or software issue shuts off the scheduler-clock interrupt on a CPU that is not in dyntick-idle mode. This problem really has happened, and seems to be most likely to From 3d916a443e97169a3d88765c4e0b07ac813f439f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Aug 2017 14:33:17 -0700 Subject: [PATCH 0292/1085] documentation: Slow systems can stall RCU grace periods If a fast system has a worst-case grace-period duration of (say) ten seconds, then running the same workload on a system ten times as slow will get you an RCU CPU stall warning given default stall-warning timeout settings. This commit therefore adds this possibility to stallwarn.txt. Reported-by: Daniel Lezcano Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 21b8913acbdf..238acbd94917 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -70,6 +70,12 @@ o A periodic interrupt whose handler takes longer than the time considerably longer than normal, which can in turn result in RCU CPU stall warnings. 
+o Testing a workload on a fast system, tuning the stall-warning + timeout down to just barely avoid RCU CPU stall warnings, and then + running the same workload with the same stall-warning timeout on a + slow system. Note that thermal throttling and on-demand governors + can cause a single system to be sometimes fast and sometimes slow! + o A hardware or software issue shuts off the scheduler-clock interrupt on a CPU that is not in dyntick-idle mode. This problem really has happened, and seems to be most likely to From d3cf5176d0b17610b7c7f1562e496ec401fe01f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Aug 2017 12:29:22 -0700 Subject: [PATCH 0293/1085] documentation: Update RCU CPU stall warning messages The RCU CPU stall warnings have morphed significantly since the last update, so this commit brings the documentation up to date. Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 180 ++++++++++++++++++-------------- 1 file changed, 104 insertions(+), 76 deletions(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 238acbd94917..a08f928c8557 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -171,67 +171,32 @@ Interpreting RCU's CPU Stall-Detector "Splats" For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, it will print a message similar to the following: -INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies) + INFO: rcu_sched detected stalls on CPUs/tasks: + 2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0 + 16-...: (0 ticks this GP) idle=81c/0/0 softirq=764/764 fqs=0 + (detected by 32, t=2603 jiffies, g=7073, c=7072, q=625) -This message indicates that CPU 5 detected that it was causing a stall, -and that the stall was affecting RCU-sched. This message will normally be -followed by a stack dump of the offending CPU. On TREE_RCU kernel builds, -RCU and RCU-sched are implemented by the same underlying mechanism, -while on PREEMPT_RCU kernel builds, RCU is instead implemented -by rcu_preempt_state. - -On the other hand, if the offending CPU fails to print out a stall-warning -message quickly enough, some other CPU will print a message similar to -the following: - -INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies) - -This message indicates that CPU 2 detected that CPUs 3 and 5 were both -causing stalls, and that the stall was affecting RCU-bh. This message +This message indicates that CPU 32 detected that CPUs 2 and 16 were both +causing stalls, and that the stall was affecting RCU-sched. This message will normally be followed by stack dumps for each CPU. Please note that -PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, -and that the tasks will be indicated by PID, for example, "P3421". -It is even possible for a rcu_preempt_state stall to be caused by both -CPUs -and- tasks, in which case the offending CPUs and tasks will all -be called out in the list. +PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that +the tasks will be indicated by PID, for example, "P3421". It is even +possible for a rcu_preempt_state stall to be caused by both CPUs -and- +tasks, in which case the offending CPUs and tasks will all be called +out in the list. 
-Finally, if the grace period ends just as the stall warning starts -printing, there will be a spurious stall-warning message: - -INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies) - -This is rare, but does happen from time to time in real life. It is also -possible for a zero-jiffy stall to be flagged in this case, depending -on how the stall warning and the grace-period initialization happen to -interact. Please note that it is not possible to entirely eliminate this -sort of false positive without resorting to things like stop_machine(), -which is overkill for this sort of problem. - -Recent kernels will print a long form of the stall-warning message: - - INFO: rcu_preempt detected stall on CPU - 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543 - (t=65000 jiffies) - -In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed: - - INFO: rcu_preempt detected stall on CPU - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D - (t=65000 jiffies) - -The "(64628 ticks this GP)" indicates that this CPU has taken more -than 64,000 scheduling-clock interrupts during the current stalled -grace period. If the CPU was not yet aware of the current grace -period (for example, if it was offline), then this part of the message -indicates how many grace periods behind the CPU is. +CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with +the RCU core for the past three grace periods. In contrast, CPU 16's "(0 +ticks this GP)" indicates that this CPU has not taken any scheduling-clock +interrupts during the current stalled grace period. The "idle=" portion of the message prints the dyntick-idle state. The hex number before the first "/" is the low-order 12 bits of the -dynticks counter, which will have an even-numbered value if the CPU is -in dyntick-idle mode and an odd-numbered value otherwise. The hex -number between the two "/"s is the value of the nesting, which will -be a small positive number if in the idle loop and a very large positive -number (as shown above) otherwise. +dynticks counter, which will have an even-numbered value if the CPU +is in dyntick-idle mode and an odd-numbered value otherwise. The hex +number between the two "/"s is the value of the nesting, which will be +a small non-negative number if in the idle loop (as shown above) and a +very large positive number otherwise. The "softirq=" portion of the message tracks the number of RCU softirq handlers that the stalled CPU has executed. The number before the "/" @@ -246,24 +211,72 @@ handlers are no longer able to execute on this CPU. This can happen if the stalled CPU is spinning with interrupts are disabled, or, in -rt kernels, if a high-priority process is starving RCU's softirq handler. -For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the -low-order 16 bits (in hex) of the jiffies counter when this CPU last -invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked -rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:" -prints the number of non-lazy callbacks posted since the last call to -rcu_needs_cpu(). Finally, an "L" indicates that there are currently -no non-lazy callbacks ("." is printed otherwise, as shown above) and -"D" indicates that dyntick-idle processing is enabled ("." is printed -otherwise, for example, if disabled via the "nohz=" kernel boot parameter). 
+The "fps=" shows the number of force-quiescent-state idle/offline +detection passes that the grace-period kthread has made across this +CPU since the last time that this CPU noted the beginning of a grace +period. + +The "detected by" line indicates which CPU detected the stall (in this +case, CPU 32), how many jiffies have elapsed since the start of the +grace period (in this case 2603), the number of the last grace period +to start and to complete (7073 and 7072, respectively), and an estimate +of the total number of RCU callbacks queued across all CPUs (625 in +this case). + +In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed +for each CPU: + + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D + +The "last_accelerate:" prints the low-order 16 bits (in hex) of the +jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() +from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from +rcu_prepare_for_idle(). The "nonlazy_posted:" prints the number +of non-lazy callbacks posted since the last call to rcu_needs_cpu(). +Finally, an "L" indicates that there are currently no non-lazy callbacks +("." is printed otherwise, as shown above) and "D" indicates that +dyntick-idle processing is enabled ("." is printed otherwise, for example, +if disabled via the "nohz=" kernel boot parameter). + +If the grace period ends just as the stall warning starts printing, +there will be a spurious stall-warning message, which will include +the following: + + INFO: Stall ended before state dump start + +This is rare, but does happen from time to time in real life. It is also +possible for a zero-jiffy stall to be flagged in this case, depending +on how the stall warning and the grace-period initialization happen to +interact. Please note that it is not possible to entirely eliminate this +sort of false positive without resorting to things like stop_machine(), +which is overkill for this sort of problem. + +If all CPUs and tasks have passed through quiescent states, but the +grace period has nevertheless failed to end, the stall-warning splat +will include something like the following: + + All QSes seen, last rcu_preempt kthread activity 23807 (4297905177-4297881370), jiffies_till_next_fqs=3, root ->qsmask 0x0 + +The "23807" indicates that it has been more than 23 thousand jiffies +since the grace-period kthread ran. The "jiffies_till_next_fqs" +indicates how frequently that kthread should run, giving the number +of jiffies between force-quiescent-state scans, in this case three, +which is way less than 23807. Finally, the root rcu_node structure's +->qsmask field is printed, which will normally be zero. If the relevant grace-period kthread has been unable to run prior to -the stall warning, the following additional line is printed: +the stall warning, as was the case in the "All QSes seen" line above, +the following additional line is printed: - rcu_preempt kthread starved for 2023 jiffies! + kthread starved for 23807 jiffies! g7073 c7072 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 -Starving the grace-period kthreads of CPU time can of course result in -RCU CPU stall warnings even when all CPUs and tasks have passed through -the required quiescent states. +Starving the grace-period kthreads of CPU time can of course result +in RCU CPU stall warnings even when all CPUs and tasks have passed +through the required quiescent states. 
The "g" and "c" numbers flag the +number of the last grace period started and completed, respectively, +the "f" precedes the ->gp_flags command to the grace-period kthread, +the "RCU_GP_WAIT_FQS" indicates that the kthread is waiting for a short +timeout, and the "state" precedes value of the task_struct ->state field. Multiple Warnings From One Stall @@ -280,13 +293,28 @@ Stall Warnings for Expedited Grace Periods If an expedited grace period detects a stall, it will place a message like the following in dmesg: - INFO: rcu_sched detected expedited stalls on CPUs: { 1 2 6 } 26009 jiffies s: 1043 + INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 7-... } 21119 jiffies s: 73 root: 0x2/. -This indicates that CPUs 1, 2, and 6 have failed to respond to a -reschedule IPI, that the expedited grace period has been going on for -26,009 jiffies, and that the expedited grace-period sequence counter is -1043. The fact that this last value is odd indicates that an expedited -grace period is in flight. +This indicates that CPU 7 has failed to respond to a reschedule IPI. +The three periods (".") following the CPU number indicate that the CPU +is online (otherwise the first period would instead have been "O"), +that the CPU was online at the beginning of the expedited grace period +(otherwise the second period would have instead been "o"), and that +the CPU has been online at least once since boot (otherwise, the third +period would instead have been "N"). The number before the "jiffies" +indicates that the expedited grace period has been going on for 21,119 +jiffies. The number following the "s:" indicates that the expedited +grace-period sequence counter is 73. The fact that this last value is +odd indicates that an expedited grace period is in flight. The number +following "root:" is a bitmask that indicates which children of the root +rcu_node structure correspond to CPUs and/or tasks that are blocking the +current expedited grace period. If the tree had more than one level, +additional hex numbers would be printed for the states of the other +rcu_node structures in the tree. + +As with normal grace periods, PREEMPT_RCU builds can be stalled by +tasks as well as by CPUs, and that the tasks will be indicated by PID, +for example, "P3421". It is entirely possible to see stall warnings from normal and from -expedited grace periods at about the same time from the same run. +expedited grace periods at about the same time during the same run. From f1ab25a30ce81f4e9be3cb33cd9bb9fb2db64b28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Aug 2017 15:49:21 -0700 Subject: [PATCH 0294/1085] memory-barriers: Replace uses of "transitive" The current version of memory-barriers.txt misuses the term "transitive", so this commit replaces it with multi-copy atomic, also adding a definition of this term. Reported-by: Alan Stern Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 175 +++++++++++++++--------------- 1 file changed, 86 insertions(+), 89 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index b759a60624fd..b6882680247e 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -53,7 +53,7 @@ CONTENTS - SMP barrier pairing. - Examples of memory barrier sequences. - Read memory barriers vs load speculation. - - Transitivity + - Multicopy atomicity. (*) Explicit kernel barriers. 
@@ -635,6 +635,11 @@ can be used to record rare error conditions and the like, and the CPUs' naturally occurring ordering prevents such records from being lost. +Note well that the ordering provided by a data dependency is local to +the CPU containing it. See the section on "Multicopy atomicity" for +more information. + + The data dependency barrier is very important to the RCU system, for example. See rcu_assign_pointer() and rcu_dereference() in include/linux/rcupdate.h. This permits the current target of an RCU'd @@ -851,38 +856,11 @@ In short, control dependencies apply only to the stores in the then-clause and else-clause of the if-statement in question (including functions invoked by those two clauses), not to code following that if-statement. -Finally, control dependencies do -not- provide transitivity. This is -demonstrated by two related examples, with the initial values of -'x' and 'y' both being zero: - CPU 0 CPU 1 - ======================= ======================= - r1 = READ_ONCE(x); r2 = READ_ONCE(y); - if (r1 > 0) if (r2 > 0) - WRITE_ONCE(y, 1); WRITE_ONCE(x, 1); +Note well that the ordering provided by a control dependency is local +to the CPU containing it. See the section on "Multicopy atomicity" +for more information. - assert(!(r1 == 1 && r2 == 1)); - -The above two-CPU example will never trigger the assert(). However, -if control dependencies guaranteed transitivity (which they do not), -then adding the following CPU would guarantee a related assertion: - - CPU 2 - ===================== - WRITE_ONCE(x, 2); - - assert(!(r1 == 2 && r2 == 1 && x == 2)); /* FAILS!!! */ - -But because control dependencies do -not- provide transitivity, the above -assertion can fail after the combined three-CPU example completes. If you -need the three-CPU example to provide ordering, you will need smp_mb() -between the loads and stores in the CPU 0 and CPU 1 code fragments, -that is, just before or just after the "if" statements. Furthermore, -the original two-CPU example is very fragile and should be avoided. - -These two examples are the LB and WWC litmus tests from this paper: -http://www.cl.cam.ac.uk/users/pes20/ppc-supplemental/test6.pdf and this -site: https://www.cl.cam.ac.uk/~pes20/ppcmem/index.html. In summary: @@ -922,8 +900,8 @@ In summary: (*) Control dependencies pair normally with other types of barriers. - (*) Control dependencies do -not- provide transitivity. If you - need transitivity, use smp_mb(). + (*) Control dependencies do -not- provide multicopy atomicity. If you + need all the CPUs to see a given store at the same time, use smp_mb(). (*) Compilers do not understand control dependencies. It is therefore your job to ensure that they do not break your code. @@ -936,13 +914,14 @@ When dealing with CPU-CPU interactions, certain types of memory barrier should always be paired. A lack of appropriate pairing is almost certainly an error. General barriers pair with each other, though they also pair with most -other types of barriers, albeit without transitivity. An acquire barrier -pairs with a release barrier, but both may also pair with other barriers, -including of course general barriers. A write barrier pairs with a data -dependency barrier, a control dependency, an acquire barrier, a release -barrier, a read barrier, or a general barrier. Similarly a read barrier, -control dependency, or a data dependency barrier pairs with a write -barrier, an acquire barrier, a release barrier, or a general barrier: +other types of barriers, albeit without multicopy atomicity. 
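For instance, the classic producer/consumer pairing puts a write barrier
on one CPU and a read barrier on the other; a sketch, with x and y
initially zero and r1/r2 as illustrative local variables:

	CPU 1				CPU 2
	===============================	===============================
	WRITE_ONCE(x, 1);
	smp_wmb();
	WRITE_ONCE(y, 1);
					r1 = READ_ONCE(y);
					smp_rmb();
					r2 = READ_ONCE(x);

Here the smp_wmb()/smp_rmb() pairing forbids the outcome (r1 == 1 &&
r2 == 0), but, as the surrounding text explains, it makes no promise
about the order in which a third CPU sees the two stores.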
An acquire
+barrier pairs with a release barrier, but both may also pair with other
+barriers, including of course general barriers. A write barrier pairs
+with a data dependency barrier, a control dependency, an acquire barrier,
+a release barrier, a read barrier, or a general barrier. Similarly a
+read barrier, control dependency, or a data dependency barrier pairs
+with a write barrier, an acquire barrier, a release barrier, or a
+general barrier:

	CPU 1		      CPU 2
	===============	      ===============
@@ -1359,64 +1338,77 @@ the speculation will be cancelled and the value reloaded:
	retrieved				:	:	+-------+

-TRANSITIVITY
-------------
+MULTICOPY ATOMICITY
+-------------------

-Transitivity is a deeply intuitive notion about ordering that is not
-always provided by real computer systems. The following example
-demonstrates transitivity:
+Multicopy atomicity is a deeply intuitive notion about ordering that is
+not always provided by real computer systems, namely that a given store
+is visible at the same time to all CPUs, or, alternatively, that all
+CPUs agree on the order in which all stores took place. However, use of
+full multicopy atomicity would rule out valuable hardware optimizations,
+so a weaker form called ``other multicopy atomicity'' instead guarantees
+that a given store is observed at the same time by all -other- CPUs. The
+remainder of this document discusses this weaker form, but for brevity
+will call it simply ``multicopy atomicity''.
+
+The following example demonstrates multicopy atomicity:

	CPU 1			CPU 2			CPU 3
	=======================	=======================	=======================
		{ X = 0, Y = 0 }
-	STORE X=1		LOAD X			STORE Y=1
-				<general barrier>	<general barrier>
-				LOAD Y			LOAD X
+	STORE X=1		r1=LOAD X (reads 1)	LOAD Y (reads 1)
+				<general barrier>	<read barrier>
+				STORE Y=r1		LOAD X

-Suppose that CPU 2's load from X returns 1 and its load from Y returns 0.
-This indicates that CPU 2's load from X in some sense follows CPU 1's
-store to X and that CPU 2's load from Y in some sense preceded CPU 3's
-store to Y. The question is then "Can CPU 3's load from X return 0?"
+Suppose that CPU 2's load from X returns 1 which it then stores to Y and
+that CPU 3's load from Y returns 1. This indicates that CPU 2's load
+from X in some sense follows CPU 1's store to X and that CPU 2's store
+to Y in some sense preceded CPU 3's load from Y. The question is then
+"Can CPU 3's load from X return 0?"

-Because CPU 2's load from X in some sense came after CPU 1's store, it
+Because CPU 3's load from X in some sense came after CPU 2's load, it
 is natural to expect that CPU 3's load from X must therefore return 1.
-This expectation is an example of transitivity: if a load executing on
-CPU A follows a load from the same variable executing on CPU B, then
-CPU A's load must either return the same value that CPU B's load did,
-or must return some later value.
+This expectation is an example of multicopy atomicity: if a load executing
+on CPU A follows a load from the same variable executing on CPU B, then
+an understandable but incorrect expectation is that CPU A's load must
+either return the same value that CPU B's load did, or must return some
+later value.

-In the Linux kernel, use of general memory barriers guarantees
-transitivity. Therefore, in the above example, if CPU 2's load from X
-returns 1 and its load from Y returns 0, then CPU 3's load from X must
-also return 1.
+In the Linux kernel, the above use of a general memory barrier compensates
+for any lack of multicopy atomicity.
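Rendered as kernel-style C, that example reads roughly as follows (a
sketch; cpu1(), cpu2(), and cpu3() are illustrative names for the code
running on the corresponding CPUs):

	int x, y;

	void cpu1(void)
	{
		WRITE_ONCE(x, 1);
	}

	void cpu2(void)
	{
		int r1;

		r1 = READ_ONCE(x);
		smp_mb();	/* the <general barrier> above */
		WRITE_ONCE(y, r1);
	}

	void cpu3(void)
	{
		int r2, r3;

		r2 = READ_ONCE(y);
		smp_rmb();	/* the <read barrier> above */
		r3 = READ_ONCE(x);
	}

Given r1 == 1 and r2 == 1, the general barrier forbids r3 == 0.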
Therefore, in the above example,
+if CPU 2's load from X returns 1 and its load from Y returns 0, and CPU 3's
+load from Y returns 1, then CPU 3's load from X must also return 1.

-However, transitivity is -not- guaranteed for read or write barriers.
-For example, suppose that CPU 2's general barrier in the above example
-is changed to a read barrier as shown below:
+However, dependencies, read barriers, and write barriers are not always
+able to compensate for non-multicopy atomicity. For example, suppose
+that CPU 2's general barrier is removed from the above example, leaving
+only the data dependency shown below:

	CPU 1			CPU 2			CPU 3
	=======================	=======================	=======================
		{ X = 0, Y = 0 }
-	STORE X=1		LOAD X			STORE Y=1
-				<read barrier>		<general barrier>
-				LOAD Y			LOAD X
+	STORE X=1		r1=LOAD X (reads 1)	LOAD Y (reads 1)
+				<data dependency>	<read barrier>
+				STORE Y=r1		LOAD X (reads 0)

-This substitution destroys transitivity: in this example, it is perfectly
-legal for CPU 2's load from X to return 1, its load from Y to return 0,
-and CPU 3's load from X to return 0.
+This substitution allows non-multicopy atomicity to run rampant: in
+this example, it is perfectly legal for CPU 2's load from X to return 1,
+CPU 3's load from Y to return 1, and its load from X to return 0.

-The key point is that although CPU 2's read barrier orders its pair
-of loads, it does not guarantee to order CPU 1's store. Therefore, if
-this example runs on a system where CPUs 1 and 2 share a store buffer
-or a level of cache, CPU 2 might have early access to CPU 1's writes.
-General barriers are therefore required to ensure that all CPUs agree
-on the combined order of CPU 1's and CPU 2's accesses.
+The key point is that although CPU 2's data dependency orders its load
+and store, it does not guarantee to order CPU 1's store. Therefore,
+if this example runs on a non-multicopy-atomic system where CPUs 1 and 2
+share a store buffer or a level of cache, CPU 2 might have early access
+to CPU 1's writes. A general barrier is therefore required to ensure
+that all CPUs agree on the combined order of CPU 1's and CPU 2's accesses.

-General barriers provide "global transitivity", so that all CPUs will
-agree on the order of operations. In contrast, a chain of release-acquire
-pairs provides only "local transitivity", so that only those CPUs on
-the chain are guaranteed to agree on the combined order of the accesses.
-For example, switching to C code in deference to Herman Hollerith:
+General barriers can compensate not only for non-multicopy atomicity,
+but can also generate additional ordering that can ensure that -all-
+CPUs will perceive the same order of -all- operations. In contrast, a
+chain of release-acquire pairs do not provide this additional ordering,
+which means that only those CPUs on the chain are guaranteed to agree
+on the combined order of the accesses.
For example, switching to C code +in deference to the ghost of Herman Hollerith: int u, v, x, y, z; @@ -1448,9 +1440,9 @@ For example, switching to C code in deference to Herman Hollerith: r3 = READ_ONCE(u); } -Because cpu0(), cpu1(), and cpu2() participate in a local transitive -chain of smp_store_release()/smp_load_acquire() pairs, the following -outcome is prohibited: +Because cpu0(), cpu1(), and cpu2() participate in a chain of +smp_store_release()/smp_load_acquire() pairs, the following outcome +is prohibited: r0 == 1 && r1 == 1 && r2 == 1 @@ -1460,9 +1452,9 @@ outcome is prohibited: r1 == 1 && r5 == 0 -However, the transitivity of release-acquire is local to the participating -CPUs and does not apply to cpu3(). Therefore, the following outcome -is possible: +However, the ordering provided by a release-acquire chain is local +to the CPUs participating in that chain and does not apply to cpu3(), +at least aside from stores. Therefore, the following outcome is possible: r0 == 0 && r1 == 1 && r2 == 1 && r3 == 0 && r4 == 0 @@ -1490,8 +1482,8 @@ following outcome is possible: Note that this outcome can happen even on a mythical sequentially consistent system where nothing is ever reordered. -To reiterate, if your code requires global transitivity, use general -barriers throughout. +To reiterate, if your code requires full ordering of all operations, +use general barriers throughout. ======================== @@ -3101,6 +3093,9 @@ AMD64 Architecture Programmer's Manual Volume 2: System Programming Chapter 7.1: Memory-Access Ordering Chapter 7.4: Buffering and Combining Memory Writes +ARM Architecture Reference Manual (ARMv8, for ARMv8-A architecture profile) + Chapter B2: The AArch64 Application Level Memory Model + IA-32 Intel Architecture Software Developer's Manual, Volume 3: System Programming Guide Chapter 7.1: Locked Atomic Operations @@ -3112,6 +3107,8 @@ The SPARC Architecture Manual, Version 9 Appendix D: Formal Specification of the Memory Models Appendix J: Programming with the Memory Models +Storage in the PowerPC (Stone and Fitzgerald) + UltraSPARC Programmer Reference Manual Chapter 5: Memory Accesses and Cacheability Chapter 15: Sparc-V9 Memory Models From 0902b1f44a72558aece92f074154044861681f84 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 1 Sep 2017 07:53:34 -0700 Subject: [PATCH 0295/1085] memory-barriers: Rework multicopy-atomicity section Signed-off-by: Alan Stern Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 56 ++++++++++++++++--------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index b6882680247e..7deee1441640 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1343,13 +1343,13 @@ MULTICOPY ATOMICITY Multicopy atomicity is a deeply intuitive notion about ordering that is not always provided by real computer systems, namely that a given store -is visible at the same time to all CPUs, or, alternatively, that all -CPUs agree on the order in which all stores took place. However, use of -full multicopy atomicity would rule out valuable hardware optimizations, -so a weaker form called ``other multicopy atomicity'' instead guarantees -that a given store is observed at the same time by all -other- CPUs. The -remainder of this document discusses this weaker form, but for brevity -will call it simply ``multicopy atomicity''. 
+becomes visible at the same time to all CPUs, or, alternatively, that all +CPUs agree on the order in which all stores become visible. However, +support of full multicopy atomicity would rule out valuable hardware +optimizations, so a weaker form called ``other multicopy atomicity'' +instead guarantees only that a given store becomes visible at the same +time to all -other- CPUs. The remainder of this document discusses this +weaker form, but for brevity will call it simply ``multicopy atomicity''. The following example demonstrates multicopy atomicity: @@ -1360,24 +1360,26 @@ The following example demonstrates multicopy atomicity: STORE Y=r1 LOAD X -Suppose that CPU 2's load from X returns 1 which it then stores to Y and -that CPU 3's load from Y returns 1. This indicates that CPU 2's load -from X in some sense follows CPU 1's store to X and that CPU 2's store -to Y in some sense preceded CPU 3's load from Y. The question is then -"Can CPU 3's load from X return 0?" +Suppose that CPU 2's load from X returns 1, which it then stores to Y, +and CPU 3's load from Y returns 1. This indicates that CPU 1's store +to X precedes CPU 2's load from X and that CPU 2's store to Y precedes +CPU 3's load from Y. In addition, the memory barriers guarantee that +CPU 2 executes its load before its store, and CPU 3 loads from Y before +it loads from X. The question is then "Can CPU 3's load from X return 0?" -Because CPU 3's load from X in some sense came after CPU 2's load, it +Because CPU 3's load from X in some sense comes after CPU 2's load, it is natural to expect that CPU 3's load from X must therefore return 1. -This expectation is an example of multicopy atomicity: if a load executing -on CPU A follows a load from the same variable executing on CPU B, then -an understandable but incorrect expectation is that CPU A's load must -either return the same value that CPU B's load did, or must return some -later value. +This expectation follows from multicopy atomicity: if a load executing +on CPU B follows a load from the same variable executing on CPU A (and +CPU A did not originally store the value which it read), then on +multicopy-atomic systems, CPU B's load must return either the same value +that CPU A's load did or some later value. However, the Linux kernel +does not require systems to be multicopy atomic. -In the Linux kernel, the above use of a general memory barrier compensates -for any lack of multicopy atomicity. Therefore, in the above example, -if CPU 2's load from X returns 1 and its load from Y returns 0, and CPU 3's -load from Y returns 1, then CPU 3's load from X must also return 1. +The use of a general memory barrier in the example above compensates +for any lack of multicopy atomicity. In the example, if CPU 2's load +from X returns 1 and CPU 3's load from Y returns 1, then CPU 3's load +from X must indeed also return 1. However, dependencies, read barriers, and write barriers are not always able to compensate for non-multicopy atomicity. For example, suppose @@ -1396,11 +1398,11 @@ this example, it is perfectly legal for CPU 2's load from X to return 1, CPU 3's load from Y to return 1, and its load from X to return 0. The key point is that although CPU 2's data dependency orders its load -and store, it does not guarantee to order CPU 1's store. Therefore, -if this example runs on a non-multicopy-atomic system where CPUs 1 and 2 -share a store buffer or a level of cache, CPU 2 might have early access -to CPU 1's writes. 
A general barrier is therefore required to ensure -that all CPUs agree on the combined order of CPU 1's and CPU 2's accesses. +and store, it does not guarantee to order CPU 1's store. Thus, if this +example runs on a non-multicopy-atomic system where CPUs 1 and 2 share a +store buffer or a level of cache, CPU 2 might have early access to CPU 1's +writes. General barriers are therefore required to ensure that all CPUs +agree on the combined order of multiple accesses. General barriers can compensate not only for non-multicopy atomicity, but can also generate additional ordering that can ensure that -all- From 135bd1a230bb69a68c9808a7d25467318900b80a Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 7 Aug 2017 11:20:10 +0530 Subject: [PATCH 0296/1085] rcu: Fix up pending cbs check in rcu_prepare_for_idle The pending-callbacks check in rcu_prepare_for_idle() is backwards. It should accelerate if there are pending callbacks, but the check rather uselessly accelerates only if there are no callbacks. This commit therefore inverts this check. Fixes: 15fecf89e46a ("srcu: Abstract multi-tail callback list handling") Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Cc: # 4.12.x --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..fed95fa941e6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1507,7 +1507,7 @@ static void rcu_prepare_for_idle(void) rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (rcu_segcblist_pend_cbs(&rdp->cblist)) + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) continue; rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ From c63eb17ff06dbcf73e771b9b425c531cc0a9c17b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Aug 2017 12:37:07 -0700 Subject: [PATCH 0297/1085] rcu: Create call_rcu_tasks() kthread at boot time Currently the call_rcu_tasks() kthread is created upon first invocation of call_rcu_tasks(). This has the advantage of avoiding creation if there are never any invocations of call_rcu_tasks() and of synchronize_rcu_tasks(), but it requires an unreliable heuristic to determine when it is safe to create the kthread. For example, it is not safe to create the kthread when call_rcu_tasks() is invoked with a spinlock held, but there is no good way to detect this in !PREEMPT kernels. This commit therefore creates this kthread unconditionally at core_initcall() time. If you don't want this kthread created, then build with CONFIG_TASKS_RCU=n. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/update.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..e9bbedbb8745 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -575,7 +575,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -static void rcu_spawn_tasks_kthread(void); static struct task_struct *rcu_tasks_kthread_ptr; /** @@ -600,7 +599,6 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; - bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -610,11 +608,8 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ - if ((needwake && havetask) || - (!havetask && !irqs_disabled_flags(flags))) { - rcu_spawn_tasks_kthread(); + if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) wake_up(&rcu_tasks_cbs_wq); - } } EXPORT_SYMBOL_GPL(call_rcu_tasks); @@ -853,27 +848,18 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */ -static void rcu_spawn_tasks_kthread(void) +/* Spawn rcu_tasks_kthread() at core_initcall() time. */ +static int __init rcu_spawn_tasks_kthread(void) { - static DEFINE_MUTEX(rcu_tasks_kthread_mutex); struct task_struct *t; - if (READ_ONCE(rcu_tasks_kthread_ptr)) { - smp_mb(); /* Ensure caller sees full kthread. */ - return; - } - mutex_lock(&rcu_tasks_kthread_mutex); - if (rcu_tasks_kthread_ptr) { - mutex_unlock(&rcu_tasks_kthread_mutex); - return; - } t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); BUG_ON(IS_ERR(t)); smp_mb(); /* Ensure others see full kthread. */ WRITE_ONCE(rcu_tasks_kthread_ptr, t); - mutex_unlock(&rcu_tasks_kthread_mutex); + return 0; } +core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) From 6733bab7bc09b67668028dab562caea1b4ff3c69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Aug 2017 10:59:16 -0700 Subject: [PATCH 0298/1085] irq_work: Map irq_work_on_queue() to irq_work_on() in !SMP Commit 478850160636 ("irq_work: Implement remote queueing") provides irq_work_on_queue() only for SMP builds. However, providing it simplifies code that submits irq_work to lists of CPUs, eliminating the !SMP special cases. This commit therefore maps irq_work_on_queue() to irq_work_on() in !SMP builds, but validating the specified CPU. Signed-off-by: Paul E. 
McKenney --- include/linux/irq_work.h | 3 --- kernel/irq_work.c | 9 ++++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 47b9ebd4a74f..d8c9876d5da4 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -33,10 +33,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } bool irq_work_queue(struct irq_work *work); - -#ifdef CONFIG_SMP bool irq_work_queue_on(struct irq_work *work, int cpu); -#endif void irq_work_tick(void); void irq_work_sync(struct irq_work *work); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index bcf107ce0854..9f20f6c72579 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void) */ } -#ifdef CONFIG_SMP /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. @@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); +#ifdef CONFIG_SMP + /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); @@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) arch_send_call_function_single_ipi(cpu); +#else /* #ifdef CONFIG_SMP */ + irq_work_queue(work); +#endif /* #else #ifdef CONFIG_SMP */ + return true; } -EXPORT_SYMBOL_GPL(irq_work_queue_on); -#endif /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) From 7c2102e56a3f7d85b5d8f33efbd7aecc1f36fdd8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Sep 2017 08:54:40 -0700 Subject: [PATCH 0299/1085] sched: Make resched_cpu() unconditional The current implementation of synchronize_sched_expedited() incorrectly assumes that resched_cpu() is unconditional, which it is not. This means that synchronize_sched_expedited() can hang when resched_cpu()'s trylock fails as follows (analysis by Neeraj Upadhyay): o CPU1 is waiting for expedited wait to complete: sync_rcu_exp_select_cpus rdp->exp_dynticks_snap & 0x1 // returns 1 for CPU5 IPI sent to CPU5 synchronize_sched_expedited_wait ret = swait_event_timeout(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); expmask = 0x20, CPU 5 in idle path (in cpuidle_enter()) o CPU5 handles IPI and fails to acquire rq lock. Handles IPI sync_sched_exp_handler resched_cpu returns while failing to try lock acquire rq->lock need_resched is not set o CPU5 calls rcu_idle_enter() and as need_resched is not set, goes to idle (schedule() is not called). o CPU 1 reports RCU stall. Given that resched_cpu() is now used only by RCU, this commit fixes the assumption by making resched_cpu() unconditional. Reported-by: Neeraj Upadhyay Suggested-by: Neeraj Upadhyay Signed-off-by: Paul E. 
McKenney Acked-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..8fa7b6f9e19b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,8 +505,7 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; + raw_spin_lock_irqsave(&rq->lock, flags); resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } From f79c3ad6189624c3de0ad5521610c9e22a1c33cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Nov 2016 06:24:30 -0800 Subject: [PATCH 0300/1085] sched,rcu: Make cond_resched() provide RCU quiescent state There is some confusion as to which of cond_resched() or cond_resched_rcu_qs() should be added to long in-kernel loops. This commit therefore eliminates the decision by adding RCU quiescent states to cond_resched(). This commit also simplifies the code that used to interact with cond_resched_rcu_qs(), and that now interacts with cond_resched(), to reduce its overhead. This reduction is necessary to allow the heavier-weight cond_resched_rcu_qs() mechanism to be invoked everywhere that cond_resched() is invoked. Part of that reduction in overhead converts the jiffies_till_sched_qs kernel parameter to read-only at runtime, thus eliminating the need for bounds checking. Reported-by: Michal Hocko Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra [ paulmck: Keep PREEMPT=n cond_resched a no-op, per Peter Zijlstra. ] --- kernel/rcu/tree.c | 25 +++++-------------------- kernel/sched/core.c | 1 + 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0ad62b0e7b8..0dda57a28276 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -534,8 +534,8 @@ module_param(rcu_kick_kthreads, bool, 0644); * How long the grace period must be before we start recruiting * quiescent-state help from rcu_note_context_switch(). */ -static ulong jiffies_till_sched_qs = HZ / 20; -module_param(jiffies_till_sched_qs, ulong, 0644); +static ulong jiffies_till_sched_qs = HZ / 10; +module_param(jiffies_till_sched_qs, ulong, 0444); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); @@ -1235,7 +1235,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) unsigned long jtsq; bool *rnhqp; bool *ruqp; - unsigned long rjtsc; struct rcu_node *rnp; /* @@ -1252,23 +1251,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* Compute and saturate jiffies_till_sched_qs. */ - jtsq = jiffies_till_sched_qs; - rjtsc = rcu_jiffies_till_stall_check(); - if (jtsq > rjtsc / 2) { - WRITE_ONCE(jiffies_till_sched_qs, rjtsc); - jtsq = rjtsc / 2; - } else if (jtsq < 1) { - WRITE_ONCE(jiffies_till_sched_qs, 1); - jtsq = 1; - } - /* * Has this CPU encountered a cond_resched_rcu_qs() since the * beginning of the grace period? For this to be the case, * the CPU has to have noticed the current grace period. This * might not be the case for nohz_full CPUs looping in the kernel. 
*/ + jtsq = jiffies_till_sched_qs; rnp = rdp->mynode; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && @@ -1276,7 +1265,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); return 1; - } else { + } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ smp_store_release(ruqp, true); } @@ -1304,10 +1293,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * updates are only once every few jiffies, the probability of * lossage (and thus of slight grace-period extension) is * quite low. - * - * Note that if the jiffies_till_sched_qs boot/sysfs parameter - * is set too high, we override with half of the RCU CPU stall - * warning delay. */ rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && @@ -1316,7 +1301,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ + rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */ } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..904d3ab35c83 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4842,6 +4842,7 @@ int __sched _cond_resched(void) preempt_schedule_common(); return 1; } + rcu_all_qs(); return 0; } EXPORT_SYMBOL(_cond_resched); From 9b9500da81502738efa1b485a8835f174ff7be6d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Aug 2017 17:05:59 -0700 Subject: [PATCH 0301/1085] rcu: Make RCU CPU stall warnings check for irq-disabled CPUs One common question upon seeing an RCU CPU stall warning is "did the stalled CPUs have interrupts disabled?" However, the current stall warnings are silent on this point. This commit therefore uses irq_work to check whether stalled CPUs still respond to IPIs, and flags this state in the RCU CPU stall warning console messages. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 104 ++++++++++++++++++++++++++++++++++----- kernel/rcu/tree.h | 5 ++ kernel/rcu/tree_plugin.h | 7 ++- 3 files changed, 103 insertions(+), 13 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0dda57a28276..12838a9a128e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1206,6 +1206,22 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; } +/* + * We are reporting a quiescent state on behalf of some other CPU, so + * it is our responsibility to check for and handle potential overflow + * of the rcu_node ->gpnum counter with respect to the rcu_data counters. + * After all, the CPU might be in deep idle state, and thus executing no + * code whatsoever. + */ +static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) +{ + lockdep_assert_held(&rnp->lock); + if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) + WRITE_ONCE(rdp->gpwrap, true); + if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) + rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; +} + /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. 
Return 1 if this CPU @@ -1216,14 +1232,33 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, - rdp->mynode->gpnum)) - WRITE_ONCE(rdp->gpwrap, true); + rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } return 0; } +/* + * Handler for the irq_work request posted when a grace period has + * gone on for too long, but not yet long enough for an RCU CPU + * stall warning. Set state appropriately, but just complain if + * there is unexpected state on entry. + */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + /* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks @@ -1235,7 +1270,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) unsigned long jtsq; bool *rnhqp; bool *ruqp; - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1248,6 +1283,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; + rcu_gpnum_ovf(rnp, rdp); return 1; } @@ -1258,12 +1294,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * might not be the case for nohz_full CPUs looping in the kernel. */ jtsq = jiffies_till_sched_qs; - rnp = rdp->mynode; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + rcu_gpnum_ovf(rnp, rdp); return 1; } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ @@ -1274,6 +1310,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; + rcu_gpnum_ovf(rnp, rdp); return 1; } @@ -1305,11 +1342,22 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * If more than halfway to RCU CPU stall-warning time, do - * a resched_cpu() to try to loosen things up a bit. + * If more than halfway to RCU CPU stall-warning time, do a + * resched_cpu() to try to loosen things up a bit. Also check to + * see if the CPU is getting hammered with interrupts, but only + * once per grace period, just to keep the IPIs down to a dull roar. 
*/ - if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { resched_cpu(rdp->cpu); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum && + (rnp->ffmask & rdp->grpmask)) { + init_irq_work(&rdp->rcu_iw, rcu_iw_handler); + rdp->rcu_iw_pending = true; + rdp->rcu_iw_gpnum = rnp->gpnum; + irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); + } + } return 0; } @@ -1498,6 +1546,7 @@ static void print_cpu_stall(struct rcu_state *rsp) { int cpu; unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1513,7 +1562,9 @@ static void print_cpu_stall(struct rcu_state *rsp) */ pr_err("INFO: %s self-detected stall on CPU", rsp->name); print_cpu_stall_info_begin(); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); print_cpu_stall_info(rsp, smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, @@ -1907,6 +1958,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); + rcu_gpnum_ovf(rnp, rdp); } return ret; } @@ -3685,6 +3737,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; + rdp->rcu_iw_pending = false; + rdp->rcu_iw_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -3722,10 +3776,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) */ int rcutree_online_cpu(unsigned int cpu) { - sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask |= rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_online_cpu(cpu); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return 0; /* Too early in boot for scheduler work. */ + sync_sched_exp_online_cleanup(cpu); + rcutree_affinity_setting(cpu, -1); return 0; } @@ -3735,6 +3803,19 @@ int rcutree_online_cpu(unsigned int cpu) */ int rcutree_offline_cpu(unsigned int cpu) { + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + rcutree_affinity_setting(cpu, cpu); if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_offline_cpu(cpu); @@ -4183,8 +4264,7 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); + rcutree_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e1f285f0a70..46a5d1991450 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -103,6 +103,7 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. 
*/
+	unsigned long ffmask;		/* Fully functional CPUs. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
 				/*  Only one bit will be set in this mask. */
 	int	grplo;		/* lowest-numbered CPU or group here. */
@@ -285,6 +286,10 @@ struct rcu_data {
 	/* 8) RCU CPU stall data. */
 	unsigned int softirq_snap;	/* Snapshot of softirq activity. */
+	/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
+	struct irq_work rcu_iw;	/* Check for non-irq activity. */
+	bool rcu_iw_pending;	/* Is ->rcu_iw pending? */
+	unsigned long rcu_iw_gpnum;	/* ->gpnum associated with ->rcu_iw. */

 	int cpu;
 	struct rcu_state *rsp;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..14977d0470d1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1671,6 +1671,7 @@ static void print_cpu_stall_info_begin(void)
  */
 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 {
+	unsigned long delta;
 	char fast_no_hz[72];
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_dynticks *rdtp = rdp->dynticks;
@@ -1685,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 		ticks_value = rsp->gpnum - rdp->gpnum;
 	}
 	print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
-	pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+	delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum;
+	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
 	       cpu,
 	       "O."[!!cpu_online(cpu)],
 	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
 	       "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+	       !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+			"!."[!delta],
 	       ticks_value, ticks_title,
 	       rcu_dynticks_snap(rdtp) & 0xfff,
 	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
From 83b6ca1fede773eebcdfb44f5a94eb410d48b886 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 31 Aug 2017 16:47:08 -0700
Subject: [PATCH 0302/1085] rcu: Turn off tracing before dumping trace

Currently, RCU allows tracing to continue when it automatically does
ftrace_dump() after detecting an error condition, which can result in
excessively large traces and lost trace events. This commit therefore
does a tracing_off() before any of these ftrace_dump() calls.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/rcu.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index e4b43fef89f5..b8729eb09a5d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -220,8 +220,10 @@ do { \
 	static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
 \
 	if (!atomic_read(&___rfd_beenhere) && \
-	    !atomic_xchg(&___rfd_beenhere, 1)) \
+	    !atomic_xchg(&___rfd_beenhere, 1)) { \
+		tracing_off(); \
 		ftrace_dump(oops_dump_mode); \
+	} \
 } while (0)

 void rcu_early_boot_tests(void);
From f22ce09157239aab08eae99c678ef664f71a9097 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 1 Sep 2017 14:40:54 -0700
Subject: [PATCH 0303/1085] rcu: Suppress RCU CPU stall warnings while dumping trace

Currently, RCU emits RCU CPU stall warnings during its automatically
initiated ftrace_dump() calls after detecting an error condition, which
can result in excessive console output and lost trace events. This
commit therefore suppresses RCU CPU stall warnings across any of these
ftrace_dump() calls.

Signed-off-by: Paul E.
McKenney --- kernel/rcu/rcu.h | 17 +++++++++++++++++ kernel/rcu/update.c | 1 + 2 files changed, 18 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index b8729eb09a5d..59c471de342a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -203,6 +203,21 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) extern int rcu_cpu_stall_suppress; int rcu_jiffies_till_stall_check(void); +#define rcu_ftrace_dump_stall_suppress() \ +do { \ + if (!rcu_cpu_stall_suppress) \ + rcu_cpu_stall_suppress = 3; \ +} while (0) + +#define rcu_ftrace_dump_stall_unsuppress() \ +do { \ + if (rcu_cpu_stall_suppress == 3) \ + rcu_cpu_stall_suppress = 0; \ +} while (0) + +#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ +#define rcu_ftrace_dump_stall_suppress() +#define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ /* @@ -222,7 +237,9 @@ do { \ if (!atomic_read(&___rfd_beenhere) && \ !atomic_xchg(&___rfd_beenhere, 1)) { \ tracing_off(); \ + rcu_ftrace_dump_stall_suppress(); \ ftrace_dump(oops_dump_mode); \ + rcu_ftrace_dump_stall_unsuppress(); \ } \ } while (0) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..3dc8efb16dc7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -494,6 +494,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); From 2b1516e55f8416acfb48d5f43d41222d180fb5a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Aug 2017 16:11:37 -0700 Subject: [PATCH 0304/1085] rcutorture: Add interrupt-disable capability to stall-warning tests When rcutorture sees the rcutorture.stall_cpu kernel boot parameter, it loops with preemption disabled, which does in fact normally generate an RCU CPU stall warning message. However, there are test scenarios that need the stalling CPU to have interrupts disabled. This commit therefore adds an rcutorture.stall_cpu_irqsoff kernel boot parameter that causes the stalling CPU to disable interrupts. Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 3 +++ kernel/rcu/rcutorture.c | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05496622b4ef..bc94c47085a5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3539,6 +3539,9 @@ rcutorture.stall_cpu_holdoff= [KNL] Time to wait (s) after boot before inducing stall. + rcutorture.stall_cpu_irqsoff= [KNL] + Disable interrupts while stalling if set. + rcutorture.stat_interval= [KNL] Time (s) between statistics printk()s. 
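For example, a stall test with interrupts disabled could be requested
with kernel boot parameters along these lines (the numeric values are
illustrative only):

	rcutorture.stall_cpu=22 rcutorture.stall_cpu_holdoff=10 rcutorture.stall_cpu_irqsoff=1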
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45f2ffbc1e78..0273bc0a8586 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -89,6 +89,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1357,7 +1358,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " - "stall_cpu=%d stall_cpu_holdoff=%d " + "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1365,7 +1366,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, - stall_cpu, stall_cpu_holdoff, + stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1430,12 +1431,19 @@ static int rcu_torture_stall(void *args) if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - pr_alert("rcu_torture_stall start.\n"); rcu_read_lock(); - preempt_disable(); + if (stall_cpu_irqsoff) + local_irq_disable(); + else + preempt_disable(); + pr_alert("rcu_torture_stall start on CPU %d.\n", + smp_processor_id()); while (ULONG_CMP_LT(get_seconds(), stop_at)) continue; /* Induce RCU CPU stall warning. */ - preempt_enable(); + if (stall_cpu_irqsoff) + local_irq_enable(); + else + preempt_enable(); rcu_read_unlock(); pr_alert("rcu_torture_stall end.\n"); } From 0032f4e889764d22ccccb6a15742071d6f0d1f5a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Aug 2017 10:40:17 -0700 Subject: [PATCH 0305/1085] rcutorture: Dump writer stack if stalled Right now, rcutorture warns if an rcu_torture_writer() kthread stalls, but this warning is not always all that helpful. This commit therefore makes the first such warning include a stack dump. This in turn requires that sched_show_task() be exported to GPL modules, so this commit makes that change as well. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 ++++++ kernel/sched/core.c | 1 + 2 files changed, 7 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0273bc0a8586..362eb2f78b3c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "rcu.h" @@ -1240,6 +1241,7 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; static unsigned long rtcv_snap = ULONG_MAX; + static bool splatted; struct task_struct *wtp; for_each_possible_cpu(cpu) { @@ -1325,6 +1327,10 @@ rcu_torture_stats_print(void) gpnum, completed, flags, wtp == NULL ? ~0UL : wtp->state, wtp == NULL ? 
-1 : (int)task_cpu(wtp)); + if (!splatted && wtp) { + sched_show_task(wtp); + splatted = true; + } show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..7ae0151dcc1d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5165,6 +5165,7 @@ void sched_show_task(struct task_struct *p) show_stack(p, NULL); put_task_stack(p); } +EXPORT_SYMBOL_GPL(sched_show_task); static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) From b038c58bd258522d9cde6619d730ebdb1f7be198 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Aug 2017 15:33:49 -0700 Subject: [PATCH 0306/1085] torture: Provide TMPDIR environment variable to specify tmpdir Both rcutorture and locktorture currently place temporary files in /tmp, in keeping with decades-long tradition. However, sometimes it is useful to specify an alternative temporary directory, for example, for space or performance reasons. This commit therefore causes the torture-test scripting to use the path specified in the TMPDIR environment variable, or to fall back to traditional /tmp if this variable is not set. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/config_override.sh | 2 +- tools/testing/selftests/rcutorture/bin/configcheck.sh | 2 +- tools/testing/selftests/rcutorture/bin/configinit.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm-build.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++-- tools/testing/selftests/rcutorture/bin/parse-build.sh | 2 +- tools/testing/selftests/rcutorture/bin/parse-torture.sh | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/config_override.sh b/tools/testing/selftests/rcutorture/bin/config_override.sh index 49fa51726ce3..ef7fcbac3d42 100755 --- a/tools/testing/selftests/rcutorture/bin/config_override.sh +++ b/tools/testing/selftests/rcutorture/bin/config_override.sh @@ -42,7 +42,7 @@ else exit 1 fi -T=/tmp/config_override.sh.$$ +T=${TMPDIR-/tmp}/config_override.sh.$$ trap 'rm -rf $T' 0 mkdir $T diff --git a/tools/testing/selftests/rcutorture/bin/configcheck.sh b/tools/testing/selftests/rcutorture/bin/configcheck.sh index 70fca318a82b..197deece7c7c 100755 --- a/tools/testing/selftests/rcutorture/bin/configcheck.sh +++ b/tools/testing/selftests/rcutorture/bin/configcheck.sh @@ -19,7 +19,7 @@ # # Authors: Paul E. McKenney -T=/tmp/abat-chk-config.sh.$$ +T=${TMPDIR-/tmp}/abat-chk-config.sh.$$ trap 'rm -rf $T' 0 mkdir $T diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh index 3f81a1095206..51f66a7ce876 100755 --- a/tools/testing/selftests/rcutorture/bin/configinit.sh +++ b/tools/testing/selftests/rcutorture/bin/configinit.sh @@ -32,7 +32,7 @@ # # Authors: Paul E. 
McKenney -T=/tmp/configinit.sh.$$ +T=${TMPDIR-/tmp}/configinit.sh.$$ trap 'rm -rf $T' 0 mkdir $T diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index 46752c164676..fb66d0173638 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -35,7 +35,7 @@ then exit 1 fi -T=/tmp/test-linux.sh.$$ +T=${TMPDIR-/tmp}/test-linux.sh.$$ trap 'rm -rf $T' 0 mkdir $T diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 0af36a721b9c..ab14b97c942c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -38,7 +38,7 @@ # # Authors: Paul E. McKenney -T=/tmp/kvm-test-1-run.sh.$$ +T=${TMPDIR-/tmp}/kvm-test-1-run.sh.$$ trap 'rm -rf $T' 0 mkdir $T diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index b55895fb10ed..ccd49e958fd2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -30,7 +30,7 @@ scriptname=$0 args="$*" -T=/tmp/kvm.sh.$$ +T=${TMPDIR-/tmp}/kvm.sh.$$ trap 'rm -rf $T' 0 mkdir $T @@ -222,7 +222,7 @@ do exit 1 fi done -sort -k2nr $T/cfgcpu > $T/cfgcpu.sort +sort -k2nr $T/cfgcpu -T="$T" > $T/cfgcpu.sort # Use a greedy bin-packing algorithm, sorting the list accordingly. awk < $T/cfgcpu.sort > $T/cfgcpu.pack -v ncpus=$cpus ' diff --git a/tools/testing/selftests/rcutorture/bin/parse-build.sh b/tools/testing/selftests/rcutorture/bin/parse-build.sh index a6b57622c2e5..24fe5f822b28 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-build.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-build.sh @@ -28,7 +28,7 @@ F=$1 title=$2 -T=/tmp/parse-build.sh.$$ +T=${TMPDIR-/tmp}/parse-build.sh.$$ trap 'rm -rf $T' 0 mkdir $T diff --git a/tools/testing/selftests/rcutorture/bin/parse-torture.sh b/tools/testing/selftests/rcutorture/bin/parse-torture.sh index e3c5f0705696..f12c38909b00 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-torture.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-torture.sh @@ -27,7 +27,7 @@ # # Authors: Paul E. McKenney -T=/tmp/parse-torture.sh.$$ +T=${TMPDIR-/tmp}/parse-torture.sh.$$ file="$1" title="$2" From dd7aa8d4b53b3484ba31ba56f3ff1be7deb38530 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 10 Oct 2017 10:43:18 +0200 Subject: [PATCH 0307/1085] spi: a3700: Change SPI mode before asserting chip-select The spi device mode should be configured in the controller before the chip-select is asserted, so that a clock polarity configuration change is not interpreted as a clock tick by the device. This patch moves the mode setting to the 'prepare_message' function instead of the 'transfer_one' function. By doing so, this patch also removes redundant code in a3700_spi_clock_set. This was tested on EspressoBin board, with spidev. 
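For reference, and as a summary rather than part of the patch: the CPOL
and CPHA bits being moved here define the standard SPI modes, which is
why changing them under an asserted chip-select can be misread by the
device:

	SPI_MODE_0: CPOL=0, CPHA=0 (clock idles low, sample on leading edge)
	SPI_MODE_1: CPOL=0, CPHA=1 (clock idles low, sample on trailing edge)
	SPI_MODE_2: CPOL=1, CPHA=0 (clock idles high, sample on leading edge)
	SPI_MODE_3: CPOL=1, CPHA=1 (clock idles high, sample on trailing edge)

In particular, a CPOL change moves the clock line's idle level, which a
device with chip-select already asserted can interpret as a clock tick.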
Signed-off-by: Maxime Chevallier Signed-off-by: Mark Brown --- drivers/spi/spi-armada-3700.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/spi/spi-armada-3700.c b/drivers/spi/spi-armada-3700.c index 568e1c65aa82..77fe55ce790c 100644 --- a/drivers/spi/spi-armada-3700.c +++ b/drivers/spi/spi-armada-3700.c @@ -213,7 +213,7 @@ static void a3700_spi_mode_set(struct a3700_spi *a3700_spi, } static void a3700_spi_clock_set(struct a3700_spi *a3700_spi, - unsigned int speed_hz, u16 mode) + unsigned int speed_hz) { u32 val; u32 prescale; @@ -231,17 +231,6 @@ static void a3700_spi_clock_set(struct a3700_spi *a3700_spi, val |= A3700_SPI_CLK_CAPT_EDGE; spireg_write(a3700_spi, A3700_SPI_IF_TIME_REG, val); } - - val = spireg_read(a3700_spi, A3700_SPI_IF_CFG_REG); - val &= ~(A3700_SPI_CLK_POL | A3700_SPI_CLK_PHA); - - if (mode & SPI_CPOL) - val |= A3700_SPI_CLK_POL; - - if (mode & SPI_CPHA) - val |= A3700_SPI_CLK_PHA; - - spireg_write(a3700_spi, A3700_SPI_IF_CFG_REG, val); } static void a3700_spi_bytelen_set(struct a3700_spi *a3700_spi, unsigned int len) @@ -423,7 +412,7 @@ static void a3700_spi_transfer_setup(struct spi_device *spi, a3700_spi = spi_master_get_devdata(spi->master); - a3700_spi_clock_set(a3700_spi, xfer->speed_hz, spi->mode); + a3700_spi_clock_set(a3700_spi, xfer->speed_hz); byte_len = xfer->bits_per_word >> 3; @@ -584,6 +573,8 @@ static int a3700_spi_prepare_message(struct spi_master *master, a3700_spi_bytelen_set(a3700_spi, 4); + a3700_spi_mode_set(a3700_spi, spi->mode); + return 0; } From 62cb1188ed86a9cf082fd2f757d4dd9b54741f24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Aug 2017 15:07:54 +0200 Subject: [PATCH 0308/1085] sched/idle: Move quiet_vmstat() into the NOHZ code quiet_vmstat() is an expensive function that only makes sense when we go into NOHZ. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: aubrey.li@linux.intel.com Cc: cl@linux.com Cc: fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 1 - kernel/time/tick-sched.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 257f4f0b4532..b2e8f0afbb42 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,7 +219,6 @@ static void do_idle(void) */ __current_set_polling(); - quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c7a899c5ce64..7b258c59d78a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -27,6 +27,7 @@ #include #include #include +#include <linux/vmstat.h> #include @@ -787,6 +788,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (!ts->tick_stopped) { calc_load_nohz_start(); cpu_load_update_nohz_start(); + quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; From 1d48b080bcce0a5e7d7aa2dbcdb35deefc188c3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Sep 2017 13:50:16 +0200 Subject: [PATCH 0309/1085] sched/debug: Rename task-state printing helpers Steve requested better names for the new task-state helper functions. So introduce the concept of task-state index for the printing and rename __get_task_state() to task_state_index() and __task_state_to_char() to task_index_to_char().
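A hedged userspace re-creation of the index/char pair, with GCC's __builtin_clz() standing in for the kernel's fls():

#include <stdio.h>

static unsigned int state_index(unsigned int state_mask)
{
	/* fls()-like: 0 stays 0 (running), else 1-based MSB position */
	return state_mask ? 32 - __builtin_clz(state_mask) : 0;
}

static char index_to_char(unsigned int idx)
{
	static const char state_char[] = "RSDTtXZPI";

	return state_char[idx];
}

int main(void)
{
	printf("%c %c %c\n",
	       index_to_char(state_index(0x0)),	 /* R: running */
	       index_to_char(state_index(0x1)),	 /* S: interruptible */
	       index_to_char(state_index(0x2))); /* D: uninterruptible */
	return 0;
}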
Requested-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929115016.pzlqc7ss3ccystyg@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- fs/proc/array.c | 2 +- include/linux/sched.h | 6 +++--- include/trace/events/sched.h | 2 +- kernel/trace/trace_output.c | 12 ++++++------ kernel/trace/trace_sched_wakeup.c | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 77a8eacbe032..1b5406ca8e4c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -137,7 +137,7 @@ static const char * const task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); - return task_state_array[__get_task_state(tsk)]; + return task_state_array[task_state_index(tsk)]; } static inline int get_task_umask(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index bdd6ad6fcce1..33a01f4deb00 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1248,7 +1248,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) -static inline unsigned int __get_task_state(struct task_struct *tsk) +static inline unsigned int task_state_index(struct task_struct *tsk) { unsigned int tsk_state = READ_ONCE(tsk->state); unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; @@ -1261,7 +1261,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) return fls(state); } -static inline char __task_state_to_char(unsigned int state) +static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; @@ -1272,7 +1272,7 @@ static inline char __task_state_to_char(unsigned int state) static inline char task_state_to_char(struct task_struct *tsk) { - return __task_state_to_char(__get_task_state(tsk)); + return task_index_to_char(task_state_index(tsk)); } /** diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 3c8b7f625670..fab74a12ca0f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -117,7 +117,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * if (preempt) return TASK_STATE_MAX; - return __get_task_state(p); + return task_state_index(p); } #endif /* CREATE_TRACE_POINTS */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c738e764e2a5..90db994ac900 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = __task_state_to_char(field->next_state); - S = __task_state_to_char(field->prev_state); + T = task_index_to_char(field->next_state); + S = task_index_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = __task_state_to_char(field->prev_state); - T = __task_state_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 
field->prev_pid, field->prev_prio, @@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = __task_state_to_char(field->prev_state); - T = __task_state_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0c331978b1a6..500f370d3bb1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = __get_task_state(prev); + entry->prev_state = task_state_index(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = __get_task_state(next); + entry->next_state = task_state_index(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = __get_task_state(curr); + entry->prev_state = task_state_index(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = __get_task_state(wakee); + entry->next_state = task_state_index(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) From e964d3501b64d6930aaa4dd18955a8cd086ccb92 Mon Sep 17 00:00:00 2001 From: luca abeni Date: Thu, 7 Sep 2017 12:09:28 +0200 Subject: [PATCH 0310/1085] sched/headers: Remove duplicate prototype of __dl_clear_params() Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-2-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e83d1b8be611..9d5aa18fc9bf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy, extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); -extern void __dl_clear_params(struct task_struct *p); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); From 295d6d5e373607729bcc8182c25afe964655714f Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 7 Sep 2017 12:09:29 +0200 Subject: [PATCH 0311/1085] sched/deadline: Fix switching to -deadline Fix a bug introduced in: 72f9f3fdc928 ("sched/deadline: Remove dl_new from struct sched_dl_entity") After that commit, when switching to -deadline if the scheduling deadline of a task is in the past then switched_to_dl() calls setup_new_entity() to properly initialize the scheduling deadline and runtime. 
The problem is that the task is enqueued _before_ having its parameters initialized by setup_new_entity(), and this can cause problems. For example, a task with its out-of-date deadline in the past will potentially be enqueued as the highest priority one; however, its adjusted deadline may not be the earliest one. This patch fixes the problem by initializing the task's parameters before enqueuing it. Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-3-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7667c3..9d45354e4296 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1364,6 +1364,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } else if ((flags & ENQUEUE_RESTORE) && + dl_time_before(dl_se->deadline, + rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); @@ -2255,13 +2259,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - /* - * If p is boosted we already updated its params in - * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), - * p's deadline being now already after rq_clock(rq). - */ - if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl); if (rq->curr != p) { #ifdef CONFIG_SMP From 8c0944cee7af55291df0b28e6e2eeac0930e93c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 12:09:30 +0200 Subject: [PATCH 0312/1085] sched/deadline: Rename __dl_clear() to __dl_sub() __dl_sub() is more meaningful as a name, and is more consistent with the naming of the dual function (__dl_add()). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-4-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 +++++----- kernel/sched/sched.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9d45354e4296..8d1b946fa684 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -242,7 +242,7 @@ static void task_non_contending(struct task_struct *p) if (p->state == TASK_DEAD) sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); } @@ -1209,7 +1209,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) } raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); @@ -2170,7 +2170,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. 
*/ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -2448,7 +2448,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && @@ -2460,7 +2460,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, * But this would require to set the task's "inactive * timer" when the task is not inactive. */ - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); dl_change_utilization(p, new_bw); err = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9d5aa18fc9bf..a81c9782e98c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -226,7 +226,7 @@ struct dl_bw { static inline void __dl_update(struct dl_bw *dl_b, s64 bw); static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; __dl_update(dl_b, (s32)tsk_bw / cpus); From 799ba82de01e7543f6b2042e1a739f3a20255f23 Mon Sep 17 00:00:00 2001 From: luca abeni Date: Thu, 7 Sep 2017 12:09:31 +0200 Subject: [PATCH 0313/1085] sched/deadline: Use C bitfields for the state flags Ask the compiler to use a single bit for storing true / false values, instead of wasting the size of a whole int value. Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected Assembly (similar to the Assembly code generated when explicitly accessing the bits with bitmasks, "&" and "|"). Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-5-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 33a01f4deb00..0f897dfc195e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -474,10 +474,10 @@ struct sched_dl_entity { * conditions between the inactive timer handler and the wakeup * code. */ - int dl_throttled; - int dl_boosted; - int dl_yielded; - int dl_non_contending; + int dl_throttled : 1; + int dl_boosted : 1; + int dl_yielded : 1; + int dl_non_contending : 1; /* * Bandwidth enforcement timer. Each -deadline task has its From ed4ad1ca08a53cf1a805478678d1e7ff0d2cf251 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Oct 2017 14:50:33 +0200 Subject: [PATCH 0314/1085] sched/topology: Restore SD_PREFER_SIBLING on MC domains The normal x86_topology on NHM+ machines degenerates because the MC and CPU domains are of the same size, therefore MC inherits SD_PREFER_SIBLING from CPU (which then gets taken out). The result is that we'll spread tasks across the first NUMA level in order to maximize cache utilization. However, for the x86_numa_in_package_topology we lose the CPU domain, and we'll not have SD_PREFER_SIBLING set anywhere, giving a distinct difference in behaviour.
Commit: 8e7fbcbc22c1 ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs") failed to preserve SD_PREFER_SIBLING for the !power_saving case on both CPU and MC. Then commit: 6956dc568f34 ("sched/numa: Add SD_PERFER_SIBLING to CPU domain") adds it back to the CPU but not MC. Restore that now, such that we get consistent spreading behaviour wrt L3 and NUMA. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f1cf4f306a82..86e81f06d36b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1157,6 +1157,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; From 051f3ca02e46432c0965e8948f00c07d8a2f09c0 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 7 Sep 2017 02:20:05 -0500 Subject: [PATCH 0315/1085] sched/topology: Introduce NUMA identity node sched domain On an AMD Family17h-based (EPYC) system, a logical NUMA node can contain up to 8 cores (16 threads) with the following topology:

   ----------------------------
   C0 | T0 T1 |    ||    | T0 T1 | C4
      --------|    ||    |--------
   C1 | T0 T1 | L3 || L3 | T0 T1 | C5
      --------|    ||    |--------
   C2 | T0 T1 | #0 || #1 | T0 T1 | C6
      --------|    ||    |--------
   C3 | T0 T1 |    ||    | T0 T1 | C7
   ----------------------------

Here, there are 2 last-level (L3) caches per logical NUMA node. A socket can contain up to 4 NUMA nodes, and a system can support up to 2 sockets. With a full system configuration, the current scheduler creates 4 sched domains:

  domain0 SMT (span a core)
  domain1 MC (span a last-level-cache)
  domain2 NUMA (span a socket: 4 nodes)
  domain3 NUMA (span a system: 8 nodes)

Note that there is no domain to represent cpus spanning a logical NUMA node. With this hierarchy of sched domains, the scheduler does not balance properly in the following cases:

Case 1: When running 8 tasks, a properly balanced system should schedule a task per logical NUMA node. This is not the case for the current scheduler.

Case 2: In some cases, threads are scheduled on the same cpu while other cpus are idle. This results in run-to-run inconsistency. For example:

  taskset -c 0-7 sysbench --num-threads=8 --test=cpu \
    --cpu-max-prime=100000 run

Total execution time ranges from 25.1s to 33.5s depending on thread placement, where 25.1s is when all 8 threads are balanced properly on 8 cpus.

Introduce a NUMA identity node sched domain, based on how the SRAT/SLIT tables define a logical NUMA node. This results in the following hierarchy of sched domains on the same system described above:

  domain0 SMT (span a core)
  domain1 MC (span a last-level-cache)
  domain2 NODE (span a logical NUMA node)
  domain3 NUMA (span a socket: 4 nodes)
  domain4 NUMA (span a system: 8 nodes)

This fixes the improper load balancing cases mentioned above.
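The bookkeeping change can be sketched in plain C: count the unique distances including the identity distance node_distance(i,i), which becomes the new NODE level 0 (the 4-node distance table below is invented for illustration):

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 16, 16, 32 },
	{ 16, 10, 32, 16 },
	{ 16, 32, 10, 16 },
	{ 32, 16, 16, 10 },
};

int main(void)
{
	int dist[NR_NODES * NR_NODES];
	int level = 0;

	dist[level++] = node_distance[0][0]; /* identity: the NODE level */

	for (int i = 0; i < NR_NODES; i++)
		for (int j = 0; j < NR_NODES; j++) {
			bool seen = false;

			for (int k = 0; k < level; k++)
				seen |= (dist[k] == node_distance[i][j]);
			if (!seen)
				dist[level++] = node_distance[i][j];
		}

	printf("levels incl. identity: %d\n", level); /* 3: 10, 16, 32 */
	return 0;
}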
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Link: http://lkml.kernel.org/r/1504768805-46716-1-git-send-email-suravee.suthikulpanit@amd.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 86e81f06d36b..f51d123f9fe1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1332,6 +1332,10 @@ void sched_init_numa(void) if (!sched_domains_numa_distance) return; + /* Includes NUMA identity node at level 0. */ + sched_domains_numa_distance[level++] = curr_distance; + sched_domains_numa_levels = level; + /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. @@ -1379,8 +1383,7 @@ void sched_init_numa(void) return; /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). + * 'level' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1441,10 +1444,19 @@ void sched_init_numa(void) for (i = 0; sched_domain_topology[i].mask; i++) tl[i] = sched_domain_topology[i]; + /* + * Add the NUMA identity distance, aka single NODE. + */ + tl[i++] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .numa_level = 0, + SD_INIT_NAME(NODE) + }; + /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 0; j < level; i++, j++) { + for (j = 1; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, From 93824900a2e242766f5fe6ae7697e3d7171aa234 Mon Sep 17 00:00:00 2001 From: Uladzislau Rezki Date: Wed, 13 Sep 2017 12:24:30 +0200 Subject: [PATCH 0316/1085] sched/fair: Search a task from the tail of the queue As a first step, this patch turns the cfs_tasks list into an MRU list: when a task is picked to run on a physical CPU, it is moved to the front of the list. Therefore, the cfs_tasks list is more or less sorted (except for woken tasks), from tasks recently given CPU time toward tasks with the maximum wait time in the run-queue. Second, as part of the load balance operation, this approach starts detach_tasks()/detach_one_task() from the tail of the queue instead of the head, giving some advantages:

  - it tends to pick a task with the highest wait time;
  - tasks located in the tail are less likely to be cache-hot, so a
    positive can_migrate_task() decision is more likely.

hackbench illustrates slightly better performance.
For example, doing 1000 samples with 40 groups on an i5-3320M CPU shows the figures below:

  default: 0.657 avg
  patched: 0.646 avg

Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Mike Galbraith Cc: Nicolas Pitre Cc: Oleg Nesterov Cc: Oleksiy Avramchenko Cc: Paul Turner Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20170913102430.8985-2-urezki@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ac6602c5f36f..cc0bfb035df9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6628,10 +6628,7 @@ again: set_next_entity(cfs_rq, se); } - if (hrtick_enabled(rq)) - hrtick_start_fair(rq, p); - - return p; + goto done; simple: #endif @@ -6645,6 +6642,16 @@ simple: p = task_of(se); +done: __maybe_unused +#ifdef CONFIG_SMP + /* + * Move the next running task to the front of + * the list, so our cfs_tasks list becomes MRU + * one. + */ + list_move(&p->se.group_node, &rq->cfs_tasks); +#endif + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); @@ -7080,11 +7087,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) */ static struct task_struct *detach_one_task(struct lb_env *env) { - struct task_struct *p, *n; + struct task_struct *p; lockdep_assert_held(&env->src_rq->lock); - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + list_for_each_entry_reverse(p, + &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; @@ -7130,7 +7138,7 @@ static int detach_tasks(struct lb_env *env) if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; - p = list_first_entry(tasks, struct task_struct, se.group_node); + p = list_last_entry(tasks, struct task_struct, se.group_node); env->loop++; /* We've more or less seen every task there is, call it quits */ @@ -7180,7 +7188,7 @@ static int detach_tasks(struct lb_env *env) continue; next: - list_move_tail(&p->se.group_node, tasks); + list_move(&p->se.group_node, tasks); } /* From ea16f0ea6c3dc9e1aa083bd3d1792ba02860526e Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 11:55:51 +0100 Subject: [PATCH 0317/1085] sched/fair: Sync task util before slow-path wakeup We use task_util() in find_idlest_group() via capacity_spare_wake(). This task_util() is updated in wake_cap(). However, wake_cap() is not the only reason for ending up in find_idlest_group() - we could have been sent there by wake_wide(). So explicitly sync the task util with prev_cpu when we are about to head to find_idlest_group(). We could simply do this at the beginning of select_task_rq_fair() (i.e. irrespective of whether we're heading to select_idle_sibling() or find_idlest_group() & co), but I didn't want to slow down the select_idle_sibling() path more than necessary. Don't do this during fork balancing; we won't need the task_util, and we'd just clobber the last_update_time, which is supposed to be 0.
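Why the sync matters can be shown with a loose userspace model of a decaying utilization signal; the 32ms half-life and the numbers are illustrative, not the kernel's PELT constants (link with -lm):

#include <math.h>
#include <stdio.h>

struct toy_se { double util_avg; double last_update_ms; };

static void sync_load_avg(struct toy_se *se, double now_ms)
{
	double halflives = (now_ms - se->last_update_ms) / 32.0;

	se->util_avg *= pow(0.5, halflives);	/* decay to 'now' */
	se->last_update_ms = now_ms;
}

int main(void)
{
	struct toy_se se = { .util_avg = 800.0, .last_update_ms = 0.0 };

	/* Task slept 64ms: an unsynced read would still see 800. */
	sync_load_avg(&se, 64.0);
	printf("util after sync: %.0f\n", se.util_avg);	/* 200 */
	return 0;
}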
Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Andres Oportus Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20170808095519.10077-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cc0bfb035df9..c04a42556a59 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6263,8 +6263,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { - pick_cpu: +pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); From 583ffd99d7657755736d831bbc182612d1d2697d Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 11:58:54 +0100 Subject: [PATCH 0318/1085] sched/fair: Force balancing on NOHZ balance if local group has capacity The "goto force_balance" here is intended to mitigate the fact that avg_load calculations can result in bad placement decisions when priority is asymmetrical. The original commit that adds it: fab476228ba3 ("sched: Force balancing on newidle balance if local group has capacity") explains: Under certain situations, such as a niced down task (i.e. nice = -15) in the presence of nr_cpus NICE0 tasks, the niced task lands on a sched group and kicks away other tasks because of its large weight. This leads to sub-optimal utilization of the machine. Even though the sched group has capacity, it does not pull tasks because sds.this_load >> sds.max_load, and f_b_g() returns NULL. A similar but inverted issue also affects ARM big.LITTLE (asymmetrical CPU capacity) systems - consider 8 always-running, same-priority tasks on a system with 4 "big" and 4 "little" CPUs. Suppose that 5 of them end up on the "big" CPUs (which will be represented by one sched_group in the DIE sched_domain) and 3 on the "little" (the other sched_group in DIE), leaving one CPU unused. Because the "big" group has a higher group_capacity its avg_load may not present an imbalance that would cause migrating a task to the idle "little". The force_balance case here solves the problem but currently only for CPU_NEWLY_IDLE balances, which in theory might never happen on the unused CPU. Including CPU_IDLE in the force_balance case means there's an upper bound on the time before we can attempt to solve the underutilization: after DIE's sd->balance_interval has passed the next nohz balance kick will help us out. 
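As a rough illustration of the widened condition (a local enum and flags stand in for the kernel's cpu_idle_type and group state; a sketch, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

enum idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

/* before: forced only when idle == CPU_NEWLY_IDLE */
static bool force_balance(enum idle_type idle, bool local_has_capacity,
			  bool busiest_no_capacity)
{
	return idle != CPU_NOT_IDLE && local_has_capacity &&
	       busiest_no_capacity;
}

int main(void)
{
	/* The nohz/periodic idle-balance case that used to be missed: */
	printf("%d\n", force_balance(CPU_IDLE, true, true)); /* prints 1 */
	return 0;
}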
Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170807163900.25180-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c04a42556a59..cf3e816db80f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8186,8 +8186,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; From 18bd1b4bd53aba81d76d55e91a68310a227dc187 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:12 +0100 Subject: [PATCH 0319/1085] sched/fair: Move select_task_rq_fair() slow-path into its own function In preparation for changes that would otherwise require adding a new level of indentation to the while(sd) loop, create a new function find_idlest_cpu() which contains this loop, and rename the existing find_idlest_cpu() to find_idlest_group_cpu(). Code inside the while(sd) loop is unchanged. @new_cpu is added as a variable in the new function, with the same initial value as the @new_cpu in select_task_rq_fair(). Suggested-by: Peter Zijlstra Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-2-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 83 ++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf3e816db80f..dd4f25305604 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5859,10 +5859,10 @@ skip_spare: } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5911,6 +5911,50 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; } +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return new_cpu; +} + #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -6277,39 +6321,8 @@ pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ + } else { + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); From e90381eaecf6d59c60fe396838e0e99789531419 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:13 +0100 Subject: [PATCH 0320/1085] sched/fair: Remove unnecessary comparison with -1 Since commit: 83a0a96a5f26 ("sched/fair: Leverage the idle state info when choosing the "idlest" cpu") find_idlest_group_cpu() (formerly find_idlest_cpu) no longer returns -1, so we can simplify the checking of the return value in find_idlest_cpu(). 
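A tiny sketch of the invariant behind this cleanup: as long as the group is non-empty, the search is seeded with a real candidate CPU, so a -1 "not found" result cannot happen (toy arrays, not the kernel's data structures):

#include <stdio.h>

static int find_group_cpu(const int *load, int n)
{
	int best = 0; /* seeded with CPU 0, never stays -1 */

	for (int i = 1; i < n; i++)
		if (load[i] < load[best])
			best = i;
	return best;
}

int main(void)
{
	int load[] = { 75, 20, 55 };

	printf("idlest: cpu %d\n", find_group_cpu(load, 3)); /* cpu 1 */
	return 0;
}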
Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-3-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd4f25305604..bdd28fbad7a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5933,7 +5933,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p } new_cpu = find_idlest_group_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { + if (new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; continue; From 0d10ab952e99f3e9f374898e93f45452b81e5711 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:14 +0100 Subject: [PATCH 0321/1085] sched/fair: Fix find_idlest_group() when local group is not allowed When the local group is not allowed we do not modify this_*_load from their initial value of 0. That means that the load checks at the end of find_idlest_group cause us to incorrectly return NULL. Fixing the initial values to ULONG_MAX means we will instead return the idlest remote group in that case. Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Josef Bacik Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-4-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bdd28fbad7a3..cca1835efa7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5737,8 +5737,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long this_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; From 6fee85ccbc76e8aeba43dc120c5fa3c5409a4e2c Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:15 +0100 Subject: [PATCH 0322/1085] sched/fair: Fix usage of find_idlest_group() when no groups are allowed When 'p' is not allowed on any of the CPUs in the sched_domain, we currently return NULL from find_idlest_group(), and pointlessly continue the search on lower sched_domain levels (where 'p' is also not allowed) before returning prev_cpu regardless (as we have not updated new_cpu). Add an explicit check for this case, and add a comment to find_idlest_group(). Now when find_idlest_group() returns NULL, it always means that the local group is allowed and idlest. 
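The early bail-out can be sketched with plain bitmasks standing in for cpumask_t (illustrative only; the kernel uses cpumask_intersects() on the domain span):

#include <stdio.h>

static int toy_find_idlest_cpu(unsigned long sd_span,
			       unsigned long cpus_allowed, int prev_cpu)
{
	if (!(sd_span & cpus_allowed))	/* no allowed CPU in this domain */
		return prev_cpu;

	/* ... otherwise descend the domain hierarchy as before ... */
	return 0;
}

int main(void)
{
	/* task pinned to CPUs 4-7, domain spans CPUs 0-3: no overlap */
	printf("%d\n", toy_find_idlest_cpu(0x0f, 0xf0, 2)); /* prints 2 */
	return 0;
}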
Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Josef Bacik Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-5-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cca1835efa7b..ed80d6bd76c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5730,6 +5730,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -5917,6 +5919,9 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = prev_cpu; + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + while (sd) { struct sched_group *group; struct sched_domain *tmp; From 93f50f90247e3e926bbe9830df089c64a5cec236 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:16 +0100 Subject: [PATCH 0323/1085] sched/fair: Fix usage of find_idlest_group() when the local group is idlest find_idlest_group() returns NULL when the local group is idlest. The caller then continues the find_idlest_group() search at a lower level of the current CPU's sched_domain hierarchy. find_idlest_group_cpu() is not consulted and, crucially, @new_cpu is not updated. This means the search is pointless and we return @prev_cpu from select_task_rq_fair(). This is fixed by initialising @new_cpu to @cpu instead of @prev_cpu. Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-6-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ed80d6bd76c8..56f343b8e749 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5917,7 +5917,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { - int new_cpu = prev_cpu; + int new_cpu = cpu; if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) return prev_cpu; From ff0d4a9dc16b1f4c954f6407c233ab848bdfe8b0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 4 Oct 2017 17:49:00 +0200 Subject: [PATCH 0324/1085] sched/rt: Add a helper to test for a RT task This helper returns true if a task has elevated priority which is true for RT tasks (SCHED_RR and SCHED_FIFO) and also for SCHED_DEADLINE. A task which runs at RT priority due to PI-boosting is not considered as one with elevated priority. 
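A userspace analogue of the helper, assuming only standard interfaces (sched_getscheduler(); SCHED_DEADLINE is defined by hand in case the libc headers don't expose it):

#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

/* Classify by policy, like task_is_realtime(): a PI-boosted prio
 * would only show up in the task's prio, not in its policy. */
static bool policy_is_realtime(int policy)
{
	return policy == SCHED_FIFO || policy == SCHED_RR ||
	       policy == SCHED_DEADLINE;
}

int main(void)
{
	printf("realtime: %s\n",
	       policy_is_realtime(sched_getscheduler(0)) ? "yes" : "no");
	return 0;
}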
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Jens Axboe Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171004154901.26904-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/sched/rt.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index f93329aba31a..133001627ba1 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -17,6 +17,17 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } +static inline bool task_is_realtime(struct task_struct *tsk) +{ + int policy = tsk->policy; + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; + if (policy == SCHED_DEADLINE) + return true; + return false; +} + #ifdef CONFIG_RT_MUTEXES /* * Must hold either p->pi_lock or task_rq(p)->lock. From 36436440cd19f59f5be12a1b181d299af2725140 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 4 Oct 2017 17:49:01 +0200 Subject: [PATCH 0325/1085] block/ioprio: Use a helper to check for RT prio A side effect relative to the old code is that SCHED_DEADLINE is now also recognized. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Jens Axboe Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171004154901.26904-2-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/ioprio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 8c1239020d79..2f19aab84a4a 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -2,6 +2,7 @@ #define IOPRIO_H #include +#include <linux/sched/rt.h> #include /* @@ -62,7 +63,7 @@ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; - else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) + else if (task_is_realtime(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; From 4bdced5c9a2922521e325896a7bbbf0132c94e56 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Oct 2017 14:05:04 -0400 Subject: [PATCH 0326/1085] sched/rt: Simplify the IPI based RT balancing logic When a CPU lowers its priority (schedules out a high priority task for a lower priority one), a check is made to see if any other CPU has overloaded RT tasks (more than one). It checks the rto_mask to determine this and if so it will request to pull one of those tasks to itself if the non-running RT task is of higher priority than the new priority of the next task to run on the current CPU. When we deal with a large number of CPUs, the original pull logic suffered from large lock contention on a single CPU run queue, which caused a huge latency across all CPUs. This was caused by only having one CPU having overloaded RT tasks and a bunch of other CPUs lowering their priority. To solve this issue, commit: b6366f048e0c ("sched/rt: Use IPI to trigger RT task push migration instead of pulling") changed the way to request a pull. Instead of grabbing the lock of the overloaded CPU's runqueue, it simply sent an IPI to that CPU to do the work. Although the IPI logic worked very well in removing the large latency build up, it still could suffer from a large number of IPIs being sent to a single CPU. On an 80 CPU box, I measured over 200us of processing IPIs.
Worse yet, when I tested this on a 120 CPU box, with a stress test that had lots of RT tasks scheduling on all CPUs, it actually triggered the hard lockup detector! One CPU had so many IPIs sent to it, and due to the restart mechanism that is triggered when the source run queue has a priority status change, the CPU spent minutes! processing the IPIs. Thinking about this further, I realized there's no reason for each run queue to send its own IPI. As all CPUs with overloaded tasks must be scanned regardless of whether one or many CPUs are lowering their priority, and because there's no current way to find the CPU with the highest priority task that can schedule to one of these CPUs, there really only needs to be one IPI being sent around at a time. This greatly simplifies the code! The new approach is to have each root domain have its own irq work, as the rto_mask is per root domain. The root domain has the following fields attached to it:

  rto_push_work  - the irq work to process each CPU set in rto_mask
  rto_lock       - the lock to protect some of the other rto fields
  rto_loop_start - an atomic that keeps contention down on rto_lock;
                   the first CPU scheduling in a lower priority task
                   is the one to kick off the process
  rto_loop_next  - an atomic that gets incremented for each CPU that
                   schedules in a lower priority task
  rto_loop       - a variable protected by rto_lock that is used to
                   compare against rto_loop_next
  rto_cpu        - the CPU to send the next IPI to, also protected by
                   the rto_lock

When a CPU schedules in a lower priority task and wants to make sure overloaded CPUs know about it, it increments rto_loop_next. Then it atomically sets rto_loop_start with a cmpxchg. If the old value is not "0", then it is done, as another CPU is kicking off the IPI loop. If the old value is "0", then it will take the rto_lock to synchronize with a possible IPI being sent around to the overloaded CPUs. If rto_cpu is greater than or equal to nr_cpu_ids, then there's either no IPI being sent around, or one is about to finish. Then rto_cpu is set to the first CPU in rto_mask and an IPI is sent to that CPU. If there are no CPUs set in rto_mask, then there's nothing to be done. When the CPU receives the IPI, it will first try to push any RT tasks that are queued on the CPU but can't run because a higher priority RT task is currently running on that CPU. Then it takes the rto_lock and looks for the next CPU in the rto_mask. If it finds one, it simply sends an IPI to that CPU and the process continues. If there are no more CPUs in the rto_mask, then rto_loop is compared with rto_loop_next. If they match, everything is done and the process is over. If they do not match, then a CPU scheduled in a lower priority task as the IPI was being passed around, and the process needs to start again. The first CPU in rto_mask is sent the IPI. This change removes this duplication of work in the IPI logic, and greatly lowers the latency caused by the IPIs. This removed the lockup happening on the 120 CPU machine. It also simplifies the code tremendously. What else could anyone ask for? Thanks to Peter Zijlstra for simplifying the rto_loop_start atomic logic and supplying me with the rto_start_trylock() and rto_start_unlock() helper functions.
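The loop_start/loop_next protocol described above can be sketched with C11 atomics (a simplification: the real code hops from CPU to CPU via IPIs and protects rto_cpu with rto_lock; here one function body stands in for a whole scan):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int loop_start, loop_next;
static int loop_done;

static bool rto_start_trylock(atomic_int *v)
{
	int expected = 0;

	return atomic_compare_exchange_strong(v, &expected, 1);
}

static void rto_start_unlock(atomic_int *v)
{
	atomic_store(v, 0);
}

static void tell_cpu_to_push(void)
{
	int next;

	atomic_fetch_add(&loop_next, 1);	/* request (another) scan */
	if (!rto_start_trylock(&loop_start))
		return;				/* someone else is driving */

	/* rescan until no CPU has requested a newer pass */
	while ((next = atomic_load(&loop_next)) != loop_done) {
		loop_done = next;
		printf("scan all rto CPUs once\n"); /* the IPI round trip */
	}
	rto_start_unlock(&loop_start);
}

int main(void)
{
	tell_cpu_to_push();
	tell_cpu_to_push();
	return 0;
}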
Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Clark Williams Cc: Daniel Bristot de Oliveira Cc: John Kacur Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Scott Wood Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170424114732.1aac6dc4@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 322 +++++++++++++++------------------------- kernel/sched/sched.h | 24 ++- kernel/sched/topology.c | 6 + 3 files changed, 141 insertions(+), 211 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0af5ca9e3e3f..fda27991699a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -73,10 +73,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) -static void push_irq_work_func(struct irq_work *work); -#endif - void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -96,13 +92,6 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); - -#ifdef HAVE_RT_PUSH_IPI - rt_rq->push_flags = 0; - rt_rq->push_cpu = nr_cpu_ids; - raw_spin_lock_init(&rt_rq->push_lock); - init_irq_work(&rt_rq->push_work, push_irq_work_func); -#endif #endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1875,68 +1864,6 @@ static void push_rt_tasks(struct rq *rq) } #ifdef HAVE_RT_PUSH_IPI -/* - * The search for the next cpu always starts at rq->cpu and ends - * when we reach rq->cpu again. It will never return rq->cpu. - * This returns the next cpu to check, or nr_cpu_ids if the loop - * is complete. - * - * rq->rt.push_cpu holds the last cpu returned by this function, - * or if this is the first instance, it must hold rq->cpu. - */ -static int rto_next_cpu(struct rq *rq) -{ - int prev_cpu = rq->rt.push_cpu; - int cpu; - - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); - - /* - * If the previous cpu is less than the rq's CPU, then it already - * passed the end of the mask, and has started from the beginning. - * We end if the next CPU is greater or equal to rq's CPU. - */ - if (prev_cpu < rq->cpu) { - if (cpu >= rq->cpu) - return nr_cpu_ids; - - } else if (cpu >= nr_cpu_ids) { - /* - * We passed the end of the mask, start at the beginning. - * If the result is greater or equal to the rq's CPU, then - * the loop is finished. - */ - cpu = cpumask_first(rq->rd->rto_mask); - if (cpu >= rq->cpu) - return nr_cpu_ids; - } - rq->rt.push_cpu = cpu; - - /* Return cpu to let the caller know if the loop is finished or not */ - return cpu; -} - -static int find_next_push_cpu(struct rq *rq) -{ - struct rq *next_rq; - int cpu; - - while (1) { - cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - break; - next_rq = cpu_rq(cpu); - - /* Make sure the next rq can push to this rq */ - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) - break; - } - - return cpu; -} - -#define RT_PUSH_IPI_EXECUTING 1 -#define RT_PUSH_IPI_RESTART 2 /* * When a high priority task schedules out from a CPU and a lower priority @@ -1946,170 +1873,157 @@ static int find_next_push_cpu(struct rq *rq) * tasks queued on it (overloaded) needs to be notified that a CPU has opened * up that may be able to run one of its non-running queued RT tasks. 
* - * On large CPU boxes, there's the case that several CPUs could schedule - * a lower priority task at the same time, in which case it will look for - * any overloaded CPUs that it could pull a task from. To do this, the runqueue - * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting - * for a single overloaded CPU's runqueue lock can produce a large latency. - * (This has actually been observed on large boxes running cyclictest). - * Instead of taking the runqueue lock of the overloaded CPU, each of the - * CPUs that scheduled a lower priority task simply sends an IPI to the - * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with - * lots of contention. The overloaded CPU will look to push its non-running - * RT task off, and if it does, it can then ignore the other IPIs coming - * in, and just pass those IPIs off to any other overloaded CPU. + * All CPUs with overloaded RT tasks need to be notified as there is currently + * no way to know which of these CPUs have the highest priority task waiting + * to run. Instead of trying to take a spinlock on each of these CPUs, + * which has shown to cause large latency when done on machines with many + * CPUs, sending an IPI to the CPUs to have them push off the overloaded + * RT tasks waiting to run. * - * When a CPU schedules a lower priority task, it only sends an IPI to - * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, - * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with - * RT overloaded tasks, would cause 100 IPIs to go out at once. + * Just sending an IPI to each of the CPUs is also an issue, as on large + * count CPU machines, this can cause an IPI storm on a CPU, especially + * if its the only CPU with multiple RT tasks queued, and a large number + * of CPUs scheduling a lower priority task at the same time. * - * The overloaded RT CPU, when receiving an IPI, will try to push off its - * overloaded RT tasks and then send an IPI to the next CPU that has - * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks - * have completed. Just because a CPU may have pushed off its own overloaded - * RT task does not mean it should stop sending the IPI around to other - * overloaded CPUs. There may be another RT task waiting to run on one of - * those CPUs that are of higher priority than the one that was just - * pushed. + * Each root domain has its own irq work function that can iterate over + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT + * tassk must be checked if there's one or many CPUs that are lowering + * their priority, there's a single irq work iterator that will try to + * push off RT tasks that are waiting to run. * - * An optimization that could possibly be made is to make a CPU array similar - * to the cpupri array mask of all running RT tasks, but for the overloaded - * case, then the IPI could be sent to only the CPU with the highest priority - * RT task waiting, and that CPU could send off further IPIs to the CPU with - * the next highest waiting task. Since the overloaded case is much less likely - * to happen, the complexity of this implementation may not be worth it. - * Instead, just send an IPI around to all overloaded CPUs. + * When a CPU schedules a lower priority task, it will kick off the + * irq work iterator that will jump to each CPU with overloaded RT tasks. 
+ * As it only takes the first CPU that schedules a lower priority task + * to start the process, the rto_start variable is incremented and if + * the atomic result is one, then that CPU will try to take the rto_lock. + * This prevents high contention on the lock as the process handles all + * CPUs scheduling lower priority tasks. * - * The rq->rt.push_flags holds the status of the IPI that is going around. - * A run queue can only send out a single IPI at a time. The possible flags - * for rq->rt.push_flags are: + * All CPUs that are scheduling a lower priority task will increment the + * rt_loop_next variable. This will make sure that the irq work iterator + * checks all RT overloaded CPUs whenever a CPU schedules a new lower + * priority task, even if the iterator is in the middle of a scan. Incrementing + * the rt_loop_next will cause the iterator to perform another scan. * - * (None or zero): No IPI is going around for the current rq - * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around - * RT_PUSH_IPI_RESTART: The priority of the running task for the rq - * has changed, and the IPI should restart - * circulating the overloaded CPUs again. - * - * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated - * before sending to the next CPU. - * - * Instead of having all CPUs that schedule a lower priority task send - * an IPI to the same "first" CPU in the RT overload mask, they send it - * to the next overloaded CPU after their own CPU. This helps distribute - * the work when there's more than one overloaded CPU and multiple CPUs - * scheduling in lower priority tasks. - * - * When a rq schedules a lower priority task than what was currently - * running, the next CPU with overloaded RT tasks is examined first. - * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower - * priority task, it will send an IPI first to CPU 5, then CPU 5 will - * send to CPU 1 if it is still overloaded. CPU 1 will clear the - * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. - * - * The first CPU to notice IPI_RESTART is set, will clear that flag and then - * send an IPI to the next overloaded CPU after the rq->cpu and not the next - * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 - * schedules a lower priority task, and the IPI_RESTART gets set while the - * handling is being done on CPU 5, it will clear the flag and send it back to - * CPU 4 instead of CPU 1. - * - * Note, the above logic can be disabled by turning off the sched_feature - * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be - * taken by the CPU requesting a pull and the waiting RT task will be pulled - * by that CPU. This may be fine for machines with few CPUs. */ -static void tell_cpu_to_push(struct rq *rq) +static int rto_next_cpu(struct rq *rq) { + struct root_domain *rd = rq->rd; + int next; int cpu; - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - raw_spin_lock(&rq->rt.push_lock); - /* Make sure it's still executing */ - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - /* - * Tell the IPI to restart the loop as things have - * changed since it started. - */ - rq->rt.push_flags |= RT_PUSH_IPI_RESTART; - raw_spin_unlock(&rq->rt.push_lock); - return; - } - raw_spin_unlock(&rq->rt.push_lock); + /* + * When starting the IPI RT pushing, the rto_cpu is set to -1, + * rt_next_cpu() will simply return the first CPU found in + * the rto_mask. 
+ *
+ * If rto_next_cpu() is called while rto_cpu is a valid CPU, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
+ */
+ for (;;) {
+
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
+
+ rd->rto_cpu = cpu;
+
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ rd->rto_cpu = -1;
+
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
+
+ if (rd->rto_loop == next)
+ break;
+
+ rd->rto_loop = next;
 }

- /* When here, there's no IPI going around */
+ return -1;
+}

- rq->rt.push_cpu = rq->cpu;
- cpu = find_next_push_cpu(rq);
- if (cpu >= nr_cpu_ids)
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
+}
+
+static inline void rto_start_unlock(atomic_t *v)
+{
+ atomic_set_release(v, 0);
+}
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu = -1;
+
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
+
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
 return;

- rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ raw_spin_lock(&rq->rd->rto_lock);

- irq_work_queue_on(&rq->rt.push_work, cpu);
+ /*
+ * The rto_cpu is updated under the lock; if it has a valid CPU
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an IPI needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq);
+
+ raw_spin_unlock(&rq->rd->rto_lock);
+
+ rto_start_unlock(&rq->rd->rto_loop_start);
+
+ if (cpu >= 0)
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }

 /* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
 {
- struct rt_rq *rt_rq = arg;
- struct rq *rq, *src_rq;
- int this_cpu;
+ struct rq *rq;
 int cpu;

- this_cpu = rt_rq->push_cpu;
+ rq = this_rq();

- /* Paranoid check */
- BUG_ON(this_cpu != smp_processor_id());
-
- rq = cpu_rq(this_cpu);
- src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+ /*
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
+ */
 if (has_pushable_tasks(rq)) {
 raw_spin_lock(&rq->lock);
- push_rt_task(rq);
+ push_rt_tasks(rq);
 raw_spin_unlock(&rq->lock);
 }

+ raw_spin_lock(&rq->rd->rto_lock);
+
 /* Pass the IPI to the next rt overloaded queue */
- raw_spin_lock(&rt_rq->push_lock);
- /*
- * If the source queue changed since the IPI went out,
- * we need to restart the search from that CPU again.
- */
- if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- rt_rq->push_cpu = src_rq->cpu;
- }
+ cpu = rto_next_cpu(rq);

- cpu = find_next_push_cpu(src_rq);
+ raw_spin_unlock(&rq->rd->rto_lock);

- if (cpu >= nr_cpu_ids)
- rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- raw_spin_unlock(&rt_rq->push_lock);
-
- if (cpu >= nr_cpu_ids)
+ if (cpu < 0)
 return;

- /*
- * It is possible that a restart caused this CPU to be
- * chosen again. Don't bother with an IPI, just see if we
- * have more to push.
- */ - if (unlikely(cpu == rq->cpu)) - goto again; - /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rt_rq->push_work, cpu); -} - -static void push_irq_work_func(struct irq_work *work) -{ - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); - - try_to_push_tasks(rt_rq); + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a81c9782e98c..8aa24b41f652 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -505,7 +505,7 @@ static inline int rt_bandwidth_enabled(void) } /* RT IPI pull logic requires IRQ_WORK */ -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) # define HAVE_RT_PUSH_IPI #endif @@ -527,12 +527,6 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; -#ifdef HAVE_RT_PUSH_IPI - int push_flags; - int push_cpu; - struct irq_work push_work; - raw_spinlock_t push_lock; -#endif #endif /* CONFIG_SMP */ int rt_queued; @@ -641,6 +635,19 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -658,6 +665,9 @@ extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif #endif /* CONFIG_SMP */ /* diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f51d123f9fe1..e50450c2fed8 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -268,6 +268,12 @@ static int init_rootdomain(struct root_domain *rd) if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; +#ifdef HAVE_RT_PUSH_IPI + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); +#endif + init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; From 6be53520ad8f87dc748f381e4e8e6a846a4b466b Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Mon, 9 Oct 2017 17:03:33 +0800 Subject: [PATCH 0327/1085] x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot parameter Commit: 8309f86cd41e ("x86/tsc: Provide 'tsc=unstable' boot parameter") added a new 'tsc=unstable' parameter, but didn't document it. Document it. Signed-off-by: Dou Liyang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1507539813-11420-1-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05496622b4ef..6b99c8b77c59 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4194,6 +4194,9 @@ Used to run time disable IRQ_TIME_ACCOUNTING on any platforms where RDTSC is slow and this accounting can add overhead. 
+ [x86] unstable: mark the TSC clocksource as unstable, this + marks the TSC unconditionally unstable at bootup and + avoids any further wobbles once the TSC watchdog notices. turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface From 9043442b43b1fddf202591b84702863286700c1a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 6 Sep 2017 19:36:24 +0200 Subject: [PATCH 0328/1085] locking/paravirt: Use new static key for controlling call of virt_spin_lock() There are cases where a guest tries to switch spinlocks to bare metal behavior (e.g. by setting "xen_nopvspin" boot parameter). Today this has the downside of falling back to unfair test and set scheme for qspinlocks due to virt_spin_lock() detecting the virtualized environment. Add a static key controlling whether virt_spin_lock() should be called or not. When running on bare metal set the new key to false. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: hpa@zytor.com Cc: jeremy@goop.org Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20170906173625.18158-2-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/qspinlock.h | 11 ++++++++++- arch/x86/kernel/paravirt.c | 14 ++++++++++++-- arch/x86/kernel/smpboot.c | 2 ++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 48a706f641f2..308dfd0714c7 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_QSPINLOCK_H #define _ASM_X86_QSPINLOCK_H +#include #include #include #include @@ -46,10 +47,14 @@ static inline void queued_spin_unlock(struct qspinlock *lock) #endif #ifdef CONFIG_PARAVIRT +DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); + +void native_pv_lock_init(void) __init; + #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + if (!static_branch_likely(&virt_spin_lock_key)) return false; /* @@ -65,6 +70,10 @@ static inline bool virt_spin_lock(struct qspinlock *lock) return true; } +#else +static inline void native_pv_lock_init(void) +{ +} #endif /* CONFIG_PARAVIRT */ #include diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 19a3e8f961c7..041096bdef86 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -115,8 +115,18 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, return 5; } -/* Neat trick to map patch type back to the call within the - * corresponding structure. */ +DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); + +void __init native_pv_lock_init(void) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_disable(&virt_spin_lock_key); +} + +/* + * Neat trick to map patch type back to the call within the + * corresponding structure. 
+ */ static void *get_call_destination(u8 type) { struct paravirt_patch_template tmpl = { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ad59edd84de7..361f91674ce5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -77,6 +77,7 @@ #include #include #include +#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -1384,6 +1385,7 @@ void __init native_smp_prepare_boot_cpu(void) /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); cpu_set_state_online(me); + native_pv_lock_init(); } void __init native_smp_cpus_done(unsigned int max_cpus) From e6fd28eb3522b31f79a938e24674f77268a120fd Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 6 Sep 2017 19:36:25 +0200 Subject: [PATCH 0329/1085] locking/spinlocks, paravirt, xen: Correct the xen_nopvspin case With the boot parameter "xen_nopvspin" specified a Xen guest should not make use of paravirt spinlocks, but behave as if running on bare metal. This is not true, however, as the qspinlock code will fall back to a test-and-set scheme when it is detecting a hypervisor. In order to avoid this disable the virt_spin_lock_key. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: hpa@zytor.com Cc: jeremy@goop.org Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20170906173625.18158-3-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 25a7c4302ce7..e8ab80ad7a6f 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -129,6 +130,7 @@ void __init xen_init_spinlocks(void) if (!xen_pvspin) { printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); + static_branch_disable(&virt_spin_lock_key); return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); From 8c74392a8db9f19f7b2d07f3bb7c5accb4cb4702 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 29 Sep 2017 19:05:56 +0300 Subject: [PATCH 0330/1085] locking/arch, alpha: Add __down_read_killable() Similar to __down_write_killable(), and read killable primitive. 
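The per-arch patches that follow (alpha here, then ia64, s390 and x86) all share one shape: the existing fast path is split out into a ___down_read() helper, and on contention the caller routes to either the uninterruptible or the killable slow path. A minimal C sketch of that shared pattern, using the generic atomic fast path in place of the arch-specific assembly:

	static inline int ___down_read(struct rw_semaphore *sem)
	{
		/* Fast path: returns non-zero when the lock is contended. */
		return atomic_long_inc_return_acquire(&sem->count) <= 0;
	}

	static inline void __down_read(struct rw_semaphore *sem)
	{
		if (___down_read(sem))
			rwsem_down_read_failed(sem);	/* sleeps uninterruptibly */
	}

	static inline int __down_read_killable(struct rw_semaphore *sem)
	{
		if (___down_read(sem)) {
			/* Killable slow path: ERR_PTR(-EINTR) on a fatal signal. */
			if (IS_ERR(rwsem_down_read_failed_killable(sem)))
				return -EINTR;
		}
		return 0;
	}
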
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670115677.23930.5711263025537758463.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/rwsem.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 77873d0ad293..9624cb4cbf2f 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -21,7 +21,7 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -static inline void __down_read(struct rw_semaphore *sem) +static inline int ___down_read(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -41,10 +41,24 @@ static inline void __down_read(struct rw_semaphore *sem) :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) :"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory"); #endif - if (unlikely(oldcount < 0)) + return (oldcount < 0); +} + +static inline void __down_read(struct rw_semaphore *sem) +{ + if (unlikely(___down_read(sem))) rwsem_down_read_failed(sem); } +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(___down_read(sem))) + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * trylock for reading -- returns 1 if successful, 0 if contention */ @@ -94,9 +108,10 @@ static inline void __down_write(struct rw_semaphore *sem) static inline int __down_write_killable(struct rw_semaphore *sem) { - if (unlikely(___down_write(sem))) + if (unlikely(___down_write(sem))) { if (IS_ERR(rwsem_down_write_failed_killable(sem))) return -EINTR; + } return 0; } From c0905115a55c8a3011d7b7e6ee5d63653c1e99f1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 29 Sep 2017 19:06:07 +0300 Subject: [PATCH 0331/1085] locking/arch, ia64: Add __down_read_killable() Similar to __down_write_killable(), and read killable primitive. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670116749.23930.14976888440968191759.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/rwsem.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 8fa98dd303b4..49f7db0a1bcd 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -37,15 +37,31 @@ /* * lock for reading */ -static inline void -__down_read (struct rw_semaphore *sem) +static inline int +___down_read (struct rw_semaphore *sem) { long result = ia64_fetchadd8_acq((unsigned long *)&sem->count.counter, 1); - if (result < 0) + return (result < 0); +} + +static inline void +__down_read (struct rw_semaphore *sem) +{ + if (___down_read(sem)) rwsem_down_read_failed(sem); } +static inline int +__down_read_killable (struct rw_semaphore *sem) +{ + if (___down_read(sem)) + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * lock for writing */ @@ -72,9 +88,10 @@ __down_write (struct rw_semaphore *sem) static inline int __down_write_killable (struct rw_semaphore *sem) { - if (___down_write(sem)) + if (___down_write(sem)) { if (IS_ERR(rwsem_down_write_failed_killable(sem))) return -EINTR; + } return 0; } From a61ba2c8a48f150f5e6b6d14328fc7f1aa32969d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 29 Sep 2017 19:06:18 +0300 Subject: [PATCH 0332/1085] locking/arch, s390: Add __down_read_killable() Similar to __down_write_killable(), and read killable primitive. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670117817.23930.13068785028558453848.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/s390/include/asm/rwsem.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 597e7e96b59e..fda9481dec5c 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -49,7 +49,7 @@
 /*
 * lock for reading
 */
-static inline void __down_read(struct rw_semaphore *sem)
+static inline int ___down_read(struct rw_semaphore *sem)
 {
 signed long old, new;
@@ -62,10 +62,25 @@ static inline void __down_read(struct rw_semaphore *sem)
 : "=&d" (old), "=&d" (new), "=Q" (sem->count)
 : "Q" (sem->count), "i" (RWSEM_ACTIVE_READ_BIAS)
 : "cc", "memory");
- if (old < 0)
+ return (old < 0);
+}
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ if (___down_read(sem))
 rwsem_down_read_failed(sem);
 }

+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ if (___down_read(sem)) {
+ if (IS_ERR(rwsem_down_read_failed_killable(sem)))
+ return -EINTR;
+ }
+
+ return 0;
+}
+
 /*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */

From 19c60923010b3f56f10ca1fb6d537c9cc852b3f1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 29 Sep 2017 19:06:28 +0300 Subject: [PATCH 0333/1085] locking/arch, x86: Add __down_read_killable()

Similar to __down_write_killable(), add a read killable primitive: extract the current __down_read() code into a macro and teach it to take different functions as its slow_path argument: the ax register is stored to ret, and the sp register is added as an output and preserved. Add a call_rwsem_down_read_failed_killable() assembly entry similar to call_rwsem_down_read_failed(): push the dx register onto the stack in addition to the common registers, as it is not declared as modifiable in ____down_read().
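The macro trick relies on the rwsem slow-path convention: both rwsem_down_read_failed() and its killable variant return the semaphore pointer on success, and the killable variant returns ERR_PTR(-EINTR) when a fatal signal arrives. Since the C slow path leaves that pointer in the ax register, the macro can hand it back as ret and the killable wrapper only needs an IS_ERR() check. A sketch of the caller side of that convention:

	struct rw_semaphore *ret;

	ret = rwsem_down_read_failed_killable(sem);	/* returned in ax */
	if (IS_ERR(ret))
		return -EINTR;	/* ERR_PTR(-EINTR): killed while sleeping */
	/* ret == sem: the read lock is now held */
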
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670118802.23930.1316107715255410256.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/include/asm/rwsem.h | 35 +++++++++++++++++++++++++---------- arch/x86/lib/rwsem.S | 12 ++++++++++++ 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index a8f486e7a124..1e51195c0b63 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -60,18 +60,33 @@ /* * lock for reading */ +#define ____down_read(sem, slow_path) \ +({ \ + struct rw_semaphore* ret; \ + asm volatile("# beginning down_read\n\t" \ + LOCK_PREFIX _ASM_INC "(%[sem])\n\t" \ + /* adds 0x00000001 */ \ + " jns 1f\n" \ + " call " slow_path "\n" \ + "1:\n\t" \ + "# ending down_read\n\t" \ + : "+m" (sem->count), "=a" (ret), \ + ASM_CALL_CONSTRAINT \ + : [sem] "a" (sem) \ + : "memory", "cc"); \ + ret; \ +}) + static inline void __down_read(struct rw_semaphore *sem) { - asm volatile("# beginning down_read\n\t" - LOCK_PREFIX _ASM_INC "(%[sem])\n\t" - /* adds 0x00000001 */ - " jns 1f\n" - " call call_rwsem_down_read_failed\n" - "1:\n\t" - "# ending down_read\n\t" - : "+m" (sem->count) - : [sem] "a" (sem) - : "memory", "cc"); + ____down_read(sem, "call_rwsem_down_read_failed"); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (IS_ERR(____down_read(sem, "call_rwsem_down_read_failed_killable"))) + return -EINTR; + return 0; } /* diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index bf2c6074efd2..dc2ab6ea6768 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -98,6 +98,18 @@ ENTRY(call_rwsem_down_read_failed) ret ENDPROC(call_rwsem_down_read_failed) +ENTRY(call_rwsem_down_read_failed_killable) + FRAME_BEGIN + save_common_regs + __ASM_SIZE(push,) %__ASM_REG(dx) + movq %rax,%rdi + call rwsem_down_read_failed_killable + __ASM_SIZE(pop,) %__ASM_REG(dx) + restore_common_regs + FRAME_END + ret +ENDPROC(call_rwsem_down_read_failed_killable) + ENTRY(call_rwsem_down_write_failed) FRAME_BEGIN save_common_regs From 76f8507f7a6442215df19de74f07eabca2462f1e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 29 Sep 2017 19:06:38 +0300 Subject: [PATCH 0334/1085] locking/rwsem: Add down_read_killable() Similar to down_read() and down_write_killable(), add killable version of down_read(), based on __down_read_killable() function, added in previous patches. 
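For callers, the new primitive mirrors down_write_killable(): 0 means the lock is held, -EINTR means a fatal signal arrived while sleeping. A usage sketch; the surrounding function and its data are hypothetical, not from this patch:

	static int read_frob_state(struct frob_dev *fd)	/* hypothetical caller */
	{
		int err;

		err = down_read_killable(&fd->state_rwsem);
		if (err)
			return err;	/* -EINTR: the task is being killed */

		/* read-side critical section */
		err = frob_state_valid(fd) ? 0 : -ENODEV;

		up_read(&fd->state_rwsem);
		return err;
	}
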
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670119884.23930.2585570605960763239.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- include/asm-generic/rwsem.h | 10 ++++++++++ include/linux/rwsem.h | 1 + kernel/locking/rwsem.c | 16 ++++++++++++++++ 3 files changed, 27 insertions(+)

diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index 6c6a2141f271..b2d68d2d198e 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -37,6 +37,16 @@ static inline void __down_read(struct rw_semaphore *sem)
 rwsem_down_read_failed(sem);
 }

+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
+ if (IS_ERR(rwsem_down_read_failed_killable(sem)))
+ return -EINTR;
+ }
+
+ return 0;
+}
+
 static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
 long tmp;
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 0ad7318ff299..6ac8ee5f15dd 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -111,6 +111,7 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
 * lock for reading
 */
 extern void down_read(struct rw_semaphore *sem);
+extern int __must_check down_read_killable(struct rw_semaphore *sem);

 /*
 * trylock for reading -- returns 1 if successful, 0 if contention
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 4d48b1c4870d..e53f7746d9fd 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,6 +28,22 @@ void __sched down_read(struct rw_semaphore *sem)

 EXPORT_SYMBOL(down_read);

+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_reader_owned(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_read_killable);
+
 /*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */

From 0dc208b5d5feedc795cbf124539decd182c8e99e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 29 Sep 2017 19:06:48 +0300 Subject: [PATCH 0335/1085] locking/rwsem, fs: Use killable down_read() in iterate_dir()

There was mutex_lock_interruptible() initially, and it was changed to an rwsem, but there were no killable rwsem primitives at that time. From commit 9902af79c01a: "The main issue is the lack of down_write_killable(), so the places like readdir.c switched to plain inode_lock(); once killable variants of rwsem primitives appear, that'll be dealt with" Use down_read_killable() in the shared case, just as down_write_killable() is used in the !shared case, since a concurrent inode_lock() may take a long time and the user may want to interrupt it.
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670120820.23930.5455667921545937220.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- fs/readdir.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/readdir.c b/fs/readdir.c index 89659549c09d..7c584bbb4ce3 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -36,13 +36,12 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; - if (shared) { - inode_lock_shared(inode); - } else { + if (shared) + res = down_read_killable(&inode->i_rwsem); + else res = down_write_killable(&inode->i_rwsem); - if (res) - goto out; - } + if (res) + goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { From 26c4eb192c6224e5297496cead36404b62fb071b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 19:25:26 +0100 Subject: [PATCH 0336/1085] locking/rwsem, security/apparmor: Replace homebrew use of write_can_lock() with lockdep The lockdep subsystem provides a robust way to assert that a lock is held, so use that instead of write_can_lock, which can give incorrect results for qrwlocks. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Johansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507055129-12300-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- security/apparmor/include/lib.h | 11 ----------- security/apparmor/label.c | 8 ++++---- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h index 436b3a722357..f546707a2bbb 100644 --- a/security/apparmor/include/lib.h +++ b/security/apparmor/include/lib.h @@ -19,17 +19,6 @@ #include "match.h" -/* Provide our own test for whether a write lock is held for asserts - * this is because on none SMP systems write_can_lock will always - * resolve to true, which is what you want for code making decisions - * based on it, but wrong for asserts checking that the lock is held - */ -#ifdef CONFIG_SMP -#define write_is_locked(X) !write_can_lock(X) -#else -#define write_is_locked(X) (1) -#endif /* CONFIG_SMP */ - /* * DEBUG remains global (no per profile flag) since it is mostly used in sysctl * which is not related to profile accesses. 
diff --git a/security/apparmor/label.c b/security/apparmor/label.c index c5b99b954580..ad28e03a6f30 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -80,7 +80,7 @@ void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new) AA_BUG(!orig); AA_BUG(!new); - AA_BUG(!write_is_locked(&labels_set(orig)->lock)); + lockdep_assert_held_exclusive(&labels_set(orig)->lock); tmp = rcu_dereference_protected(orig->proxy->label, &labels_ns(orig)->lock); @@ -571,7 +571,7 @@ static bool __label_remove(struct aa_label *label, struct aa_label *new) AA_BUG(!ls); AA_BUG(!label); - AA_BUG(!write_is_locked(&ls->lock)); + lockdep_assert_held_exclusive(&ls->lock); if (new) __aa_proxy_redirect(label, new); @@ -608,7 +608,7 @@ static bool __label_replace(struct aa_label *old, struct aa_label *new) AA_BUG(!ls); AA_BUG(!old); AA_BUG(!new); - AA_BUG(!write_is_locked(&ls->lock)); + lockdep_assert_held_exclusive(&ls->lock); AA_BUG(new->flags & FLAG_IN_TREE); if (!label_is_stale(old)) @@ -645,7 +645,7 @@ static struct aa_label *__label_insert(struct aa_labelset *ls, AA_BUG(!ls); AA_BUG(!label); AA_BUG(labels_set(label) != ls); - AA_BUG(!write_is_locked(&ls->lock)); + lockdep_assert_held_exclusive(&ls->lock); AA_BUG(label->flags & FLAG_IN_TREE); /* Figure out where to put new node */ From a8a217c22116eff6c120d753c9934089fb229af0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 19:25:27 +0100 Subject: [PATCH 0337/1085] locking/core: Remove {read,spin,write}_can_lock() Outside of the locking code itself, {read,spin,write}_can_lock() have no users in tree. Apparmor (the last remaining user of write_can_lock()) got moved over to lockdep by the previous patch. This patch removes the use of {read,spin,write}_can_lock() from the BUILD_LOCK_OPS macro, deferring to the trylock operation for testing the lock status, and subsequently removes the unused macros altogether. They aren't guaranteed to work in a concurrent environment and can give incorrect results in the case of qrwlock. 
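Two separate problems motivate the removal. As a locking decision, a can_lock() answer is stale the moment it is computed; and as an assertion helper (the apparmor case fixed above), !write_can_lock() cannot distinguish "a writer holds the lock" from "readers hold the lock" on a qrwlock, because any reader also makes write_can_lock() return false. A sketch of both failure modes; lock here is an illustrative rwlock_t:

	/* Racy as a decision: the state can change between check and act. */
	if (write_can_lock(&lock))
		write_lock(&lock);	/* may still block: another CPU won the race */

	/*
	 * Wrong as an assert: this is meant to verify "a writer holds the
	 * lock", but on a qrwlock a held read lock also makes
	 * write_can_lock() return false, so the check stays quiet here
	 * even though no writer exists.
	 */
	read_lock(&lock);
	BUG_ON(write_can_lock(&lock));	/* does not fire, proves nothing */
	read_unlock(&lock);

	/* The lockdep replacement checks actual ownership instead: */
	lockdep_assert_held_exclusive(&lock);
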
Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507055129-12300-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/spinlock.h | 10 -------- arch/arc/include/asm/spinlock.h | 3 --- arch/arm/include/asm/spinlock.h | 6 ----- arch/blackfin/include/asm/spinlock.h | 10 -------- arch/hexagon/include/asm/spinlock.h | 10 -------- arch/ia64/include/asm/spinlock.h | 3 --- arch/m32r/include/asm/spinlock.h | 12 ---------- arch/metag/include/asm/spinlock_lnkget.h | 30 ------------------------ arch/metag/include/asm/spinlock_lock1.h | 20 ---------------- arch/mn10300/include/asm/spinlock.h | 12 ---------- arch/parisc/include/asm/spinlock.h | 18 -------------- arch/powerpc/include/asm/spinlock.h | 3 --- arch/s390/include/asm/spinlock.h | 12 ---------- arch/sh/include/asm/spinlock-cas.h | 12 ---------- arch/sh/include/asm/spinlock-llsc.h | 12 ---------- arch/sparc/include/asm/spinlock_32.h | 3 --- arch/tile/include/asm/spinlock_32.h | 16 ------------- arch/tile/include/asm/spinlock_64.h | 18 -------------- arch/xtensa/include/asm/spinlock.h | 2 -- include/asm-generic/qrwlock.h | 20 ---------------- include/linux/rwlock.h | 3 --- include/linux/spinlock.h | 11 --------- include/linux/spinlock_up.h | 3 --- kernel/locking/spinlock.c | 6 ++--- 24 files changed, 2 insertions(+), 253 deletions(-) diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index 718ac0b64adf..7bff6316b8bb 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -54,16 +54,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) /***********************************************************/ -static inline int arch_read_can_lock(arch_rwlock_t *lock) -{ - return (lock->lock & 1) == 0; -} - -static inline int arch_write_can_lock(arch_rwlock_t *lock) -{ - return lock->lock == 0; -} - static inline void arch_read_lock(arch_rwlock_t *lock) { long regx; diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index 47efc8451b70..ce9bfcf1d870 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -410,9 +410,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #endif -#define arch_read_can_lock(x) ((x)->counter > 0) -#define arch_write_can_lock(x) ((x)->counter == __ARCH_RW_LOCK_UNLOCKED__) - #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index c030143c18c6..f5223260c73e 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -193,9 +193,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) dsb_sev(); } -/* write_can_lock - would write_trylock() succeed? */ -#define arch_write_can_lock(x) (ACCESS_ONCE((x)->lock) == 0) - /* * Read locks are a bit more hairy: * - Exclusively load the lock value. @@ -273,9 +270,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) } } -/* read_can_lock - would read_trylock() succeed? 
*/ -#define arch_read_can_lock(x) (ACCESS_ONCE((x)->lock) < 0x80000000) - #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index f6431439d15d..607ef98c0f6c 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -48,16 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __raw_spin_unlock_asm(&lock->lock); } -static inline int arch_read_can_lock(arch_rwlock_t *rw) -{ - return __raw_uncached_fetch_asm(&rw->lock) > 0; -} - -static inline int arch_write_can_lock(arch_rwlock_t *rw) -{ - return __raw_uncached_fetch_asm(&rw->lock) == RW_LOCK_BIAS; -} - static inline void arch_read_lock(arch_rwlock_t *rw) { __raw_read_lock_asm(&rw->lock); diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index 53a8d5885887..9f9414b9c303 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -86,16 +86,6 @@ static inline int arch_read_trylock(arch_rwlock_t *lock) return temp; } -static inline int arch_read_can_lock(arch_rwlock_t *rwlock) -{ - return rwlock->lock == 0; -} - -static inline int arch_write_can_lock(arch_rwlock_t *rwlock) -{ - return rwlock->lock == 0; -} - /* Stuffs a -1 in the lock value? */ static inline void arch_write_lock(arch_rwlock_t *lock) { diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index df2c121164b8..c728dda59bd2 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -127,9 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -#define arch_read_can_lock(rw) (*(volatile int *)(rw) >= 0) -#define arch_write_can_lock(rw) (*(volatile int *)(rw) == 0) - #ifdef ASM_SUPPORTED static __always_inline void diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index a56825592b90..002601371b2f 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -137,18 +137,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) * semaphore.h for details. -ben */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((int)(x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - static inline void arch_read_lock(arch_rwlock_t *rw) { unsigned long tmp0, tmp1; diff --git a/arch/metag/include/asm/spinlock_lnkget.h b/arch/metag/include/asm/spinlock_lnkget.h index ad8436feed8d..6a932a952d53 100644 --- a/arch/metag/include/asm/spinlock_lnkget.h +++ b/arch/metag/include/asm/spinlock_lnkget.h @@ -136,21 +136,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) : "memory"); } -/* write_can_lock - would write_trylock() succeed? */ -static inline int arch_write_can_lock(arch_rwlock_t *rw) -{ - int ret; - - asm volatile ("LNKGETD %0, [%1]\n" - "CMP %0, #0\n" - "MOV %0, #1\n" - "XORNZ %0, %0, %0\n" - : "=&d" (ret) - : "da" (&rw->lock) - : "cc"); - return ret; -} - /* * Read locks are a bit more hairy: * - Exclusively load the lock value. @@ -224,21 +209,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) return tmp; } -/* read_can_lock - would read_trylock() succeed? 
*/ -static inline int arch_read_can_lock(arch_rwlock_t *rw) -{ - int tmp; - - asm volatile ("LNKGETD %0, [%1]\n" - "CMP %0, %2\n" - "MOV %0, #1\n" - "XORZ %0, %0, %0\n" - : "=&d" (tmp) - : "da" (&rw->lock), "bd" (0x80000000) - : "cc"); - return tmp; -} - #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/metag/include/asm/spinlock_lock1.h b/arch/metag/include/asm/spinlock_lock1.h index c630444cffe9..8ae12bfc8ad8 100644 --- a/arch/metag/include/asm/spinlock_lock1.h +++ b/arch/metag/include/asm/spinlock_lock1.h @@ -104,16 +104,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) rw->lock = 0; } -/* write_can_lock - would write_trylock() succeed? */ -static inline int arch_write_can_lock(arch_rwlock_t *rw) -{ - unsigned int ret; - - barrier(); - ret = rw->lock; - return (ret == 0); -} - /* * Read locks are a bit more hairy: * - Exclusively load the lock value. @@ -171,14 +161,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) return (ret < 0x80000000); } -/* read_can_lock - would read_trylock() succeed? */ -static inline int arch_read_can_lock(arch_rwlock_t *rw) -{ - unsigned int ret; - - barrier(); - ret = rw->lock; - return (ret < 0x80000000); -} - #endif /* __ASM_SPINLOCK_LOCK1_H */ diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index fe413b41df6c..54f75dac8094 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -98,18 +98,6 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lock, * read-locks. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((int)(x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - /* * On mn10300, we implement read-write locks as a 32-bit counter * with the high bit (sign) being the "contended" bit. diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 55bfe4affca3..136e1c9bb8a9 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -168,24 +168,6 @@ static __inline__ int arch_write_trylock(arch_rwlock_t *rw) return result; } -/* - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -static __inline__ int arch_read_can_lock(arch_rwlock_t *rw) -{ - return rw->counter >= 0; -} - -/* - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -static __inline__ int arch_write_can_lock(arch_rwlock_t *rw) -{ - return !rw->counter; -} - #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index edbe571bcc54..d83f4f755ad8 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -181,9 +181,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) * read-locks. 
*/ -#define arch_read_can_lock(rw) ((rw)->lock >= 0) -#define arch_write_can_lock(rw) (!(rw)->lock) - #ifdef CONFIG_PPC64 #define __DO_SIGN_EXTEND "extsw %0,%0\n" #define WRLOCK_TOKEN LOCK_TOKEN /* it's negative */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 8182b521c42f..dc9c58ed9e4c 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -110,18 +110,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) * read-locks. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((int)(x)->lock >= 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == 0) - extern int _raw_read_trylock_retry(arch_rwlock_t *lp); extern int _raw_write_trylock_retry(arch_rwlock_t *lp); diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index 5ed7dbbd94ff..315467834521 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -53,18 +53,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) * read-locks. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - static inline void arch_read_lock(arch_rwlock_t *rw) { unsigned old; diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index f77263aae760..06be4a55f3c4 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -89,18 +89,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) * read-locks. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - static inline void arch_read_lock(arch_rwlock_t *rw) { unsigned long tmp; diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 67345b2dc408..76986b8f7dad 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -190,9 +190,6 @@ static inline int __arch_read_trylock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -#define arch_read_can_lock(rw) (!((rw)->lock & 0xff)) -#define arch_write_can_lock(rw) (!(rw)->lock) - #endif /* !(__ASSEMBLY__) */ #endif /* __SPARC_SPINLOCK_H */ diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index cba8ba9b8da6..91d05f21cba9 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -79,22 +79,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define _RD_COUNT_SHIFT 24 #define _RD_COUNT_WIDTH 8 -/** - * arch_read_can_lock() - would read_trylock() succeed? - */ -static inline int arch_read_can_lock(arch_rwlock_t *rwlock) -{ - return (rwlock->lock << _RD_COUNT_WIDTH) == 0; -} - -/** - * arch_write_can_lock() - would write_trylock() succeed? 
- */ -static inline int arch_write_can_lock(arch_rwlock_t *rwlock) -{ - return rwlock->lock == 0; -} - /** * arch_read_lock() - acquire a read lock. */ diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h index 9a2c2d605752..c802f48badf4 100644 --- a/arch/tile/include/asm/spinlock_64.h +++ b/arch/tile/include/asm/spinlock_64.h @@ -93,24 +93,6 @@ static inline int arch_write_val_locked(int val) return val < 0; /* Optimize "val & __WRITE_LOCK_BIT". */ } -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -static inline int arch_read_can_lock(arch_rwlock_t *rw) -{ - return !arch_write_val_locked(rw->lock); -} - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -static inline int arch_write_can_lock(arch_rwlock_t *rw) -{ - return rw->lock == 0; -} - extern void __read_lock_failed(arch_rwlock_t *rw); static inline void arch_read_lock(arch_rwlock_t *rw) diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index 3bb49681ee24..d005af51e2e1 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -97,8 +97,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) * 0x80000000 one writer owns the rwlock, no other writers, no readers */ -#define arch_write_can_lock(x) ((x)->lock == 0) - static inline void arch_write_lock(arch_rwlock_t *rw) { unsigned long tmp; diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 7d026bf27713..50925327b0a8 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -52,24 +52,6 @@ extern void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts); extern void queued_write_lock_slowpath(struct qrwlock *lock); -/** - * queued_read_can_lock- would read_trylock() succeed? - * @lock: Pointer to queue rwlock structure - */ -static inline int queued_read_can_lock(struct qrwlock *lock) -{ - return !(atomic_read(&lock->cnts) & _QW_WMASK); -} - -/** - * queued_write_can_lock- would write_trylock() succeed? - * @lock: Pointer to queue rwlock structure - */ -static inline int queued_write_can_lock(struct qrwlock *lock) -{ - return !atomic_read(&lock->cnts); -} - /** * queued_read_trylock - try to acquire read lock of a queue rwlock * @lock : Pointer to queue rwlock structure @@ -169,8 +151,6 @@ static inline void queued_write_unlock(struct qrwlock *lock) * Remapping rwlock architecture specific functions to the corresponding * queue rwlock functions. */ -#define arch_read_can_lock(l) queued_read_can_lock(l) -#define arch_write_can_lock(l) queued_write_can_lock(l) #define arch_read_lock(l) queued_read_lock(l) #define arch_write_lock(l) queued_write_lock(l) #define arch_read_trylock(l) queued_read_trylock(l) diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index bc2994ed66e1..766c5ca5cbd1 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -50,9 +50,6 @@ do { \ # define do_raw_write_unlock(rwlock) do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) #endif -#define read_can_lock(rwlock) arch_read_can_lock(&(rwlock)->raw_lock) -#define write_can_lock(rwlock) arch_write_can_lock(&(rwlock)->raw_lock) - /* * Define the various rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. 
The various diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 69e079c5ff98..1e3e48041800 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -278,12 +278,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) 1 : ({ local_irq_restore(flags); 0; }); \ }) -/** - * raw_spin_can_lock - would raw_spin_trylock() succeed? - * @lock: the spinlock in question. - */ -#define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock)) - /* Include rwlock functions */ #include @@ -396,11 +390,6 @@ static __always_inline int spin_is_contended(spinlock_t *lock) return raw_spin_is_contended(&lock->rlock); } -static __always_inline int spin_can_lock(spinlock_t *lock) -{ - return raw_spin_can_lock(&lock->rlock); -} - #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) /* diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 612fb530af41..901cf8f44388 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -77,7 +77,4 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define arch_spin_is_contended(lock) (((void)(lock), 0)) -#define arch_read_can_lock(lock) (((void)(lock), 1)) -#define arch_write_can_lock(lock) (((void)(lock), 1)) - #endif /* __LINUX_SPINLOCK_UP_H */ diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..8fd48b5552a7 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -32,8 +32,6 @@ * include/linux/spinlock_api_smp.h */ #else -#define raw_read_can_lock(l) read_can_lock(l) -#define raw_write_can_lock(l) write_can_lock(l) /* * Some architectures can relax in favour of the CPU owning the lock. @@ -68,7 +66,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + while ((lock)->break_lock) \ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ @@ -88,7 +86,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + while ((lock)->break_lock) \ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ From 0160fb177d484367e041ac251fca591a3e49660c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 19:25:28 +0100 Subject: [PATCH 0338/1085] locking/arch: Remove dummy arch_{read,spin,write}_relax() implementations arch_{read,spin,write}_relax() are defined as cpu_relax() by the core code, so architectures that can't do better (i.e. most of them) don't need to bother with the dummy definitions. 
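With the per-arch copies gone, a single core-side fallback supplies the default, and an architecture with a smarter implementation opts out by defining the macro name itself; the "#define arch_spin_relax arch_spin_relax" lines in the s390 hunk below are what make that work. A sketch of the core-side pattern, assuming the ifndef convention that hunk implies:

	/* Core code (sketch): default relax is a plain cpu_relax(). */
	#ifndef arch_spin_relax
	# define arch_spin_relax(l)	cpu_relax()
	#endif
	#ifndef arch_read_relax
	# define arch_read_relax(l)	cpu_relax()
	#endif
	#ifndef arch_write_relax
	# define arch_write_relax(l)	cpu_relax()
	#endif
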
Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507055129-12300-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/arc/include/asm/spinlock.h | 4 ---- arch/arm/include/asm/spinlock.h | 4 ---- arch/arm64/include/asm/spinlock.h | 4 ---- arch/blackfin/include/asm/spinlock.h | 4 ---- arch/ia64/include/asm/spinlock.h | 4 ---- arch/m32r/include/asm/spinlock.h | 4 ---- arch/metag/include/asm/spinlock.h | 4 ---- arch/metag/include/asm/spinlock_lnkget.h | 4 ---- arch/mips/include/asm/spinlock.h | 4 ---- arch/s390/include/asm/spinlock.h | 3 +++ arch/sh/include/asm/spinlock-cas.h | 4 ---- arch/sh/include/asm/spinlock-llsc.h | 4 ---- arch/sparc/include/asm/spinlock_32.h | 4 ---- arch/sparc/include/asm/spinlock_64.h | 4 ---- arch/x86/include/asm/spinlock.h | 4 ---- 15 files changed, 3 insertions(+), 56 deletions(-) diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index ce9bfcf1d870..f85bb585cdfc 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -413,8 +413,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index f5223260c73e..d40a28fcbc62 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -273,8 +273,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index 95ad7102b63c..1504f2b95c57 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -306,10 +306,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index 607ef98c0f6c..3885d12d9939 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -82,10 +82,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) __raw_write_unlock_asm(&rw->lock); } -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif #endif /* !__BFIN_SPINLOCK_H */ diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index c728dda59bd2..ed1e6212e9de 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -269,8 +269,4 @@ static inline int arch_read_trylock(arch_rwlock_t *x) return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == 
old.word; } -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* _ASM_IA64_SPINLOCK_H */ diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 002601371b2f..6809a9bbd169 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -308,8 +308,4 @@ static inline int arch_write_trylock(arch_rwlock_t *lock) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* _ASM_M32R_SPINLOCK_H */ diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h index ddf7fe5708a6..b5b4174cde5e 100644 --- a/arch/metag/include/asm/spinlock.h +++ b/arch/metag/include/asm/spinlock.h @@ -20,8 +20,4 @@ #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/metag/include/asm/spinlock_lnkget.h b/arch/metag/include/asm/spinlock_lnkget.h index 6a932a952d53..d5c334ddfd62 100644 --- a/arch/metag/include/asm/spinlock_lnkget.h +++ b/arch/metag/include/asm/spinlock_lnkget.h @@ -212,8 +212,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* __ASM_SPINLOCK_LNKGET_H */ diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index a7d21da16b6a..4260d3f80d3a 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -16,8 +16,4 @@ #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* _ASM_SPINLOCK_H */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index dc9c58ed9e4c..4eca60cc81e4 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -45,6 +45,7 @@ static inline void arch_spin_relax(arch_spinlock_t *lock) { arch_lock_relax(lock->lock); } +#define arch_spin_relax arch_spin_relax static inline u32 arch_spin_lockval(int cpu) { @@ -256,10 +257,12 @@ static inline void arch_read_relax(arch_rwlock_t *rw) { arch_lock_relax(rw->owner); } +#define arch_read_relax arch_read_relax static inline void arch_write_relax(arch_rwlock_t *rw) { arch_lock_relax(rw->owner); } +#define arch_write_relax arch_write_relax #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index 315467834521..295993c2598e 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -93,8 +93,4 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define 
arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* __ASM_SH_SPINLOCK_CAS_H */ diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index 06be4a55f3c4..a6f9edd15317 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -200,8 +200,4 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* __ASM_SH_SPINLOCK_LLSC_H */ diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 76986b8f7dad..9d9129efd5d6 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -186,10 +186,6 @@ static inline int __arch_read_trylock(arch_rwlock_t *rw) #define arch_read_lock_flags(rw, flags) arch_read_lock(rw) #define arch_write_lock_flags(rw, flags) arch_write_lock(rw) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* !(__ASSEMBLY__) */ #endif /* __SPARC_SPINLOCK_H */ diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index f7028f5e1a5a..3b67705e1b74 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -16,10 +16,6 @@ #define arch_read_lock_flags(p, f) arch_read_lock(p) #define arch_write_lock_flags(p, f) arch_write_lock(p) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* !(__ASSEMBLY__) */ #endif /* !(__SPARC64_SPINLOCK_H) */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 6d391909e864..a558c187f20c 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -44,8 +44,4 @@ #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* _ASM_X86_SPINLOCK_H */ From a4c1887d4c1462b0ec5a8989f8ba3cdd9057a299 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 19:25:29 +0100 Subject: [PATCH 0339/1085] locking/arch: Remove dummy arch_{read,spin,write}_lock_flags() implementations The arch_{read,spin,write}_lock_flags() macros are simply mapped to the non-flags versions by the majority of architectures, so do this in core code and remove the dummy implementations. Also remove the implementation in spinlock_up.h, since all callers of do_raw_spin_lock_flags() call local_irq_save(flags) anyway. 
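Here too the core grows a fallback, and an architecture that genuinely uses the flags (ia64 can re-enable interrupts while spinning) keeps its own version by defining the macro name, as the "#define arch_spin_lock_flags arch_spin_lock_flags" ia64 hunk further below shows. A sketch of the core-side mapping, assuming the same ifndef convention:

	/* Core code (sketch): by default the _flags variants ignore flags. */
	#ifndef arch_spin_lock_flags
	# define arch_spin_lock_flags(lock, flags)	arch_spin_lock(lock)
	#endif
	#ifndef arch_read_lock_flags
	# define arch_read_lock_flags(lock, flags)	arch_read_lock(lock)
	#endif
	#ifndef arch_write_lock_flags
	# define arch_write_lock_flags(lock, flags)	arch_write_lock(lock)
	#endif
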
Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507055129-12300-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/spinlock.h | 4 ---- arch/arc/include/asm/spinlock.h | 4 ---- arch/arm/include/asm/spinlock.h | 5 ----- arch/arm64/include/asm/spinlock.h | 5 ----- arch/blackfin/include/asm/spinlock.h | 6 ------ arch/hexagon/include/asm/spinlock.h | 5 ----- arch/ia64/include/asm/spinlock.h | 5 +++-- arch/m32r/include/asm/spinlock.h | 4 ---- arch/metag/include/asm/spinlock.h | 5 ----- arch/metag/include/asm/spinlock_lnkget.h | 3 --- arch/mips/include/asm/spinlock.h | 3 --- arch/mn10300/include/asm/spinlock.h | 4 +--- arch/parisc/include/asm/spinlock.h | 4 +--- arch/powerpc/include/asm/spinlock.h | 4 +--- arch/s390/include/asm/spinlock.h | 4 +--- arch/sh/include/asm/spinlock-cas.h | 4 ---- arch/sh/include/asm/spinlock-llsc.h | 4 ---- arch/sparc/include/asm/spinlock_32.h | 4 ---- arch/sparc/include/asm/spinlock_64.h | 3 --- arch/tile/include/asm/spinlock_32.h | 6 ------ arch/tile/include/asm/spinlock_64.h | 6 ------ arch/x86/include/asm/spinlock.h | 3 --- arch/xtensa/include/asm/spinlock.h | 5 ----- include/asm-generic/qspinlock.h | 1 - include/linux/rwlock.h | 9 +++++++++ include/linux/spinlock.h | 4 ++++ include/linux/spinlock_up.h | 8 -------- 27 files changed, 20 insertions(+), 102 deletions(-) diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index 7bff6316b8bb..3e2b4a05cb0f 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -13,7 +13,6 @@ * We make no fairness assumptions. They have a cost. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_spin_is_locked(x) ((x)->lock != 0) static inline int arch_spin_value_unlocked(arch_spinlock_t lock) @@ -160,7 +159,4 @@ static inline void arch_write_unlock(arch_rwlock_t * lock) lock->lock = 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ALPHA_SPINLOCK_H */ diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index f85bb585cdfc..2ba04a7db621 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -14,7 +14,6 @@ #include #define arch_spin_is_locked(x) ((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #ifdef CONFIG_ARC_HAS_LLSC @@ -410,7 +409,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #endif -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index d40a28fcbc62..daa87212c9a1 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -52,8 +52,6 @@ static inline void dsb_sev(void) * memory. 
*/ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - static inline void arch_spin_lock(arch_spinlock_t *lock) { unsigned long tmp; @@ -270,7 +268,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) } } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index 1504f2b95c57..aa51a38e46e4 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -27,8 +27,6 @@ * instructions. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - static inline void arch_spin_lock(arch_spinlock_t *lock) { unsigned int tmp; @@ -303,9 +301,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) /* read_can_lock - would read_trylock() succeed? */ #define arch_read_can_lock(x) ((x)->lock < 0x80000000) -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index 3885d12d9939..839d1441af3a 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -36,8 +36,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) __raw_spin_lock_asm(&lock->lock); } -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - static inline int arch_spin_trylock(arch_spinlock_t *lock) { return __raw_spin_trylock_asm(&lock->lock); @@ -53,8 +51,6 @@ static inline void arch_read_lock(arch_rwlock_t *rw) __raw_read_lock_asm(&rw->lock); } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) - static inline int arch_read_trylock(arch_rwlock_t *rw) { return __raw_read_trylock_asm(&rw->lock); @@ -70,8 +66,6 @@ static inline void arch_write_lock(arch_rwlock_t *rw) __raw_write_lock_asm(&rw->lock); } -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - static inline int arch_write_trylock(arch_rwlock_t *rw) { return __raw_write_trylock_asm(&rw->lock); diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index 9f9414b9c303..48020863f53a 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -167,11 +167,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock) /* * SMP spinlocks are intended to allow only a single CPU at the lock */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - #define arch_spin_is_locked(x) ((x)->lock != 0) -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index ed1e6212e9de..35b31884863b 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -126,6 +126,7 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, { arch_spin_lock(lock); } +#define arch_spin_lock_flags arch_spin_lock_flags #ifdef ASM_SUPPORTED @@ -153,6 +154,7 @@ arch_read_lock_flags(arch_rwlock_t *lock, unsigned long flags) : "p6", "p7", "r2", "memory"); } +#define arch_read_lock_flags arch_read_lock_flags #define arch_read_lock(lock) arch_read_lock_flags(lock, 0) #else /* !ASM_SUPPORTED */ @@ -205,6 +207,7 @@ arch_write_lock_flags(arch_rwlock_t *lock, unsigned long 
flags) : "ar.ccv", "p6", "p7", "r2", "r29", "memory"); } +#define arch_write_lock_flags arch_write_lock_flags #define arch_write_lock(rw) arch_write_lock_flags(rw, 0) #define arch_write_trylock(rw) \ @@ -228,8 +231,6 @@ static inline void arch_write_unlock(arch_rwlock_t *x) #else /* !ASM_SUPPORTED */ -#define arch_write_lock_flags(l, flags) arch_write_lock(l) - #define arch_write_lock(l) \ ({ \ __u64 ia64_val, ia64_set_val = ia64_dep_mi(-1, 0, 31, 1); \ diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 6809a9bbd169..882203db8723 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -28,7 +28,6 @@ */ #define arch_spin_is_locked(x) (*(volatile int *)(&(x)->slock) <= 0) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) /** * arch_spin_trylock - Try spin lock and return a result @@ -305,7 +304,4 @@ static inline int arch_write_trylock(arch_rwlock_t *lock) return 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_M32R_SPINLOCK_H */ diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h index b5b4174cde5e..80e3e59172f2 100644 --- a/arch/metag/include/asm/spinlock.h +++ b/arch/metag/include/asm/spinlock.h @@ -15,9 +15,4 @@ * locked. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/metag/include/asm/spinlock_lnkget.h b/arch/metag/include/asm/spinlock_lnkget.h index d5c334ddfd62..5708ac0a9d09 100644 --- a/arch/metag/include/asm/spinlock_lnkget.h +++ b/arch/metag/include/asm/spinlock_lnkget.h @@ -209,7 +209,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) return tmp; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_LNKGET_H */ diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 4260d3f80d3a..ee81297d9117 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -13,7 +13,4 @@ #include #include -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_SPINLOCK_H */ diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 54f75dac8094..879cd0df53ba 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -84,6 +84,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lock, : "d" (flags), "a"(&lock->slock), "i"(EPSW_IE | MN10300_CLI_LEVEL) : "memory", "cc"); } +#define arch_spin_lock_flags arch_spin_lock_flags #ifdef __KERNEL__ @@ -171,9 +172,6 @@ static inline int arch_write_trylock(arch_rwlock_t *lock) return 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 136e1c9bb8a9..d66d7b1efc4e 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -31,6 +31,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t 
*x, cpu_relax(); mb(); } +#define arch_spin_lock_flags arch_spin_lock_flags static inline void arch_spin_unlock(arch_spinlock_t *x) { @@ -168,7 +169,4 @@ static __inline__ int arch_write_trylock(arch_rwlock_t *rw) return result; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index d83f4f755ad8..b9ebc3085fb7 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -161,6 +161,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) local_irq_restore(flags_dis); } } +#define arch_spin_lock_flags arch_spin_lock_flags static inline void arch_spin_unlock(arch_spinlock_t *lock) { @@ -299,9 +300,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) rw->lock = 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #define arch_spin_relax(lock) __spin_yield(lock) #define arch_read_relax(lock) __rw_yield(lock) #define arch_write_relax(lock) __rw_yield(lock) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 4eca60cc81e4..9fa855f91e55 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -81,6 +81,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lp, if (!arch_spin_trylock_once(lp)) arch_spin_lock_wait_flags(lp, flags); } +#define arch_spin_lock_flags arch_spin_lock_flags static inline int arch_spin_trylock(arch_spinlock_t *lp) { @@ -114,9 +115,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) extern int _raw_read_trylock_retry(arch_rwlock_t *lp); extern int _raw_write_trylock_retry(arch_rwlock_t *lp); -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - static inline int arch_read_trylock_once(arch_rwlock_t *rw) { int old = ACCESS_ONCE(rw->lock); diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index 295993c2598e..270ee4d3e25b 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -27,7 +27,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new */ #define arch_spin_is_locked(x) ((x)->lock <= 0) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) { @@ -90,7 +89,4 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) return __sl_cas(&rw->lock, RW_LOCK_BIAS, 0) == RW_LOCK_BIAS; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SH_SPINLOCK_CAS_H */ diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index a6f9edd15317..715595de286a 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -19,7 +19,6 @@ */ #define arch_spin_is_locked(x) ((x)->lock <= 0) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) /* * Simple spin lock operations. 
There are two variants, one clears IRQ's @@ -197,7 +196,4 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) return (oldval > (RW_LOCK_BIAS - 1)); } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SH_SPINLOCK_LLSC_H */ diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 9d9129efd5d6..12bf857b471e 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -182,10 +182,6 @@ static inline int __arch_read_trylock(arch_rwlock_t *rw) res; \ }) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -#define arch_read_lock_flags(rw, flags) arch_read_lock(rw) -#define arch_write_lock_flags(rw, flags) arch_write_lock(rw) - #endif /* !(__ASSEMBLY__) */ #endif /* __SPARC_SPINLOCK_H */ diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index 3b67705e1b74..99b6e1c4f630 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -13,9 +13,6 @@ #include #include -#define arch_read_lock_flags(p, f) arch_read_lock(p) -#define arch_write_lock_flags(p, f) arch_write_lock(p) - #endif /* !(__ASSEMBLY__) */ #endif /* !(__SPARC64_SPINLOCK_H) */ diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index 91d05f21cba9..fb5313d77315 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -51,9 +51,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) void arch_spin_lock(arch_spinlock_t *lock); -/* We cannot take an interrupt after getting a ticket, so don't enable them. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - int arch_spin_trylock(arch_spinlock_t *lock); static inline void arch_spin_unlock(arch_spinlock_t *lock) @@ -109,7 +106,4 @@ void arch_read_unlock(arch_rwlock_t *rwlock); */ void arch_write_unlock(arch_rwlock_t *rwlock); -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_TILE_SPINLOCK_32_H */ diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h index c802f48badf4..5b616ef642a8 100644 --- a/arch/tile/include/asm/spinlock_64.h +++ b/arch/tile/include/asm/spinlock_64.h @@ -75,9 +75,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) /* Try to get the lock, and return whether we succeeded. */ int arch_spin_trylock(arch_spinlock_t *lock); -/* We cannot take an interrupt after getting a ticket, so don't enable them. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - /* * Read-write spinlocks, allowing multiple readers * but only one writer. 
@@ -138,7 +135,4 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) return 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_TILE_SPINLOCK_64_H */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index a558c187f20c..c6a6adf0a5c5 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -41,7 +41,4 @@ #include -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index d005af51e2e1..c6e1290dcbb7 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -33,8 +33,6 @@ #define arch_spin_is_locked(x) ((x)->slock != 0) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - static inline void arch_spin_lock(arch_spinlock_t *lock) { unsigned long tmp; @@ -198,7 +196,4 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) : "memory"); } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _XTENSA_SPINLOCK_H */ diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 66260777d644..b37b4ad7eb94 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -121,6 +121,5 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock) #define arch_spin_lock(l) queued_spin_lock(l) #define arch_spin_trylock(l) queued_spin_trylock(l) #define arch_spin_unlock(l) queued_spin_unlock(l) -#define arch_spin_lock_flags(l, f) queued_spin_lock(l) #endif /* __ASM_GENERIC_QSPINLOCK_H */ diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 766c5ca5cbd1..3dcd617e65ae 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -38,6 +38,15 @@ do { \ extern int do_raw_write_trylock(rwlock_t *lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else + +#ifndef arch_read_lock_flags +# define arch_read_lock_flags(lock, flags) arch_read_lock(lock) +#endif + +#ifndef arch_write_lock_flags +# define arch_write_lock_flags(lock, flags) arch_write_lock(lock) +#endif + # define do_raw_read_lock(rwlock) do {__acquire(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) # define do_raw_read_lock_flags(lock, flags) \ do {__acquire(lock); arch_read_lock_flags(&(lock)->raw_lock, *(flags)); } while (0) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 1e3e48041800..4e202b00dd66 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -165,6 +165,10 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) arch_spin_lock(&lock->raw_lock); } +#ifndef arch_spin_lock_flags +#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) +#endif + static inline void do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock) { diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 901cf8f44388..0ac9112c1bbe 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -32,14 +32,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) barrier(); } -static inline void -arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - local_irq_save(flags); - lock->slock = 0; - barrier(); -} - static inline int 
arch_spin_trylock(arch_spinlock_t *lock) { char oldval = lock->slock; From 24281a2f4ca569d44e44e4cbc500cf08e7bb4c36 Mon Sep 17 00:00:00 2001 From: Luis Felipe Sandoval Castro Date: Thu, 28 Sep 2017 08:54:42 -0500 Subject: [PATCH 0340/1085] EDAC, sb_edac: Fix missing DIMM sysfs entries with KNL SNC2/SNC4 mode When figuring out the size of the DIMMs while the cluster mode is SNC2 or SNC4, the current algorithm ignores the contribution of some of the channels, so EDAC never learns of the existence of some DIMMs attached to those channels (and sysfs is thus not populated). Instead of selectively iterating from 0 to intrlv_ways when looking for all the participants in the interleave, do an exhaustive search and iterate from 0 to KNL_MAX_CHANNELS. The algorithm is already smart enough to consider each participant only once. This works fine in all KNL cluster modes, even when there are missing DIMMs, as the contribution of those channels is 0. Signed-off-by: Luis Felipe Sandoval Castro Acked-by: Tony Luck Cc: Mauro Carvalho Chehab Cc: arozansk@redhat.com Cc: linux-edac Cc: qiuxu.zhuo@intel.com Link: http://lkml.kernel.org/r/1506606882-90521-1-git-send-email-luis.felipe.sandoval.castro@intel.com Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 54d9d179cb77..72b98a081d2b 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1318,9 +1318,7 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes) int cur_reg_start; int mc; int channel; - int way; int participants[KNL_MAX_CHANNELS]; - int participant_count = 0; for (i = 0; i < KNL_MAX_CHANNELS; i++) mc_sizes[i] = 0; @@ -1495,21 +1493,14 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes) * this channel mapped to the given target? */ for (channel = 0; channel < KNL_MAX_CHANNELS; channel++) { - for (way = 0; way < intrlv_ways; way++) { - int target; - int cha; - - if (KNL_MOD3(dram_rule)) - target = way; - else - target = 0x7 & sad_pkg( - pvt->info.interleave_pkg, interleave_reg, way); + int target; + int cha; + for (target = 0; target < KNL_MAX_CHANNELS; target++) { for (cha = 0; cha < KNL_MAX_CHAS; cha++) { if (knl_get_mc_route(target, mc_route_reg[cha]) == channel && !participants[channel]) { - participant_count++; participants[channel] = 1; break; } @@ -1517,10 +1508,6 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes) } } - if (participant_count != intrlv_ways) - edac_dbg(0, "participant_count (%d) != interleave_ways (%d): DIMM size may be incorrect\n", - participant_count, intrlv_ways); - for (channel = 0; channel < KNL_MAX_CHANNELS; channel++) { mc = knl_channel_mc(channel); if (participants[channel]) { From dc0fdf7d2bf9e7efdc690b8a594e39de5b3a993b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 21 Sep 2017 00:27:12 +0900 Subject: [PATCH 0341/1085] x86/boot: Remove unnecessary #include <generated/utsrelease.h> The <generated/utsrelease.h> header defines UTS_RELEASE, but I do not see any reference to it in arch/x86/boot/header.S.
Signed-off-by: Masahiro Yamada Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1505921232-8960-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Ingo Molnar --- arch/x86/boot/header.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 1bb08ecffd24..16c516b53758 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -16,7 +16,6 @@ */ #include -#include #include #include #include From df8bbd0c98bef5bcf691709437c4102e9840b0f5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 6 Oct 2017 13:17:10 +0200 Subject: [PATCH 0342/1085] s390/kprobes: remove KPROBE_SWAP_INST state For an unknown reason the s390 kprobes instruction replacement function modifies the kprobe_status of the current CPU to KPROBE_SWAP_INST. This was supposed to catch traps that happened during instruction patching. Such a fault is not supposed to happen, and silently discarding such a fault is certainly also not what we want. In fact s390 is the only architecture which has this odd piece of code. Just remove this and behave like all other architectures. This was pointed out by Jens Remus. Reported-by: Jens Remus Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/kprobes.h | 2 -- arch/s390/kernel/kprobes.c | 7 ------- 2 files changed, 9 deletions(-) diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 28792ef82c83..921391f2341e 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -63,8 +63,6 @@ typedef u16 kprobe_opcode_t; #define kretprobe_blacklist_size 0 -#define KPROBE_SWAP_INST 0x10 - /* Architecture specific copy of original instruction */ struct arch_specific_insn { /* copy of original instruction */ diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 6842e4501e2e..1a6521af1751 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -161,8 +161,6 @@ struct swap_insn_args { static int swap_instruction(void *data) { - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - unsigned long status = kcb->kprobe_status; struct swap_insn_args *args = data; struct ftrace_insn new_insn, *insn; struct kprobe *p = args->p; @@ -185,9 +183,7 @@ static int swap_instruction(void *data) ftrace_generate_nop_insn(&new_insn); } skip_ftrace: - kcb->kprobe_status = KPROBE_SWAP_INST; s390_kernel_write(p->addr, &new_insn, len); - kcb->kprobe_status = status; return 0; } NOKPROBE_SYMBOL(swap_instruction); @@ -574,9 +570,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) const struct exception_table_entry *entry; switch(kcb->kprobe_status) { - case KPROBE_SWAP_INST: - /* We are here because the instruction replacement failed */ - return 0; case KPROBE_HIT_SS: case KPROBE_REENTER: /* From 1e4078f0bba46ad61b69548abe6a6faf63b89380 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Oct 2017 09:24:30 +0200 Subject: [PATCH 0343/1085] x86/unwinder: Make CONFIG_UNWINDER_ORC=y the default in the 64-bit defconfig Increase testing coverage by turning on the primary x86 unwinder for the 64-bit defconfig. 
Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/configs/x86_64_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4a4b16e56d35..eb65c248708d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y +CONFIG_ORC_UNWINDER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y From 02edee152d6ea325c88898f3a702f5db2d78de7a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Oct 2017 11:05:28 +0200 Subject: [PATCH 0344/1085] x86/apic/vector: Ignore set_affinity call for inactive interrupts The core interrupt code can call the affinity setter for inactive interrupts under certain circumstances. For inactive interrupts which use managed or reservation mode this is a pointless exercise, as the activation will assign a vector which fits the destination mask. Check for this and return without going through the vector assignment. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6789e286def9..573538e0981e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -726,8 +726,21 @@ void lapic_offline(void) static int apic_set_affinity(struct irq_data *irqd, const struct cpumask *dest, bool force) { + struct apic_chip_data *apicd = apic_chip_data(irqd); int err; + /* + * Core code can call here for inactive interrupts. For inactive + * interrupts which use managed or reservation mode there is no + * point in going through the vector assignment right now as the + * activation will assign a vector which fits the destination + * cpumask. Let the core code store the destination mask and be + * done with it. + */ + if (!irqd_is_activated(irqd) && + (apicd->is_managed || apicd->can_reserve)) + return IRQ_SET_MASK_OK; + raw_spin_lock(&vector_lock); cpumask_and(vector_searchmask, dest, cpu_online_mask); if (irqd_affinity_is_managed(irqd)) From da379f3c1db0c9a1fd27b11d24c9894b5edc7c75 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 20 Jun 2017 11:38:03 +0200 Subject: [PATCH 0345/1085] tpm: migrate pubek_show to struct tpm_buf Migrated pubek_show to struct tpm_buf and cleaned up its implementation. Previously the output parameter structure was declared but left completely unused. Now it is used to refer to the different fields of the output. We can move it to tpm-sysfs.c as it does not have any use outside of that file.
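The core of the clean-up is replacing magic byte offsets into the response buffer with a packed struct overlaid on the payload. Here is a self-contained user-space sketch of that technique (an illustration under assumptions, not the driver code: the buffer contents are mock data, ntohl() stands in for be32_to_cpu(), and the field layout is copied from the struct this patch introduces):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl() stands in for be32_to_cpu() */

struct tpm_readpubek_out {	/* response body layout, as in the patch */
	uint8_t  algorithm[4];
	uint8_t  encscheme[2];
	uint8_t  sigscheme[2];
	uint32_t paramsize;	/* big-endian on the wire */
	uint8_t  parameters[12];
	uint32_t keysize;	/* big-endian on the wire */
	uint8_t  modulus[256];
	uint8_t  checksum[20];
} __attribute__((packed));

int main(void)
{
	/* Mock response: a 10-byte TPM header followed by the body. */
	uint8_t resp[10 + sizeof(struct tpm_readpubek_out)];
	struct tpm_readpubek_out *out;

	memset(resp, 0, sizeof(resp));
	resp[13] = 0x01;	/* algorithm = 0x00000001 (RSA) */
	resp[36] = 0x01;	/* keysize   = 0x00000100 (256 bytes) */

	/* Overlay the struct on the payload instead of indexing magic
	 * offsets such as data[24]. */
	out = (struct tpm_readpubek_out *)&resp[10];

	printf("Algorithm: %02X %02X %02X %02X\n",
	       out->algorithm[0], out->algorithm[1],
	       out->algorithm[2], out->algorithm[3]);
	printf("Modulus length: %u\n", ntohl(out->keysize));
	return 0;
}

With the overlay in place, named fields such as out->keysize document the offsets that the old data[24]-style indexing left implicit.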
Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-sysfs.c | 83 ++++++++++++++++++++---------------- drivers/char/tpm/tpm.h | 13 ------ 2 files changed, 46 insertions(+), 50 deletions(-) diff --git a/drivers/char/tpm/tpm-sysfs.c b/drivers/char/tpm/tpm-sysfs.c index 86f38d239476..83a77a445538 100644 --- a/drivers/char/tpm/tpm-sysfs.c +++ b/drivers/char/tpm/tpm-sysfs.c @@ -20,44 +20,48 @@ #include #include "tpm.h" -#define READ_PUBEK_RESULT_SIZE 314 +struct tpm_readpubek_out { + u8 algorithm[4]; + u8 encscheme[2]; + u8 sigscheme[2]; + __be32 paramsize; + u8 parameters[12]; + __be32 keysize; + u8 modulus[256]; + u8 checksum[20]; +} __packed; + #define READ_PUBEK_RESULT_MIN_BODY_SIZE (28 + 256) #define TPM_ORD_READPUBEK 124 -static const struct tpm_input_header tpm_readpubek_header = { - .tag = cpu_to_be16(TPM_TAG_RQU_COMMAND), - .length = cpu_to_be32(30), - .ordinal = cpu_to_be32(TPM_ORD_READPUBEK) -}; + static ssize_t pubek_show(struct device *dev, struct device_attribute *attr, char *buf) { - u8 *data; - struct tpm_cmd_t tpm_cmd; - ssize_t err; - int i, rc; + struct tpm_buf tpm_buf; + struct tpm_readpubek_out *out; + ssize_t rc; + int i; char *str = buf; struct tpm_chip *chip = to_tpm_chip(dev); + char anti_replay[20]; - memset(&tpm_cmd, 0, sizeof(tpm_cmd)); + memset(&anti_replay, 0, sizeof(anti_replay)); - tpm_cmd.header.in = tpm_readpubek_header; - err = tpm_transmit_cmd(chip, NULL, &tpm_cmd, READ_PUBEK_RESULT_SIZE, - READ_PUBEK_RESULT_MIN_BODY_SIZE, 0, - "attempting to read the PUBEK"); - if (err) - goto out; + rc = tpm_buf_init(&tpm_buf, TPM_TAG_RQU_COMMAND, TPM_ORD_READPUBEK); + if (rc) + return rc; - /* - ignore header 10 bytes - algorithm 32 bits (1 == RSA ) - encscheme 16 bits - sigscheme 16 bits - parameters (RSA 12->bytes: keybit, #primes, expbit) - keylenbytes 32 bits - 256 byte modulus - ignore checksum 20 bytes - */ - data = tpm_cmd.params.readpubek_out_buffer; + tpm_buf_append(&tpm_buf, anti_replay, sizeof(anti_replay)); + + rc = tpm_transmit_cmd(chip, NULL, tpm_buf.data, PAGE_SIZE, + READ_PUBEK_RESULT_MIN_BODY_SIZE, 0, + "attempting to read the PUBEK"); + if (rc) { + tpm_buf_destroy(&tpm_buf); + return 0; + } + + out = (struct tpm_readpubek_out *)&tpm_buf.data[10]; str += sprintf(str, "Algorithm: %02X %02X %02X %02X\n" @@ -68,21 +72,26 @@ static ssize_t pubek_show(struct device *dev, struct device_attribute *attr, "%02X %02X %02X %02X\n" "Modulus length: %d\n" "Modulus:\n", - data[0], data[1], data[2], data[3], - data[4], data[5], - data[6], data[7], - data[12], data[13], data[14], data[15], - data[16], data[17], data[18], data[19], - data[20], data[21], data[22], data[23], - be32_to_cpu(*((__be32 *) (data + 24)))); + out->algorithm[0], out->algorithm[1], out->algorithm[2], + out->algorithm[3], + out->encscheme[0], out->encscheme[1], + out->sigscheme[0], out->sigscheme[1], + out->parameters[0], out->parameters[1], + out->parameters[2], out->parameters[3], + out->parameters[4], out->parameters[5], + out->parameters[6], out->parameters[7], + out->parameters[8], out->parameters[9], + out->parameters[10], out->parameters[11], + be32_to_cpu(out->keysize)); for (i = 0; i < 256; i++) { - str += sprintf(str, "%02X ", data[i + 28]); + str += sprintf(str, "%02X ", out->modulus[i]); if ((i + 1) % 16 == 0) str += sprintf(str, "\n"); } -out: + rc = str - buf; + tpm_buf_destroy(&tpm_buf); return rc; } static DEVICE_ATTR_RO(pubek); diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 2d5466a72e40..b50e92fbca31 100644 --- a/drivers/char/tpm/tpm.h +++ 
b/drivers/char/tpm/tpm.h @@ -345,17 +345,6 @@ enum tpm_sub_capabilities { TPM_CAP_PROP_TIS_DURATION = 0x120, }; -struct tpm_readpubek_params_out { - u8 algorithm[4]; - u8 encscheme[2]; - u8 sigscheme[2]; - __be32 paramsize; - u8 parameters[12]; /*assuming RSA*/ - __be32 keysize; - u8 modulus[256]; - u8 checksum[20]; -} __packed; - typedef union { struct tpm_input_header in; struct tpm_output_header out; @@ -385,8 +374,6 @@ struct tpm_getrandom_in { } __packed; typedef union { - struct tpm_readpubek_params_out readpubek_out; - u8 readpubek_out_buffer[sizeof(struct tpm_readpubek_params_out)]; struct tpm_pcrread_in pcrread_in; struct tpm_pcrread_out pcrread_out; struct tpm_getrandom_in getrandom_in; From d5cdbb875f5c450bf9ee950008a2865343c1775c Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 2 Oct 2017 17:44:17 -0600 Subject: [PATCH 0346/1085] doc: dev-tools: kselftest.rst: update to include make O=dir support Update to include details on make O=dir support and other changes that improve test results output. Signed-off-by: Shuah Khan [jc: Tweaked RST formatting slightly] Signed-off-by: Jonathan Corbet --- Documentation/dev-tools/kselftest.rst | 34 +++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index ebd03d11d2c2..e80850eefe13 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -31,6 +31,17 @@ To build and run the tests with a single command, use:: Note that some tests will require root privileges. +Build and run from user specific object directory (make O=dir):: + + $ make O=/tmp/kselftest kselftest + +Build and run KBUILD_OUTPUT directory (make KBUILD_OUTPUT=):: + + $ make KBUILD_OUTPUT=/tmp/kselftest kselftest + +The above commands run the tests and print pass/fail summary to make it +easier to understand the test results. Please find the detailed individual +test results for each test in /tmp/testname file(s). Running a subset of selftests ============================= @@ -46,10 +57,21 @@ You can specify multiple tests to build and run:: $ make TARGETS="size timers" kselftest +Build and run from user specific object directory (make O=dir):: + + $ make O=/tmp/kselftest TARGETS="size timers" kselftest + +Build and run KBUILD_OUTPUT directory (make KBUILD_OUTPUT=):: + + $ make KBUILD_OUTPUT=/tmp/kselftest TARGETS="size timers" kselftest + +The above commands run the tests and print pass/fail summary to make it easier to understand the test results. Please find the detailed individual test results for each test in /tmp/testname file(s). + See the top-level tools/testing/selftests/Makefile for the list of all possible targets. - Running the full range hotplug selftests ======================================== @@ -113,9 +135,17 @@ Contributing new tests (details) * Use TEST_GEN_XXX if such binaries or files are generated during compiling. - TEST_PROGS, TEST_GEN_PROGS mean it is the excutable tested by + TEST_PROGS, TEST_GEN_PROGS mean it is the executable tested by default. + TEST_CUSTOM_PROGS should be used by tests that require custom build + rule and prevent common build rule use. + + TEST_PROGS are for test shell scripts. Please ensure shell script has + its exec bit set. Otherwise, lib.mk run_tests will generate a warning. + + TEST_CUSTOM_PROGS and TEST_PROGS will be run by common run_tests. + TEST_PROGS_EXTENDED, TEST_GEN_PROGS_EXTENDED mean it is the executable which is not tested by default.
TEST_FILES, TEST_GEN_FILES mean it is the file which is used by From 9effc8f70b87cf0c02a2ce264257c34a5fe885d5 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 2 Oct 2017 17:44:18 -0600 Subject: [PATCH 0347/1085] doc: enhance dochelp include default output location for doc build Enhance the documentation help message to specify the default location for the generated documents. Signed-off-by: Shuah Khan Signed-off-by: Jonathan Corbet --- Documentation/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/Makefile b/Documentation/Makefile index 85f7856f0092..5e65fa5c6ab7 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -116,3 +116,5 @@ dochelp: @echo @echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build' @echo ' configuration. This is e.g. useful to build with nit-picking config.' + @echo + @echo ' Default location for the generated documents is Documentation/output' From 8d73c512e648bee83b912733876b9b4071353265 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 2 Oct 2017 17:44:19 -0600 Subject: [PATCH 0348/1085] Makefile: enable dochelp run from main make level Enable dochelp to be run from the main make level, making it easier to use. Signed-off-by: Shuah Khan Signed-off-by: Jonathan Corbet --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d1119941261c..bcb20d2c1eac 100644 --- a/Makefile +++ b/Makefile @@ -1454,7 +1454,7 @@ $(help-board-dirs): help-%: # Documentation targets # --------------------------------------------------------------------------- -DOC_TARGETS := xmldocs latexdocs pdfdocs htmldocs epubdocs cleandocs linkcheckdocs +DOC_TARGETS := xmldocs latexdocs pdfdocs htmldocs epubdocs cleandocs linkcheckdocs dochelp PHONY += $(DOC_TARGETS) $(DOC_TARGETS): scripts_basic FORCE $(Q)$(MAKE) $(build)=Documentation $@ From e8939222dced668fc5cae02b0b601af069801107 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 9 Oct 2017 18:26:15 +0300 Subject: [PATCH 0349/1085] Documentation: add script and build target to check for broken file references Add a simple script and build target to do a treewide grep for references to files under Documentation, and report non-existing files on stderr. It tries to take into account punctuation that is not part of the filename, as well as wildcards, but there are bound to be some false positives. It mostly seems accurate, though. We've moved files around enough to make having this worthwhile. Signed-off-by: Jani Nikula Signed-off-by: Jonathan Corbet --- Documentation/Makefile | 4 ++++ Makefile | 3 ++- scripts/documentation-file-ref-check | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100755 scripts/documentation-file-ref-check diff --git a/Documentation/Makefile b/Documentation/Makefile index 5e65fa5c6ab7..2ca77ad0f238 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -97,6 +97,9 @@ endif # HAVE_SPHINX # The following targets are independent of HAVE_SPHINX, and the rules should # work or silently pass without Sphinx.
+refcheckdocs: + $(Q)cd $(srctree);scripts/documentation-file-ref-check + cleandocs: $(Q)rm -rf $(BUILDDIR) $(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean @@ -109,6 +112,7 @@ dochelp: @echo ' epubdocs - EPUB' @echo ' xmldocs - XML' @echo ' linkcheckdocs - check for broken external links (will connect to external hosts)' + @echo ' refcheckdocs - check for references to non-existing files under Documentation' @echo ' cleandocs - clean all generated files' @echo @echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2' diff --git a/Makefile b/Makefile index bcb20d2c1eac..11df924e160a 100644 --- a/Makefile +++ b/Makefile @@ -1454,7 +1454,8 @@ $(help-board-dirs): help-%: # Documentation targets # --------------------------------------------------------------------------- -DOC_TARGETS := xmldocs latexdocs pdfdocs htmldocs epubdocs cleandocs linkcheckdocs dochelp +DOC_TARGETS := xmldocs latexdocs pdfdocs htmldocs epubdocs cleandocs \ + linkcheckdocs dochelp refcheckdocs PHONY += $(DOC_TARGETS) $(DOC_TARGETS): scripts_basic FORCE $(Q)$(MAKE) $(build)=Documentation $@ diff --git a/scripts/documentation-file-ref-check b/scripts/documentation-file-ref-check new file mode 100755 index 000000000000..bc1659900e89 --- /dev/null +++ b/scripts/documentation-file-ref-check @@ -0,0 +1,15 @@ +#!/bin/sh +# Treewide grep for references to files under Documentation, and report +# non-existing files in stderr. + +for f in $(git ls-files); do + for ref in $(grep -ho "Documentation/[A-Za-z0-9_.,~/*+-]*" "$f"); do + # presume trailing . and , are not part of the name + ref=${ref%%[.,]} + + # use ls to handle wildcards + if ! ls $ref >/dev/null 2>&1; then + echo "$f: $ref" >&2 + fi + done +done From 66ccc64f2c3b934f065811160be288cb9a2815ef Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 10 Oct 2017 12:36:09 -0500 Subject: [PATCH 0350/1085] Documentation: fix driver-api doc refs Make driver-api document refs valid. Signed-off-by: Tom Saeger Acked-by: Rafael J. Wysocki Signed-off-by: Jonathan Corbet --- Documentation/power/pci.txt | 10 +++++----- Documentation/power/runtime_pm.txt | 2 +- Documentation/process/submitting-drivers.rst | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt index a1b7f7158930..d17fdf8f45ef 100644 --- a/Documentation/power/pci.txt +++ b/Documentation/power/pci.txt @@ -8,7 +8,7 @@ management. Based on previous work by Patrick Mochel This document only covers the aspects of power management specific to PCI devices. For general description of the kernel's interfaces related to device -power management refer to Documentation/power/admin-guide/devices.rst and +power management refer to Documentation/driver-api/pm/devices.rst and Documentation/power/runtime_pm.txt. --------------------------------------------------------------------------- @@ -417,7 +417,7 @@ pm->runtime_idle() callback. 2.4. System-Wide Power Transitions ---------------------------------- There are a few different types of system-wide power transitions, described in -Documentation/power/admin-guide/devices.rst. Each of them requires devices to be handled +Documentation/driver-api/pm/devices.rst. Each of them requires devices to be handled in a specific way and the PM core executes subsystem-level power management callbacks for this purpose. 
They are executed in phases such that each phase involves executing the same subsystem-level callback for every device belonging @@ -623,7 +623,7 @@ System restore requires a hibernation image to be loaded into memory and the pre-hibernation memory contents to be restored before the pre-hibernation system activity can be resumed. -As described in Documentation/power/admin-guide/devices.rst, the hibernation image is loaded +As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded into memory by a fresh instance of the kernel, called the boot kernel, which in turn is loaded and run by a boot loader in the usual way. After the boot kernel has loaded the image, it needs to replace its own code and data with the code @@ -677,7 +677,7 @@ controlling the runtime power management of their devices. At the time of this writing there are two ways to define power management callbacks for a PCI device driver, the recommended one, based on using a -dev_pm_ops structure described in Documentation/power/admin-guide/devices.rst, and the +dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the "legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and .resume() callbacks from struct pci_driver are used. The legacy approach, however, doesn't allow one to define runtime power management callbacks and is @@ -1046,5 +1046,5 @@ PCI Local Bus Specification, Rev. 3.0 PCI Bus Power Management Interface Specification, Rev. 1.2 Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b PCI Express Base Specification, Rev. 2.0 -Documentation/power/admin-guide/devices.rst +Documentation/driver-api/pm/devices.rst Documentation/power/runtime_pm.txt diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 625549d4c74a..57af2f7963ee 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -680,7 +680,7 @@ left in runtime suspend. If that happens, the PM core will not execute any system suspend and resume callbacks for all of those devices, except for the complete callback, which is then entirely responsible for handling the device as appropriate. This only applies to system suspend transitions that are not -related to hibernation (see Documentation/power/admin-guide/devices.rst for more +related to hibernation (see Documentation/driver-api/pm/devices.rst for more information). The PM core does its best to reduce the probability of race conditions between diff --git a/Documentation/process/submitting-drivers.rst b/Documentation/process/submitting-drivers.rst index afb82ee0cbea..b38bf2054ce3 100644 --- a/Documentation/process/submitting-drivers.rst +++ b/Documentation/process/submitting-drivers.rst @@ -117,7 +117,7 @@ PM support: anything. For the driver testing instructions see Documentation/power/drivers-testing.txt and for a relatively complete overview of the power management issues related to - drivers see Documentation/power/admin-guide/devices.rst . + drivers see Documentation/driver-api/pm/devices.rst. Control: In general if there is active maintenance of a driver by From 3ba9b1b814fe37ba3267b58917a73dc69eebde40 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 10 Oct 2017 12:36:16 -0500 Subject: [PATCH 0351/1085] Documentation: fix admin-guide doc refs Make admin-guide document refs valid. Signed-off-by: Tom Saeger Acked-by: Rafael J. 
Wysocki Signed-off-by: Jonathan Corbet --- Documentation/ABI/stable/sysfs-devices | 2 +- Documentation/ABI/testing/sysfs-devices-system-cpu | 6 ++++-- Documentation/ABI/testing/sysfs-power | 6 ++++-- Documentation/admin-guide/README.rst | 2 +- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/reporting-bugs.rst | 4 ++-- Documentation/laptops/laptop-mode.txt | 6 +++--- Documentation/media/v4l-drivers/bttv.rst | 2 +- Documentation/power/interface.txt | 3 ++- 9 files changed, 19 insertions(+), 14 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-devices b/Documentation/ABI/stable/sysfs-devices index 35c457f8ce73..4404bd9b96c1 100644 --- a/Documentation/ABI/stable/sysfs-devices +++ b/Documentation/ABI/stable/sysfs-devices @@ -1,5 +1,5 @@ # Note: This documents additional properties of any device beyond what -# is documented in Documentation/sysfs-rules.txt +# is documented in Documentation/admin-guide/sysfs-rules.rst What: /sys/devices/*/of_node Date: February 2015 diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index f3d5817c4ef0..d6d862db3b5d 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -187,7 +187,8 @@ Description: Processor frequency boosting control This switch controls the boost setting for the whole system. Boosting allows the CPU and the firmware to run at a frequency beyound it's nominal limit. - More details can be found in Documentation/cpu-freq/boost.txt + More details can be found in + Documentation/admin-guide/pm/cpufreq.rst What: /sys/devices/system/cpu/cpu#/crash_notes @@ -223,7 +224,8 @@ Description: Parameters for the Intel P-state driver no_turbo: limits the driver to selecting P states below the turbo frequency range. - More details can be found in Documentation/cpu-freq/intel-pstate.txt + More details can be found in + Documentation/admin-guide/pm/intel_pstate.rst What: /sys/devices/system/cpu/cpu*/cache/index*/ Date: July 2014(documented, existed before August 2008) diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 713cab1d5f12..4b2e13e5f020 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -18,7 +18,8 @@ Description: Writing one of the above strings to this file causes the system to transition into the corresponding state, if available. - See Documentation/power/states.txt for more information. + See Documentation/admin-guide/pm/sleep-states.rst for more + information. What: /sys/power/mem_sleep Date: November 2016 @@ -35,7 +36,8 @@ Description: represented by it to be used on subsequent attempts to suspend the system. - See Documentation/power/states.txt for more information. + See Documentation/admin-guide/pm/sleep-states.rst for more + information. What: /sys/power/disk Date: September 2006 diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index b5343c5aa224..63066db39910 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -350,7 +350,7 @@ If something goes wrong help debugging the problem. The text above the dump is also important: it tells something about why the kernel dumped code (in the above example, it's due to a bad kernel pointer). 
More information - on making sense of the dump is in Documentation/admin-guide/oops-tracing.rst + on making sense of the dump is in Documentation/admin-guide/bug-hunting.rst - If you compiled the kernel with CONFIG_KALLSYMS you can send the dump as is, otherwise you will have to use the ``ksymoops`` program to make diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3427399fd07..957b931cc84c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3134,7 +3134,7 @@ plip= [PPT,NET] Parallel port network link Format: { parport | timid | 0 } - See also Documentation/parport.txt. + See also Documentation/admin-guide/parport.rst. pmtmr= [X86] Manual setup of pmtmr I/O Port. Override pmtimer IOPort with a hex value. diff --git a/Documentation/admin-guide/reporting-bugs.rst b/Documentation/admin-guide/reporting-bugs.rst index 26b60b419652..4650edb8840a 100644 --- a/Documentation/admin-guide/reporting-bugs.rst +++ b/Documentation/admin-guide/reporting-bugs.rst @@ -94,7 +94,7 @@ step-by-step instructions for how a user can trigger the bug. If the failure includes an "OOPS:", take a picture of the screen, capture a netconsole trace, or type the message from your screen into the bug -report. Please read "Documentation/admin-guide/oops-tracing.rst" before posting your +report. Please read "Documentation/admin-guide/bug-hunting.rst" before posting your bug report. This explains what you should do with the "Oops" information to make it useful to the recipient. @@ -120,7 +120,7 @@ summary from [1.]>" for easy identification by the developers:: [4.2.] Kernel .config file: [5.] Most recent kernel version which did not have the bug: [6.] Output of Oops.. message (if applicable) with symbolic information - resolved (see Documentation/admin-guide/oops-tracing.rst) + resolved (see Documentation/admin-guide/bug-hunting.rst) [7.] A small shell script or example program which triggers the problem (if possible) [8.] Environment diff --git a/Documentation/laptops/laptop-mode.txt b/Documentation/laptops/laptop-mode.txt index 19276f5d195c..1c707fc9b141 100644 --- a/Documentation/laptops/laptop-mode.txt +++ b/Documentation/laptops/laptop-mode.txt @@ -184,7 +184,7 @@ is done when dirty_ratio is reached. DO_CPU: Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup. -See Documentation/cpu-freq/user-guide.txt for more info. Disabled by default.) +See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.) CPU_MAXFREQ: @@ -287,7 +287,7 @@ MINIMUM_BATTERY_MINUTES=10 # Should the maximum CPU frequency be adjusted down while on battery? # Requires CPUFreq to be setup. -# See Documentation/cpu-freq/user-guide.txt for more info +# See Documentation/admin-guide/pm/cpufreq.rst for more info #DO_CPU=0 # When on battery what is the maximum CPU speed that the system should @@ -378,7 +378,7 @@ BATT_HD=${BATT_HD:-'4'} DIRTY_RATIO=${DIRTY_RATIO:-'40'} # cpu frequency scaling -# See Documentation/cpu-freq/user-guide.txt for more info +# See Documentation/admin-guide/pm/cpufreq.rst for more info DO_CPU=${CPU_MANAGE:-'0'} CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'} diff --git a/Documentation/media/v4l-drivers/bttv.rst b/Documentation/media/v4l-drivers/bttv.rst index 195ccaac2816..5f35e2fb5afa 100644 --- a/Documentation/media/v4l-drivers/bttv.rst +++ b/Documentation/media/v4l-drivers/bttv.rst @@ -307,7 +307,7 @@ console and let some terminal application log the messages. 
/me uses screen. See Documentation/admin-guide/serial-console.rst for details on setting up a serial console. -Read Documentation/admin-guide/oops-tracing.rst to learn how to get any useful +Read Documentation/admin-guide/bug-hunting.rst to learn how to get any useful information out of a register+stack dump printed by the kernel on protection faults (so-called "kernel oops"). diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt index 974916ff6608..7dc75f48e8bd 100644 --- a/Documentation/power/interface.txt +++ b/Documentation/power/interface.txt @@ -24,7 +24,8 @@ platform. If one of the strings listed in /sys/power/state is written to it, the system will attempt to transition into the corresponding sleep state. Refer to -Documentation/power/states.txt for a description of each of those states. +Documentation/admin-guide/pm/sleep-states.rst for a description of each of +those states. /sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk). Specifically, it tells the kernel what to do after creating a hibernation image. From 1752118d48e7627756acf7fb8c13541305078fed Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 10 Oct 2017 12:36:23 -0500 Subject: [PATCH 0352/1085] Documentation: fix input related doc refs Make `input` document refs valid including: - joystick - joystick-parport Signed-off-by: Tom Saeger Reviewed-by: Takashi Iwai Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 10 +++++----- Documentation/hid/hiddev.txt | 2 +- Documentation/input/devices/xpad.rst | 3 ++- Documentation/sound/cards/joystick.rst | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 957b931cc84c..f478f2402af1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -314,7 +314,7 @@ amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , - See also Documentation/input/joystick.txt + See also Documentation/input/joydev/joystick.rst analog.map= [HW,JOY] Analog joystick and gamepad support Specifies type or capabilities of an analog joystick @@ -724,7 +724,7 @@ db9.dev[2|3]= [HW,JOY] Multisystem joystick support via parallel port (one device per port) Format: , - See also Documentation/input/joystick-parport.txt + See also Documentation/input/devices/joystick-parport.rst ddebug_query= [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot time. See @@ -1220,7 +1220,7 @@ [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) Format: ,,,,, - See also Documentation/input/joystick-parport.txt + See also Documentation/input/devices/joystick-parport.rst gamma= [HW,DRM] @@ -1766,7 +1766,7 @@ ivrs_acpihid[00:14.5]=AMD0020:0 js= [HW,JOY] Analog joystick - See Documentation/input/joystick.txt. + See Documentation/input/joydev/joystick.rst. 
nokaslr [KNL] When CONFIG_RANDOMIZE_BASE is set, this disables @@ -4205,7 +4205,7 @@ TurboGraFX parallel port interface Format: ,,,,,,, - See also Documentation/input/joystick-parport.txt + See also Documentation/input/devices/joystick-parport.rst udbg-immortal [PPC] When debugging early kernel crashes that happen after console_init() and before a proper diff --git a/Documentation/hid/hiddev.txt b/Documentation/hid/hiddev.txt index 6e8c9f1d2f22..638448707aa2 100644 --- a/Documentation/hid/hiddev.txt +++ b/Documentation/hid/hiddev.txt @@ -12,7 +12,7 @@ To support these disparate requirements, the Linux USB system provides HID events to two separate interfaces: * the input subsystem, which converts HID events into normal input device interfaces (such as keyboard, mouse and joystick) and a -normalised event interface - see Documentation/input/input.txt +normalised event interface - see Documentation/input/input.rst * the hiddev interface, which provides fairly raw HID events The data flow for a HID event produced by a device is something like diff --git a/Documentation/input/devices/xpad.rst b/Documentation/input/devices/xpad.rst index 5a709ab77c8d..b8bd65962dd8 100644 --- a/Documentation/input/devices/xpad.rst +++ b/Documentation/input/devices/xpad.rst @@ -230,4 +230,5 @@ Historic Edits 2005-03-19 - Dominic Cerquetti - added stuff for dance pads, new d-pad->axes mappings -Later changes may be viewed with 'git log Documentation/input/xpad.txt' +Later changes may be viewed with +'git log --follow Documentation/input/devices/xpad.rst' diff --git a/Documentation/sound/cards/joystick.rst b/Documentation/sound/cards/joystick.rst index a6e468c81d02..488946fc1079 100644 --- a/Documentation/sound/cards/joystick.rst +++ b/Documentation/sound/cards/joystick.rst @@ -11,7 +11,7 @@ General First of all, you need to enable GAMEPORT support on Linux kernel for using a joystick with the ALSA driver. For the details of gameport -support, refer to Documentation/input/joystick.txt. +support, refer to Documentation/input/joydev/joystick.rst. The joystick support of ALSA drivers is different between ISA and PCI cards. In the case of ISA (PnP) cards, it's usually handled by the From c7f66400f504fd54bda6ec644853c07333e8cb87 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 10 Oct 2017 12:36:30 -0500 Subject: [PATCH 0353/1085] Documentation: fix security related doc refs Make security document refs valid. Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/ABI/testing/evm | 4 ++-- Documentation/security/LSM.rst | 2 +- Documentation/security/credentials.rst | 2 +- Documentation/security/keys/request-key.rst | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/evm b/Documentation/ABI/testing/evm index 8374d4557e5d..ca622c9aa24c 100644 --- a/Documentation/ABI/testing/evm +++ b/Documentation/ABI/testing/evm @@ -18,6 +18,6 @@ Description: in the initramfs, which has already been measured as part of the trusted boot. For more information on creating and loading existing trusted/encrypted keys, refer to: - Documentation/keys-trusted-encrypted.txt. (A sample dracut - patch, which loads the trusted/encrypted key and enables + Documentation/security/keys/trusted-encrypted.rst. (A sample + dracut patch, which loads the trusted/encrypted key and enables EVM, is available from http://linux-ima.sourceforge.net/#EVM.) 
diff --git a/Documentation/security/LSM.rst b/Documentation/security/LSM.rst index d75778b0fa10..98522e0e1ee2 100644 --- a/Documentation/security/LSM.rst +++ b/Documentation/security/LSM.rst @@ -5,7 +5,7 @@ Linux Security Module Development Based on https://lkml.org/lkml/2007/10/26/215, a new LSM is accepted into the kernel when its intent (a description of what it tries to protect against and in what cases one would expect to -use it) has been appropriately documented in ``Documentation/security/LSM``. +use it) has been appropriately documented in ``Documentation/security/LSM.rst``. This allows an LSM's code to be easily compared to its goals, and so that end users and distros can make a more informed decision about which LSMs suit their requirements. diff --git a/Documentation/security/credentials.rst b/Documentation/security/credentials.rst index 038a7e19eff9..66a2e24939d8 100644 --- a/Documentation/security/credentials.rst +++ b/Documentation/security/credentials.rst @@ -196,7 +196,7 @@ The Linux kernel supports the following types of credentials: When a process accesses a key, if not already present, it will normally be cached on one of these keyrings for future accesses to find. - For more information on using keys, see Documentation/security/keys.txt. + For more information on using keys, see ``Documentation/security/keys/*``. 5. LSM diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst index b2d16abaa9e9..21e27238cec6 100644 --- a/Documentation/security/keys/request-key.rst +++ b/Documentation/security/keys/request-key.rst @@ -3,7 +3,7 @@ Key Request Service =================== The key request service is part of the key retention service (refer to -Documentation/security/core.rst). This document explains more fully how +Documentation/security/keys/core.rst). This document explains more fully how the requesting algorithm works. The process starts by either the kernel requesting a service by calling From a405ed85ca170f5a6cc512a3ff492f77b5e63f15 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 10 Oct 2017 12:36:37 -0500 Subject: [PATCH 0354/1085] Documentation: fix media related doc refs Make media doc refs valid. Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- Documentation/media/dvb-drivers/bt8xx.rst | 8 ++++---- Documentation/media/uapi/v4l/dev-sliced-vbi.rst | 2 +- Documentation/media/uapi/v4l/extended-controls.rst | 2 +- Documentation/media/uapi/v4l/pixfmt-reserved.rst | 2 +- Documentation/media/v4l-drivers/max2175.rst | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f478f2402af1..97a76eea7389 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -439,7 +439,7 @@ bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards) bttv.radio= Most important insmod options are available as kernel args too. - bttv.pll= See Documentation/video4linux/bttv/Insmod-options + bttv.pll= See Documentation/media/v4l-drivers/bttv.rst bttv.tuner= bulk_remove=off [PPC] This parameter disables the use of the pSeries @@ -2251,7 +2251,7 @@ See Documentation/admin-guide/pm/sleep-states.rst. meye.*= [HW] Set MotionEye Camera parameters - See Documentation/video4linux/meye.txt. + See Documentation/media/v4l-drivers/meye.rst. 
mfgpt_irq= [IA-32] Specify the IRQ to use for the Multi-Function General Purpose Timers on AMD Geode diff --git a/Documentation/media/dvb-drivers/bt8xx.rst b/Documentation/media/dvb-drivers/bt8xx.rst index b43958b7340c..e3e387bdf498 100644 --- a/Documentation/media/dvb-drivers/bt8xx.rst +++ b/Documentation/media/dvb-drivers/bt8xx.rst @@ -18,7 +18,7 @@ General information This class of cards has a bt878a as the PCI interface, and require the bttv driver for accessing the i2c bus and the gpio pins of the bt8xx chipset. -Please see Documentation/dvb/cards.txt => o Cards based on the Conexant Bt8xx PCI bridge: +Please see Documentation/media/dvb-drivers/cards.rst => o Cards based on the Conexant Bt8xx PCI bridge: Compiling kernel please enable: @@ -45,7 +45,7 @@ Loading Modules Regular case: If the bttv driver detects a bt8xx-based DVB card, all frontend and backend modules will be loaded automatically. Exceptions are: - Old TwinHan DST cards or clones with or without CA slot and not containing an Eeprom. -People running udev please see Documentation/dvb/udev.txt. +People running udev please see Documentation/media/dvb-drivers/udev.rst. In the following cases overriding the PCI type detection for dvb-bt8xx might be necessary: @@ -72,7 +72,7 @@ Useful parameters for verbosity level and debugging the dst module: The autodetected values are determined by the cards' "response string". In your logs see f. ex.: dst_get_device_id: Recognize [DSTMCI]. For bug reports please send in a complete log with verbose=4 activated. -Please also see Documentation/dvb/ci.txt. +Please also see Documentation/media/dvb-drivers/ci.rst. Running multiple cards ~~~~~~~~~~~~~~~~~~~~~~ @@ -100,7 +100,7 @@ Examples of card ID's: $ modprobe bttv card=113 card=135 -For a full list of card ID's please see Documentation/video4linux/CARDLIST.bttv. +For a full list of card ID's please see Documentation/media/v4l-drivers/bttv-cardlist.rst. In case of further problems please subscribe and send questions to the mailing list: linux-dvb@linuxtv.org. Probing the cards with broken PCI subsystem ID diff --git a/Documentation/media/uapi/v4l/dev-sliced-vbi.rst b/Documentation/media/uapi/v4l/dev-sliced-vbi.rst index 9d6c860271cb..d311a6866b3b 100644 --- a/Documentation/media/uapi/v4l/dev-sliced-vbi.rst +++ b/Documentation/media/uapi/v4l/dev-sliced-vbi.rst @@ -431,7 +431,7 @@ MPEG stream. *Historical context*: This format specification originates from a custom, embedded, sliced VBI data format used by the ``ivtv`` driver. This format has already been informally specified in the kernel sources -in the file ``Documentation/video4linux/cx2341x/README.vbi`` . The +in the file ``Documentation/media/v4l-drivers/cx2341x.rst`` . 
The maximum size of the payload and other aspects of this format are driven by the CX23415 MPEG decoder's capabilities and limitations with respect to extracting, decoding, and displaying sliced VBI data embedded within diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst index a3e81c1d276b..dfe49ae57e78 100644 --- a/Documentation/media/uapi/v4l/extended-controls.rst +++ b/Documentation/media/uapi/v4l/extended-controls.rst @@ -284,7 +284,7 @@ enum v4l2_mpeg_stream_vbi_fmt - * - ``V4L2_MPEG_STREAM_VBI_FMT_IVTV`` - VBI in private packets, IVTV format (documented in the kernel sources in the file - ``Documentation/video4linux/cx2341x/README.vbi``) + ``Documentation/media/v4l-drivers/cx2341x.rst``) diff --git a/Documentation/media/uapi/v4l/pixfmt-reserved.rst b/Documentation/media/uapi/v4l/pixfmt-reserved.rst index 521adb795535..38af1472a4b4 100644 --- a/Documentation/media/uapi/v4l/pixfmt-reserved.rst +++ b/Documentation/media/uapi/v4l/pixfmt-reserved.rst @@ -52,7 +52,7 @@ please make a proposal on the linux-media mailing list. `http://www.ivtvdriver.org/ `__ The format is documented in the kernel sources in the file - ``Documentation/video4linux/cx2341x/README.hm12`` + ``Documentation/media/v4l-drivers/cx2341x.rst`` * .. _V4L2-PIX-FMT-CPIA1: - ``V4L2_PIX_FMT_CPIA1`` diff --git a/Documentation/media/v4l-drivers/max2175.rst b/Documentation/media/v4l-drivers/max2175.rst index 04478c25d57a..b1a4c89fd869 100644 --- a/Documentation/media/v4l-drivers/max2175.rst +++ b/Documentation/media/v4l-drivers/max2175.rst @@ -7,7 +7,7 @@ The MAX2175 driver implements the following driver-specific controls: ------------------------------- Enable/Disable I2S output of the tuner. This is a private control that can be accessed only using the subdev interface. - Refer to Documentation/media/kapi/v4l2-controls for more details. + Refer to Documentation/media/kapi/v4l2-controls.rst for more details. .. flat-table:: :header-rows: 0 From f495ae3c01dfa80e368a9330831023121b61006e Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 10 Oct 2017 12:36:45 -0500 Subject: [PATCH 0355/1085] Documentation: fix sound related doc refs Make sound doc refs valid. Signed-off-by: Tom Saeger Reviewed-by: Takashi Iwai Signed-off-by: Jonathan Corbet --- Documentation/sound/hd-audio/notes.rst | 2 +- Documentation/sound/kernel-api/writing-an-alsa-driver.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/sound/hd-audio/notes.rst b/Documentation/sound/hd-audio/notes.rst index f59c3cdbfaf4..9f7347830ba4 100644 --- a/Documentation/sound/hd-audio/notes.rst +++ b/Documentation/sound/hd-audio/notes.rst @@ -192,7 +192,7 @@ preset model instead of PCI (and codec-) SSID look-up. What ``model`` option values are available depends on the codec chip. Check your codec chip from the codec proc file (see "Codec Proc-File" section below). It will show the vendor/product name of your codec -chip. Then, see Documentation/sound/HD-Audio-Models.rst file, +chip. Then, see Documentation/sound/hd-audio/models.rst file, the section of HD-audio driver. You can find a list of codecs and ``model`` options belonging to each codec. 
For example, for Realtek ALC262 codec chip, pass ``model=ultra`` for devices that are compatible diff --git a/Documentation/sound/kernel-api/writing-an-alsa-driver.rst b/Documentation/sound/kernel-api/writing-an-alsa-driver.rst index 58ffa3f5bda7..a0b268466cb1 100644 --- a/Documentation/sound/kernel-api/writing-an-alsa-driver.rst +++ b/Documentation/sound/kernel-api/writing-an-alsa-driver.rst @@ -2498,7 +2498,7 @@ Mic boost Mic-boost switch is set as “Mic Boost” or “Mic Boost (6dB)”. More precise information can be found in -``Documentation/sound/alsa/ControlNames.txt``. +``Documentation/sound/designs/control-names.rst``. Access Flags ------------ From 4269a691108aaaa6b107bba5fbde7d59f991b73e Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 10 Oct 2017 12:36:52 -0500 Subject: [PATCH 0356/1085] Documentation: fix usb related doc refs Update ref to usb proc_usb_info.txt. Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/driver-api/usb/usb.rst | 4 +--- Documentation/networking/cdc_mbim.txt | 4 ++-- Documentation/usb/gadget-testing.txt | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Documentation/driver-api/usb/usb.rst b/Documentation/driver-api/usb/usb.rst index dba0f876b36f..078e981e2b16 100644 --- a/Documentation/driver-api/usb/usb.rst +++ b/Documentation/driver-api/usb/usb.rst @@ -690,9 +690,7 @@ The USB devices are now exported via debugfs: This file is handy for status viewing tools in user mode, which can scan the text format and ignore most of it. More detailed device status (including class and vendor status) is available from device-specific -files. For information about the current format of this file, see the -``Documentation/usb/proc_usb_info.txt`` file in your Linux kernel -sources. +files. For information about the current format of this file, see below. This file, in combination with the poll() system call, can also be used to detect when devices are added or removed:: diff --git a/Documentation/networking/cdc_mbim.txt b/Documentation/networking/cdc_mbim.txt index e4c376abbdad..4e68f0bc5dba 100644 --- a/Documentation/networking/cdc_mbim.txt +++ b/Documentation/networking/cdc_mbim.txt @@ -332,8 +332,8 @@ References [5] "MBIM (Mobile Broadband Interface Model) Registry" - http://compliance.usb.org/mbim/ -[6] "/dev/bus/usb filesystem output" - - Documentation/usb/proc_usb_info.txt +[6] "/sys/kernel/debug/usb/devices output format" + - Documentation/driver-api/usb/usb.rst [7] "/sys/bus/usb/devices/.../descriptors" - Documentation/ABI/stable/sysfs-bus-usb diff --git a/Documentation/usb/gadget-testing.txt b/Documentation/usb/gadget-testing.txt index fbc397d17e98..441a4b9b666f 100644 --- a/Documentation/usb/gadget-testing.txt +++ b/Documentation/usb/gadget-testing.txt @@ -773,7 +773,7 @@ host: # cat /dev/usb/lp0 More advanced testing can be done with the prn_example -described in Documentation/usb/gadget-printer.txt. +described in Documentation/usb/gadget_printer.txt. 20. UAC1 function (virtual ALSA card, using u_audio API) From f2b41874240411a7d06aac2dc864a389c93183cc Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Tue, 10 Oct 2017 12:37:01 -0500 Subject: [PATCH 0357/1085] Documentation: fix networking related doc refs. 
Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/networking/checksum-offloads.txt | 2 +- Documentation/networking/packet_mmap.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/checksum-offloads.txt b/Documentation/networking/checksum-offloads.txt index d52d191bbb0c..27bc09cfcf6d 100644 --- a/Documentation/networking/checksum-offloads.txt +++ b/Documentation/networking/checksum-offloads.txt @@ -47,7 +47,7 @@ The requirements for GSO are more complicated, because when segmenting an (section 'E') for more details. A driver declares its offload capabilities in netdev->hw_features; see - Documentation/networking/netdev-features for more. Note that a device + Documentation/networking/netdev-features.txt for more. Note that a device which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start and csum_offset given in the SKB; if it tries to deduce these itself in hardware (as some NICs do) the driver should check that the values in the diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index f3b9e507ab05..bf654845556e 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -1055,7 +1055,7 @@ TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec members do not contain a valid value. For TX_RINGs, by default no timestamp is generated! -See include/linux/net_tstamp.h and Documentation/networking/timestamping +See include/linux/net_tstamp.h and Documentation/networking/timestamping.txt for more information on hardware timestamps. ------------------------------------------------------------------------------- From 1ad6e3b2652b310e4ab1a544894c79d4f7cb39d3 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Thu, 12 Oct 2017 15:00:06 +1100 Subject: [PATCH 0358/1085] documentation: Update ide-cd documentation to reflect CONFIG_BLK_DEV_HD_IDE removal CONFIG_BLK_DEV_HD_IDE was removed in commit 80aa31cb460d ("ide: remove CONFIG_BLK_DEV_HD_IDE config option (take 2)") but the ide-cd documentation was not updated and still asks users to disable it, which is misleading and involves a fruitless search. Signed-off-by: Finn Thain Signed-off-by: Jonathan Corbet --- Documentation/cdrom/ide-cd | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd index f4dc9de2694e..a5f2a7f1ff46 100644 --- a/Documentation/cdrom/ide-cd +++ b/Documentation/cdrom/ide-cd @@ -55,13 +55,9 @@ This driver provides the following features: (to compile support as a module which can be loaded and unloaded) to the options: - Enhanced IDE/MFM/RLL disk/cdrom/tape/floppy support + ATA/ATAPI/MFM/RLL support Include IDE/ATAPI CDROM support - and `no' to - - Use old disk-only driver on primary interface - Depending on what type of IDE interface you have, you may need to specify additional configuration options. See Documentation/ide/ide.txt. From 127a1bea40f7f2a36bc7207ea4d51bb6b4e936fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:06:19 -0400 Subject: [PATCH 0359/1085] x86/fpu/debug: Remove unused 'x86_fpu_state' and 'x86_fpu_deactivate_state' tracepoints Commit: d1898b733619 ("x86/fpu: Add tracepoints to dump FPU state at key points") ... added the 'x86_fpu_state' and 'x86_fpu_deactivate_state' trace points, but never used them. Today they are still not used. As they take up and waste memory, remove them. 
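A tracepoint defined with DEFINE_EVENT() only earns its keep once some code calls the trace_<name>() helper it generates, and for these two events no such caller was ever merged. A hypothetical call site (sketch only; the hook function is made up) would have looked like:

	#include <asm/trace/fpu.h>

	/*
	 * Hypothetical user of the x86_fpu_state event -- nothing
	 * like this ever existed in-tree, which is the point.
	 */
	static void fpu_debug_hook(struct fpu *fpu)
	{
		trace_x86_fpu_state(fpu);
	}

Absent any such call site, the events contribute nothing but static data.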
Signed-off-by: Steven Rostedt (VMware) Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171012180619.670b68b6@gandalf.local.home Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/fpu.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 39f7a27bef13..6b086c39e6dc 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -33,11 +33,6 @@ DECLARE_EVENT_CLASS(x86_fpu, ) ); -DEFINE_EVENT(x86_fpu, x86_fpu_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) @@ -73,11 +68,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) From 1b0c22e45508ffbd645430e049c52c46bdaad3b4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 14:12:45 +0200 Subject: [PATCH 0360/1085] regmap: avoid -Wint-in-bool-context warning When we pass the result of a multiplication as the timeout or the delay, we can get a warning from gcc-7: drivers/mmc/host/bcm2835.c:596:149: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/mfd/arizona-core.c:247:195: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c:49:27: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] The warning is a bit questionable inside of a macro, but this is intentional on the side of the gcc developers. It is also an indication of another problem: we evaluate the timeout and sleep arguments multiple times, which can have undesired side-effects when those are complex expressions. This changes the two regmap variants to use local variables for storing copies of the timeouts. This adds some more type safety, and avoids both the double-evaluation and the gcc warning. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81484 Link: http://lkml.kernel.org/r/20170726133756.2161367-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Signed-off-by: Mark Brown --- include/linux/regmap.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 1e369f0e921d..edad98890b9b 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -120,22 +120,24 @@ struct reg_sequence { */ #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t __timeout = ktime_add_us(ktime_get(), timeout_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __sleep_us = (sleep_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ int __ret; \ - might_sleep_if(sleep_us); \ + might_sleep_if(__sleep_us); \ for (;;) { \ __ret = regmap_read((map), (addr), &(val)); \ if (__ret) \ break; \ if (cond) \ break; \ - if ((timeout_us) && \ + if ((__timeout_us) && \ ktime_compare(ktime_get(), __timeout) > 0) { \ __ret = regmap_read((map), (addr), &(val)); \ break; \ } \ - if (sleep_us) \ - usleep_range(((sleep_us) >> 2) + 1, sleep_us); \ + if (__sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ } \ __ret ?: ((cond) ? 
0 : -ETIMEDOUT); \ }) @@ -160,21 +162,23 @@ struct reg_sequence { */ #define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __sleep_us = (sleep_us); \ + ktime_t timeout = ktime_add_us(ktime_get(), __timeout_us); \ int pollret; \ - might_sleep_if(sleep_us); \ + might_sleep_if(__sleep_us); \ for (;;) { \ pollret = regmap_field_read((field), &(val)); \ if (pollret) \ break; \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + if (__timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ pollret = regmap_field_read((field), &(val)); \ break; \ } \ - if (sleep_us) \ - usleep_range((sleep_us >> 2) + 1, sleep_us); \ + if (__sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ } \ pollret ?: ((cond) ? 0 : -ETIMEDOUT); \ }) From 8955b26d227263a0938daae77f981da927e19513 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 14 Oct 2017 05:10:40 +0800 Subject: [PATCH 0361/1085] spi: sprd-adi: fix platform_no_drv_owner.cocci warnings drivers/spi/spi-sprd-adi.c:409:3-8: No need to set .owner here. The core will do it. Remove .owner field if calls are used which set it automatically Generated by: scripts/coccinelle/api/platform_no_drv_owner.cocci Fixes: 7e2903cb91df ("spi: Add ADI driver for Spreadtrum platform") Signed-off-by: Fengguang Wu Signed-off-by: Mark Brown --- drivers/spi/spi-sprd-adi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-sprd-adi.c b/drivers/spi/spi-sprd-adi.c index 1324463244d3..6a5ff3003044 100644 --- a/drivers/spi/spi-sprd-adi.c +++ b/drivers/spi/spi-sprd-adi.c @@ -406,7 +406,6 @@ MODULE_DEVICE_TABLE(of, sprd_adi_of_match); static struct platform_driver sprd_adi_driver = { .driver = { .name = "sprd-adi", - .owner = THIS_MODULE, .of_match_table = sprd_adi_of_match, }, .probe = sprd_adi_probe, From 11af847446ed0d131cf24d16a7ef3d5ea7a49554 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 13 Oct 2017 15:02:00 -0500 Subject: [PATCH 0362/1085] x86/unwind: Rename unwinder config options to 'CONFIG_UNWINDER_*' Rename the unwinder config options from: CONFIG_ORC_UNWINDER CONFIG_FRAME_POINTER_UNWINDER CONFIG_GUESS_UNWINDER to: CONFIG_UNWINDER_ORC CONFIG_UNWINDER_FRAME_POINTER CONFIG_UNWINDER_GUESS ... in order to give them a more logical config namespace. 
Suggested-by: Ingo Molnar Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/73972fc7e2762e91912c6b9584582703d6f1b8cc.1507924831.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- Documentation/x86/orc-unwinder.txt | 2 +- Makefile | 4 ++-- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 10 +++++----- arch/x86/configs/tiny.config | 4 ++-- arch/x86/configs/x86_64_defconfig | 2 +- arch/x86/include/asm/module.h | 2 +- arch/x86/include/asm/unwind.h | 8 ++++---- arch/x86/kernel/Makefile | 6 +++--- include/asm-generic/vmlinux.lds.h | 2 +- lib/Kconfig.debug | 2 +- scripts/Makefile.build | 2 +- 12 files changed, 23 insertions(+), 23 deletions(-) diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt index af0c9a4c65a6..cd4b29be29af 100644 --- a/Documentation/x86/orc-unwinder.txt +++ b/Documentation/x86/orc-unwinder.txt @@ -4,7 +4,7 @@ ORC unwinder Overview -------- -The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is +The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is similar in concept to a DWARF unwinder. The difference is that the format of the ORC data is much simpler than DWARF, which in turn allows the ORC unwinder to be much simpler and faster. diff --git a/Makefile b/Makefile index bc5c79e8e3cf..c0f723f81c06 100644 --- a/Makefile +++ b/Makefile @@ -933,8 +933,8 @@ ifdef CONFIG_STACK_VALIDATION ifeq ($(has_libelf),1) objtool_target := tools/objtool FORCE else - ifdef CONFIG_ORC_UNWINDER - $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + ifdef CONFIG_UNWINDER_ORC + $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") else $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 971feac13506..6b94ca0aa585 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -170,7 +170,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 71a48a30fc84..f274dbb87c26 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -358,13 +358,13 @@ config PUNIT_ATOM_DEBUG choice prompt "Choose kernel unwinder" - default FRAME_POINTER_UNWINDER + default UNWINDER_FRAME_POINTER ---help--- This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc//stack, livepatch, lockdep, and more. -config FRAME_POINTER_UNWINDER +config UNWINDER_FRAME_POINTER bool "Frame pointer unwinder" select FRAME_POINTER ---help--- @@ -379,7 +379,7 @@ config FRAME_POINTER_UNWINDER consistency model, as this is currently the only way to get a reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). 
-config ORC_UNWINDER +config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 select STACK_VALIDATION @@ -395,7 +395,7 @@ config ORC_UNWINDER Enabling this option will increase the kernel's runtime memory usage by roughly 2-4MB, depending on your kernel config. -config GUESS_UNWINDER +config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT ---help--- @@ -410,7 +410,7 @@ config GUESS_UNWINDER endchoice config FRAME_POINTER - depends on !ORC_UNWINDER && !GUESS_UNWINDER + depends on !UNWINDER_ORC && !UNWINDER_GUESS bool endmenu diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 550cd5012b73..66c9e2aab16c 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,5 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set -CONFIG_GUESS_UNWINDER=y -# CONFIG_FRAME_POINTER_UNWINDER is not set +CONFIG_UNWINDER_GUESS=y +# CONFIG_UNWINDER_FRAME_POINTER is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index eb65c248708d..e32fc1f274d8 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -299,7 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y -CONFIG_ORC_UNWINDER=y +CONFIG_UNWINDER_ORC=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 9eb7c718aaf8..9f05a1002aa9 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,7 +5,7 @@ #include struct mod_arch_specific { -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e9f793e2df7a..35d67dc7b69f 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,11 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#if defined(CONFIG_ORC_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs; -#elif defined(CONFIG_FRAME_POINTER_UNWINDER) +#elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; @@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, __unwind_start(state, task, regs, first_frame); } -#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -65,7 +65,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) } #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fd0a7895b63f..6209ab6deb50 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -127,9 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o -obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o -obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o 
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o ### # 64 bit specific files diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8acfc1e099e1..63e56f6c1877 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -687,7 +687,7 @@ #define BUG_TABLE #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC #define ORC_UNWIND_TABLE \ . = ALIGN(4); \ .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2689b7c50c52..7566eff22236 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -376,7 +376,7 @@ config STACK_VALIDATION that runtime stack traces are more reliable. This is also a prerequisite for generation of ORC unwind data, which - is needed for CONFIG_ORC_UNWINDER. + is needed for CONFIG_UNWINDER_ORC. For more information, see tools/objtool/Documentation/stack-validation.txt. diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 061d0c3a420a..f965f477832e 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -258,7 +258,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1) __objtool_obj := $(objtree)/tools/objtool/objtool -objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) +objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check) ifndef CONFIG_FRAME_POINTER objtool_args += --no-fp From fc72ae40e30327aa24eb88a24b9c7058f938bd36 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 13 Oct 2017 15:02:01 -0500 Subject: [PATCH 0363/1085] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit The ORC unwinder has been stable in testing so far. Give it much wider testing by making it the default in kconfig for x86_64. It's not yet supported for 32-bit, so leave frame pointers as the default there. Suggested-by: Ingo Molnar Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9b1237bbe7244ed9cdf8db2dcb1253e37e1c341e.1507924831.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index f274dbb87c26..a4ff214fb760 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -358,27 +358,13 @@ config PUNIT_ATOM_DEBUG choice prompt "Choose kernel unwinder" - default UNWINDER_FRAME_POINTER + default UNWINDER_ORC if X86_64 + default UNWINDER_FRAME_POINTER if X86_32 ---help--- This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc//stack, livepatch, lockdep, and more. -config UNWINDER_FRAME_POINTER - bool "Frame pointer unwinder" - select FRAME_POINTER - ---help--- - This option enables the frame pointer unwinder for unwinding kernel - stack traces. - - The unwinder itself is fast and it uses less RAM than the ORC - unwinder, but the kernel text size will grow by ~3% and the kernel's - overall performance will degrade by roughly 5-10%. - - This option is recommended if you want to use the livepatch - consistency model, as this is currently the only way to get a - reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). - config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 @@ -395,6 +381,21 @@ config UNWINDER_ORC Enabling this option will increase the kernel's runtime memory usage by roughly 2-4MB, depending on your kernel config. 
+config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT From 0bb6bba5fb426165aa2720e359107ef7c1d90f70 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Wed, 11 Oct 2017 21:52:46 +0200 Subject: [PATCH 0364/1085] s390/pkey: fix kzalloc-simple.cocci warnings drivers/s390/crypto/pkey_api.c:128:11-18: WARNING: kzalloc should be used for cprbmem, instead of kmalloc/memset Use kzalloc rather than kmalloc followed by memset with 0 Signed-off-by: Vasyl Gomonovych Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/pkey_api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index f61fa47135a6..8dda5bb34a2f 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -125,10 +125,9 @@ static int alloc_and_prep_cprbmem(size_t paramblen, * allocate consecutive memory for request CPRB, request param * block, reply CPRB and reply param block */ - cprbmem = kmalloc(2 * cprbplusparamblen, GFP_KERNEL); + cprbmem = kzalloc(2 * cprbplusparamblen, GFP_KERNEL); if (!cprbmem) return -ENOMEM; - memset(cprbmem, 0, 2 * cprbplusparamblen); preqcblk = (struct CPRBX *) cprbmem; prepcblk = (struct CPRBX *) (cprbmem + cprbplusparamblen); From 496da0d706a952f12b4cbbec4b9f60d3ffdf5356 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 13 Oct 2017 09:06:29 +0200 Subject: [PATCH 0365/1085] s390/debug: adjust coding style The debug feature code hasn't been touched in ages, and it shows. Therefore clean up the code so it looks a bit more like current coding style. There is no functional change - actually I also made sure that the generated code with performance_defconfig is identical. A diff of old vs new with "objdump -d" is empty. The code is still not checkpatch clean, but that was not the goal.
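For concreteness, the edits are purely lexical; one example straight from debug_areas_alloc() in the diff below:

	debug_entry_t*** areas;		/* before: asterisks cuddle the type */
	debug_entry_t ***areas;		/* after: asterisks cuddle the name */

Both lines preprocess to the same token sequence, which is why the "objdump -d" comparison comes out empty.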
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/debug.h | 186 +++---- arch/s390/kernel/debug.c | 882 +++++++++++++++------------------- 2 files changed, 490 insertions(+), 578 deletions(-) diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index df7b54ea956d..4a4c6dd2585a 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -13,71 +13,71 @@ #include #include -#define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */ -#define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */ -#define DEBUG_FLUSH_ALL -1 /* parameter to flush all areas */ -#define DEBUG_MAX_VIEWS 10 /* max number of views in proc fs */ -#define DEBUG_MAX_NAME_LEN 64 /* max length for a debugfs file name */ -#define DEBUG_DEFAULT_LEVEL 3 /* initial debug level */ +#define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */ +#define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */ +#define DEBUG_FLUSH_ALL -1 /* parameter to flush all areas */ +#define DEBUG_MAX_VIEWS 10 /* max number of views in proc fs */ +#define DEBUG_MAX_NAME_LEN 64 /* max length for a debugfs file name */ +#define DEBUG_DEFAULT_LEVEL 3 /* initial debug level */ #define DEBUG_DIR_ROOT "s390dbf" /* name of debug root directory in proc fs */ -#define DEBUG_DATA(entry) (char*)(entry + 1) /* data is stored behind */ - /* the entry information */ +#define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */ + /* the entry information */ typedef struct __debug_entry debug_entry_t; struct debug_view; -typedef struct debug_info { - struct debug_info* next; - struct debug_info* prev; +typedef struct debug_info { + struct debug_info *next; + struct debug_info *prev; refcount_t ref_count; - spinlock_t lock; + spinlock_t lock; int level; int nr_areas; int pages_per_area; int buf_size; - int entry_size; - debug_entry_t*** areas; + int entry_size; + debug_entry_t ***areas; int active_area; int *active_pages; int *active_entries; - struct dentry* debugfs_root_entry; - struct dentry* debugfs_entries[DEBUG_MAX_VIEWS]; - struct debug_view* views[DEBUG_MAX_VIEWS]; + struct dentry *debugfs_root_entry; + struct dentry *debugfs_entries[DEBUG_MAX_VIEWS]; + struct debug_view *views[DEBUG_MAX_VIEWS]; char name[DEBUG_MAX_NAME_LEN]; umode_t mode; } debug_info_t; -typedef int (debug_header_proc_t) (debug_info_t* id, - struct debug_view* view, +typedef int (debug_header_proc_t) (debug_info_t *id, + struct debug_view *view, int area, - debug_entry_t* entry, - char* out_buf); + debug_entry_t *entry, + char *out_buf); -typedef int (debug_format_proc_t) (debug_info_t* id, - struct debug_view* view, char* out_buf, - const char* in_buf); -typedef int (debug_prolog_proc_t) (debug_info_t* id, - struct debug_view* view, - char* out_buf); -typedef int (debug_input_proc_t) (debug_info_t* id, - struct debug_view* view, - struct file* file, +typedef int (debug_format_proc_t) (debug_info_t *id, + struct debug_view *view, char *out_buf, + const char *in_buf); +typedef int (debug_prolog_proc_t) (debug_info_t *id, + struct debug_view *view, + char *out_buf); +typedef int (debug_input_proc_t) (debug_info_t *id, + struct debug_view *view, + struct file *file, const char __user *user_buf, - size_t in_buf_size, loff_t* offset); + size_t in_buf_size, loff_t *offset); + +int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf); -int debug_dflt_header_fn(debug_info_t* id, struct debug_view* view, - int area, 
debug_entry_t* entry, char* out_buf); - struct debug_view { char name[DEBUG_MAX_NAME_LEN]; - debug_prolog_proc_t* prolog_proc; - debug_header_proc_t* header_proc; - debug_format_proc_t* format_proc; - debug_input_proc_t* input_proc; - void* private_data; + debug_prolog_proc_t *prolog_proc; + debug_header_proc_t *header_proc; + debug_format_proc_t *format_proc; + debug_input_proc_t *input_proc; + void *private_data; }; extern struct debug_view debug_hex_ascii_view; @@ -86,65 +86,67 @@ extern struct debug_view debug_sprintf_view; /* do NOT use the _common functions */ -debug_entry_t* debug_event_common(debug_info_t* id, int level, - const void* data, int length); +debug_entry_t *debug_event_common(debug_info_t *id, int level, + const void *data, int length); -debug_entry_t* debug_exception_common(debug_info_t* id, int level, - const void* data, int length); +debug_entry_t *debug_exception_common(debug_info_t *id, int level, + const void *data, int length); /* Debug Feature API: */ debug_info_t *debug_register(const char *name, int pages, int nr_areas, - int buf_size); + int buf_size); debug_info_t *debug_register_mode(const char *name, int pages, int nr_areas, int buf_size, umode_t mode, uid_t uid, gid_t gid); -void debug_unregister(debug_info_t* id); +void debug_unregister(debug_info_t *id); -void debug_set_level(debug_info_t* id, int new_level); +void debug_set_level(debug_info_t *id, int new_level); void debug_set_critical(void); void debug_stop_all(void); -static inline bool debug_level_enabled(debug_info_t* id, int level) +static inline bool debug_level_enabled(debug_info_t *id, int level) { return level <= id->level; } -static inline debug_entry_t* -debug_event(debug_info_t* id, int level, void* data, int length) +static inline debug_entry_t *debug_event(debug_info_t *id, int level, + void *data, int length) { if ((!id) || (level > id->level) || (id->pages_per_area == 0)) return NULL; - return debug_event_common(id,level,data,length); + return debug_event_common(id, level, data, length); } -static inline debug_entry_t* -debug_int_event(debug_info_t* id, int level, unsigned int tag) +static inline debug_entry_t *debug_int_event(debug_info_t *id, int level, + unsigned int tag) { - unsigned int t=tag; + unsigned int t = tag; + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) return NULL; - return debug_event_common(id,level,&t,sizeof(unsigned int)); + return debug_event_common(id, level, &t, sizeof(unsigned int)); } -static inline debug_entry_t * -debug_long_event (debug_info_t* id, int level, unsigned long tag) +static inline debug_entry_t *debug_long_event(debug_info_t *id, int level, + unsigned long tag) { - unsigned long t=tag; + unsigned long t = tag; + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) return NULL; - return debug_event_common(id,level,&t,sizeof(unsigned long)); + return debug_event_common(id, level, &t, sizeof(unsigned long)); } -static inline debug_entry_t* -debug_text_event(debug_info_t* id, int level, const char* txt) +static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, + const char *txt) { if ((!id) || (level > id->level) || (id->pages_per_area == 0)) return NULL; - return debug_event_common(id,level,txt,strlen(txt)); + return debug_event_common(id, level, txt, strlen(txt)); } /* @@ -160,6 +162,7 @@ __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) 
debug_entry_t *__ret; \ debug_info_t *__id = _id; \ int __level = _level; \ + \ if ((!__id) || (__level > __id->level)) \ __ret = NULL; \ else \ @@ -168,38 +171,40 @@ __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) __ret; \ }) -static inline debug_entry_t* -debug_exception(debug_info_t* id, int level, void* data, int length) +static inline debug_entry_t *debug_exception(debug_info_t *id, int level, + void *data, int length) { if ((!id) || (level > id->level) || (id->pages_per_area == 0)) return NULL; - return debug_exception_common(id,level,data,length); + return debug_exception_common(id, level, data, length); } -static inline debug_entry_t* -debug_int_exception(debug_info_t* id, int level, unsigned int tag) +static inline debug_entry_t *debug_int_exception(debug_info_t *id, int level, + unsigned int tag) { - unsigned int t=tag; + unsigned int t = tag; + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) return NULL; - return debug_exception_common(id,level,&t,sizeof(unsigned int)); + return debug_exception_common(id, level, &t, sizeof(unsigned int)); } -static inline debug_entry_t * -debug_long_exception (debug_info_t* id, int level, unsigned long tag) +static inline debug_entry_t *debug_long_exception (debug_info_t *id, int level, + unsigned long tag) { - unsigned long t=tag; + unsigned long t = tag; + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) return NULL; - return debug_exception_common(id,level,&t,sizeof(unsigned long)); + return debug_exception_common(id, level, &t, sizeof(unsigned long)); } -static inline debug_entry_t* -debug_text_exception(debug_info_t* id, int level, const char* txt) +static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, + const char *txt) { if ((!id) || (level > id->level) || (id->pages_per_area == 0)) return NULL; - return debug_exception_common(id,level,txt,strlen(txt)); + return debug_exception_common(id, level, txt, strlen(txt)); } /* @@ -215,6 +220,7 @@ __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) debug_entry_t *__ret; \ debug_info_t *__id = _id; \ int __level = _level; \ + \ if ((!__id) || (__level > __id->level)) \ __ret = NULL; \ else \ @@ -223,13 +229,13 @@ __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) __ret; \ }) -int debug_register_view(debug_info_t* id, struct debug_view* view); -int debug_unregister_view(debug_info_t* id, struct debug_view* view); +int debug_register_view(debug_info_t *id, struct debug_view *view); +int debug_unregister_view(debug_info_t *id, struct debug_view *view); /* define the debug levels: - 0 No debugging output to console or syslog - - 1 Log internal errors to syslog, ignore check conditions + - 1 Log internal errors to syslog, ignore check conditions - 2 Log internal errors and check conditions to syslog - 3 Log internal errors to console, log check conditions to syslog - 4 Log internal errors and check conditions to console @@ -247,17 +253,17 @@ int debug_unregister_view(debug_info_t* id, struct debug_view* view); #define INTERNAL_DEBMSG(x,y...) "D" __FILE__ "%d: " x, __LINE__, y #if DEBUG_LEVEL > 0 -#define PRINT_DEBUG(x...) printk ( KERN_DEBUG PRINTK_HEADER x ) -#define PRINT_INFO(x...) printk ( KERN_INFO PRINTK_HEADER x ) -#define PRINT_WARN(x...) printk ( KERN_WARNING PRINTK_HEADER x ) -#define PRINT_ERR(x...) printk ( KERN_ERR PRINTK_HEADER x ) -#define PRINT_FATAL(x...) panic ( PRINTK_HEADER x ) +#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x) +#define PRINT_INFO(x...) 
printk(KERN_INFO PRINTK_HEADER x) +#define PRINT_WARN(x...) printk(KERN_WARNING PRINTK_HEADER x) +#define PRINT_ERR(x...) printk(KERN_ERR PRINTK_HEADER x) +#define PRINT_FATAL(x...) panic(PRINTK_HEADER x) #else -#define PRINT_DEBUG(x...) printk ( KERN_DEBUG PRINTK_HEADER x ) -#define PRINT_INFO(x...) printk ( KERN_DEBUG PRINTK_HEADER x ) -#define PRINT_WARN(x...) printk ( KERN_DEBUG PRINTK_HEADER x ) -#define PRINT_ERR(x...) printk ( KERN_DEBUG PRINTK_HEADER x ) -#define PRINT_FATAL(x...) printk ( KERN_DEBUG PRINTK_HEADER x ) -#endif /* DASD_DEBUG */ +#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x) +#define PRINT_INFO(x...) printk(KERN_DEBUG PRINTK_HEADER x) +#define PRINT_WARN(x...) printk(KERN_DEBUG PRINTK_HEADER x) +#define PRINT_ERR(x...) printk(KERN_DEBUG PRINTK_HEADER x) +#define PRINT_FATAL(x...) printk(KERN_DEBUG PRINTK_HEADER x) +#endif /* DASD_DEBUG */ -#endif /* DEBUG_H */ +#endif /* DEBUG_H */ diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 1d9e83c401fc..c960797c8a6f 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -4,7 +4,7 @@ * Copyright IBM Corp. 1999, 2012 * * Author(s): Michael Holzheu (holzheu@de.ibm.com), - * Holger Smolinski (Holger.Smolinski@de.ibm.com) + * Holger Smolinski (Holger.Smolinski@de.ibm.com) * * Bugreports to: */ @@ -36,69 +36,67 @@ typedef struct file_private_info { loff_t offset; /* offset of last read in file */ - int act_area; /* number of last formated area */ - int act_page; /* act page in given area */ - int act_entry; /* last formated entry (offset */ - /* relative to beginning of last */ - /* formated page) */ - size_t act_entry_offset; /* up to this offset we copied */ + int act_area; /* number of last formated area */ + int act_page; /* act page in given area */ + int act_entry; /* last formated entry (offset */ + /* relative to beginning of last */ + /* formated page) */ + size_t act_entry_offset; /* up to this offset we copied */ /* in last read the last formated */ /* entry to userland */ char temp_buf[2048]; /* buffer for output */ - debug_info_t *debug_info_org; /* original debug information */ + debug_info_t *debug_info_org; /* original debug information */ debug_info_t *debug_info_snap; /* snapshot of debug information */ struct debug_view *view; /* used view of debug info */ } file_private_info_t; -typedef struct -{ +typedef struct { char *string; - /* - * This assumes that all args are converted into longs - * on L/390 this is the case for all types of parameter - * except of floats, and long long (32 bit) + /* + * This assumes that all args are converted into longs + * on L/390 this is the case for all types of parameter + * except of floats, and long long (32 bit) * */ long args[0]; } debug_sprintf_entry_t; - /* internal function prototyes */ static int debug_init(void); static ssize_t debug_output(struct file *file, char __user *user_buf, - size_t user_len, loff_t * offset); + size_t user_len, loff_t *offset); static ssize_t debug_input(struct file *file, const char __user *user_buf, - size_t user_len, loff_t * offset); + size_t user_len, loff_t *offset); static int debug_open(struct inode *inode, struct file *file); static int debug_close(struct inode *inode, struct file *file); static debug_info_t *debug_info_create(const char *name, int pages_per_area, - int nr_areas, int buf_size, umode_t mode); + int nr_areas, int buf_size, umode_t mode); static void debug_info_get(debug_info_t *); static void debug_info_put(debug_info_t *); -static int debug_prolog_level_fn(debug_info_t 
* id, - struct debug_view *view, char *out_buf); -static int debug_input_level_fn(debug_info_t * id, struct debug_view *view, - struct file *file, const char __user *user_buf, - size_t user_buf_size, loff_t * offset); -static int debug_prolog_pages_fn(debug_info_t * id, - struct debug_view *view, char *out_buf); -static int debug_input_pages_fn(debug_info_t * id, struct debug_view *view, - struct file *file, const char __user *user_buf, - size_t user_buf_size, loff_t * offset); -static int debug_input_flush_fn(debug_info_t * id, struct debug_view *view, - struct file *file, const char __user *user_buf, - size_t user_buf_size, loff_t * offset); -static int debug_hex_ascii_format_fn(debug_info_t * id, struct debug_view *view, - char *out_buf, const char *in_buf); -static int debug_raw_format_fn(debug_info_t * id, - struct debug_view *view, char *out_buf, - const char *in_buf); -static int debug_raw_header_fn(debug_info_t * id, struct debug_view *view, - int area, debug_entry_t * entry, char *out_buf); +static int debug_prolog_level_fn(debug_info_t *id, + struct debug_view *view, char *out_buf); +static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_prolog_pages_fn(debug_info_t *id, + struct debug_view *view, char *out_buf); +static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *in_buf); +static int debug_raw_format_fn(debug_info_t *id, + struct debug_view *view, char *out_buf, + const char *in_buf); +static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf); -static int debug_sprintf_format_fn(debug_info_t * id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event); +static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, debug_sprintf_entry_t *curr_event); /* globals */ @@ -141,19 +139,19 @@ static struct debug_view debug_pages_view = { }; static struct debug_view debug_flush_view = { - "flush", - NULL, - NULL, - NULL, - &debug_input_flush_fn, - NULL + "flush", + NULL, + NULL, + NULL, + &debug_input_flush_fn, + NULL }; struct debug_view debug_sprintf_view = { "sprintf", NULL, &debug_dflt_header_fn, - (debug_format_proc_t*)&debug_sprintf_format_fn, + (debug_format_proc_t *)&debug_sprintf_format_fn, NULL, NULL }; @@ -164,18 +162,18 @@ static unsigned int __used debug_feature_version = __DEBUG_FEATURE_VERSION; /* static globals */ -static debug_info_t *debug_area_first = NULL; -static debug_info_t *debug_area_last = NULL; +static debug_info_t *debug_area_first; +static debug_info_t *debug_area_last; static DEFINE_MUTEX(debug_mutex); static int initialized; static int debug_critical; static const struct file_operations debug_file_ops = { - .owner = THIS_MODULE, - .read = debug_output, - .write = debug_input, - .open = debug_open, + .owner = THIS_MODULE, + .read = debug_output, + .write = debug_input, + .open = debug_open, .release = debug_close, .llseek = no_llseek, }; @@ -190,29 +188,23 @@ static struct dentry *debug_debugfs_root_entry; * 
areas[areanumber][pagenumber][pageoffset] */ -static debug_entry_t*** -debug_areas_alloc(int pages_per_area, int nr_areas) +static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas) { - debug_entry_t*** areas; - int i,j; + debug_entry_t ***areas; + int i, j; - areas = kmalloc(nr_areas * - sizeof(debug_entry_t**), - GFP_KERNEL); + areas = kmalloc(nr_areas * sizeof(debug_entry_t **), GFP_KERNEL); if (!areas) goto fail_malloc_areas; for (i = 0; i < nr_areas; i++) { - areas[i] = kmalloc(pages_per_area * - sizeof(debug_entry_t*),GFP_KERNEL); - if (!areas[i]) { + areas[i] = kmalloc(pages_per_area * sizeof(debug_entry_t *), GFP_KERNEL); + if (!areas[i]) goto fail_malloc_areas2; - } - for(j = 0; j < pages_per_area; j++) { + for (j = 0; j < pages_per_area; j++) { areas[i][j] = kzalloc(PAGE_SIZE, GFP_KERNEL); - if(!areas[i][j]) { - for(j--; j >=0 ; j--) { + if (!areas[i][j]) { + for (j--; j >= 0 ; j--) kfree(areas[i][j]); - } kfree(areas[i]); goto fail_malloc_areas2; } @@ -221,62 +213,55 @@ debug_areas_alloc(int pages_per_area, int nr_areas) return areas; fail_malloc_areas2: - for(i--; i >= 0; i--){ - for(j=0; j < pages_per_area;j++){ + for (i--; i >= 0; i--) { + for (j = 0; j < pages_per_area; j++) kfree(areas[i][j]); - } kfree(areas[i]); } kfree(areas); fail_malloc_areas: return NULL; - } - /* * debug_info_alloc * - alloc new debug-info */ - -static debug_info_t* -debug_info_alloc(const char *name, int pages_per_area, int nr_areas, - int buf_size, int level, int mode) +static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, + int nr_areas, int buf_size, int level, + int mode) { - debug_info_t* rc; + debug_info_t *rc; /* alloc everything */ - rc = kmalloc(sizeof(debug_info_t), GFP_KERNEL); - if(!rc) + if (!rc) goto fail_malloc_rc; rc->active_entries = kcalloc(nr_areas, sizeof(int), GFP_KERNEL); - if(!rc->active_entries) + if (!rc->active_entries) goto fail_malloc_active_entries; rc->active_pages = kcalloc(nr_areas, sizeof(int), GFP_KERNEL); - if(!rc->active_pages) + if (!rc->active_pages) goto fail_malloc_active_pages; - if((mode == ALL_AREAS) && (pages_per_area != 0)){ + if ((mode == ALL_AREAS) && (pages_per_area != 0)) { rc->areas = debug_areas_alloc(pages_per_area, nr_areas); - if(!rc->areas) + if (!rc->areas) goto fail_malloc_areas; } else { rc->areas = NULL; } /* initialize members */ - spin_lock_init(&rc->lock); rc->pages_per_area = pages_per_area; - rc->nr_areas = nr_areas; + rc->nr_areas = nr_areas; rc->active_area = 0; - rc->level = level; - rc->buf_size = buf_size; - rc->entry_size = sizeof(debug_entry_t) + buf_size; + rc->level = level; + rc->buf_size = buf_size; + rc->entry_size = sizeof(debug_entry_t) + buf_size; strlcpy(rc->name, name, sizeof(rc->name)); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); - memset(rc->debugfs_entries, 0 ,DEBUG_MAX_VIEWS * - sizeof(struct dentry*)); + memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); refcount_set(&(rc->ref_count), 0); return rc; @@ -295,18 +280,15 @@ fail_malloc_rc: * debug_areas_free * - free all debug areas */ - -static void -debug_areas_free(debug_info_t* db_info) +static void debug_areas_free(debug_info_t *db_info) { - int i,j; + int i, j; - if(!db_info->areas) + if (!db_info->areas) return; for (i = 0; i < db_info->nr_areas; i++) { - for(j = 0; j < db_info->pages_per_area; j++) { + for (j = 0; j < db_info->pages_per_area; j++) kfree(db_info->areas[i][j]); - } kfree(db_info->areas[i]); } kfree(db_info->areas); @@ -317,9 +299,8 @@ 
debug_areas_free(debug_info_t* db_info) * debug_info_free * - free memory debug-info */ - -static void -debug_info_free(debug_info_t* db_info){ +static void debug_info_free(debug_info_t *db_info) +{ debug_areas_free(db_info); kfree(db_info->active_entries); kfree(db_info->active_pages); @@ -331,35 +312,34 @@ debug_info_free(debug_info_t* db_info){ * - create new debug-info */ -static debug_info_t* -debug_info_create(const char *name, int pages_per_area, int nr_areas, - int buf_size, umode_t mode) +static debug_info_t *debug_info_create(const char *name, int pages_per_area, + int nr_areas, int buf_size, umode_t mode) { - debug_info_t* rc; + debug_info_t *rc; - rc = debug_info_alloc(name, pages_per_area, nr_areas, buf_size, - DEBUG_DEFAULT_LEVEL, ALL_AREAS); - if(!rc) + rc = debug_info_alloc(name, pages_per_area, nr_areas, buf_size, + DEBUG_DEFAULT_LEVEL, ALL_AREAS); + if (!rc) goto out; rc->mode = mode & ~S_IFMT; /* create root directory */ - rc->debugfs_root_entry = debugfs_create_dir(rc->name, - debug_debugfs_root_entry); + rc->debugfs_root_entry = debugfs_create_dir(rc->name, + debug_debugfs_root_entry); /* append new element to linked list */ - if (!debug_area_first) { - /* first element in list */ - debug_area_first = rc; - rc->prev = NULL; - } else { - /* append element to end of list */ - debug_area_last->next = rc; - rc->prev = debug_area_last; - } - debug_area_last = rc; - rc->next = NULL; + if (!debug_area_first) { + /* first element in list */ + debug_area_first = rc; + rc->prev = NULL; + } else { + /* append element to end of list */ + debug_area_last->next = rc; + rc->prev = debug_area_last; + } + debug_area_last = rc; + rc->next = NULL; refcount_set(&rc->ref_count, 1); out: @@ -370,24 +350,22 @@ out: * debug_info_copy * - copy debug-info */ - -static debug_info_t* -debug_info_copy(debug_info_t* in, int mode) +static debug_info_t *debug_info_copy(debug_info_t *in, int mode) { - int i,j; - debug_info_t* rc; - unsigned long flags; + unsigned long flags; + debug_info_t *rc; + int i, j; /* get a consistent copy of the debug areas */ do { rc = debug_info_alloc(in->name, in->pages_per_area, in->nr_areas, in->buf_size, in->level, mode); spin_lock_irqsave(&in->lock, flags); - if(!rc) + if (!rc) goto out; /* has something changed in the meantime ? 
*/ - if((rc->pages_per_area == in->pages_per_area) && - (rc->nr_areas == in->nr_areas)) { + if ((rc->pages_per_area == in->pages_per_area) && + (rc->nr_areas == in->nr_areas)) { break; } spin_unlock_irqrestore(&in->lock, flags); @@ -395,25 +373,22 @@ debug_info_copy(debug_info_t* in, int mode) } while (1); if (mode == NO_AREAS) - goto out; + goto out; - for(i = 0; i < in->nr_areas; i++){ - for(j = 0; j < in->pages_per_area; j++) { - memcpy(rc->areas[i][j], in->areas[i][j],PAGE_SIZE); - } - } + for (i = 0; i < in->nr_areas; i++) { + for (j = 0; j < in->pages_per_area; j++) + memcpy(rc->areas[i][j], in->areas[i][j], PAGE_SIZE); + } out: - spin_unlock_irqrestore(&in->lock, flags); - return rc; + spin_unlock_irqrestore(&in->lock, flags); + return rc; } /* * debug_info_get * - increments reference count for debug-info */ - -static void -debug_info_get(debug_info_t * db_info) +static void debug_info_get(debug_info_t *db_info) { if (db_info) refcount_inc(&db_info->ref_count); @@ -423,9 +398,7 @@ debug_info_get(debug_info_t * db_info) * debug_info_put: * - decreases reference count for debug-info and frees it if necessary */ - -static void -debug_info_put(debug_info_t *db_info) +static void debug_info_put(debug_info_t *db_info) { int i; @@ -438,12 +411,14 @@ debug_info_put(debug_info_t *db_info) debugfs_remove(db_info->debugfs_entries[i]); } debugfs_remove(db_info->debugfs_root_entry); - if(db_info == debug_area_first) + if (db_info == debug_area_first) debug_area_first = db_info->next; - if(db_info == debug_area_last) + if (db_info == debug_area_last) debug_area_last = db_info->prev; - if(db_info->prev) db_info->prev->next = db_info->next; - if(db_info->next) db_info->next->prev = db_info->prev; + if (db_info->prev) + db_info->prev->next = db_info->next; + if (db_info->next) + db_info->next->prev = db_info->prev; debug_info_free(db_info); } } @@ -452,71 +427,68 @@ debug_info_put(debug_info_t *db_info) * debug_format_entry: * - format one debug entry and return size of formated data */ - -static int -debug_format_entry(file_private_info_t *p_info) +static int debug_format_entry(file_private_info_t *p_info) { - debug_info_t *id_snap = p_info->debug_info_snap; + debug_info_t *id_snap = p_info->debug_info_snap; struct debug_view *view = p_info->view; debug_entry_t *act_entry; size_t len = 0; - if(p_info->act_entry == DEBUG_PROLOG_ENTRY){ + + if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { /* print prolog */ - if (view->prolog_proc) - len += view->prolog_proc(id_snap,view,p_info->temp_buf); + if (view->prolog_proc) + len += view->prolog_proc(id_snap, view, p_info->temp_buf); goto out; } if (!id_snap->areas) /* this is true, if we have a prolog only view */ goto out; /* or if 'pages_per_area' is 0 */ - act_entry = (debug_entry_t *) ((char*)id_snap->areas[p_info->act_area] - [p_info->act_page] + p_info->act_entry); - + act_entry = (debug_entry_t *) ((char *)id_snap->areas[p_info->act_area] + [p_info->act_page] + p_info->act_entry); + if (act_entry->id.stck == 0LL) - goto out; /* empty entry */ + goto out; /* empty entry */ if (view->header_proc) len += view->header_proc(id_snap, view, p_info->act_area, - act_entry, p_info->temp_buf + len); + act_entry, p_info->temp_buf + len); if (view->format_proc) len += view->format_proc(id_snap, view, p_info->temp_buf + len, - DEBUG_DATA(act_entry)); + DEBUG_DATA(act_entry)); out: - return len; + return len; } /* * debug_next_entry: * - goto next entry in p_info */ - -static inline int -debug_next_entry(file_private_info_t *p_info) +static inline int 
debug_next_entry(file_private_info_t *p_info) { debug_info_t *id; id = p_info->debug_info_snap; - if(p_info->act_entry == DEBUG_PROLOG_ENTRY){ + if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { p_info->act_entry = 0; p_info->act_page = 0; goto out; } - if(!id->areas) + if (!id->areas) return 1; p_info->act_entry += id->entry_size; /* switch to next page, if we reached the end of the page */ - if (p_info->act_entry > (PAGE_SIZE - id->entry_size)){ + if (p_info->act_entry > (PAGE_SIZE - id->entry_size)) { /* next page */ p_info->act_entry = 0; p_info->act_page += 1; - if((p_info->act_page % id->pages_per_area) == 0) { + if ((p_info->act_page % id->pages_per_area) == 0) { /* next area */ - p_info->act_area++; - p_info->act_page=0; + p_info->act_area++; + p_info->act_page = 0; } - if(p_info->act_area >= id->nr_areas) + if (p_info->act_area >= id->nr_areas) return 1; } out: - return 0; + return 0; } /* @@ -524,26 +496,24 @@ out: * - called for user read() * - copies formated debug entries to the user buffer */ - -static ssize_t -debug_output(struct file *file, /* file descriptor */ - char __user *user_buf, /* user buffer */ - size_t len, /* length of buffer */ - loff_t *offset) /* offset in the file */ +static ssize_t debug_output(struct file *file, /* file descriptor */ + char __user *user_buf, /* user buffer */ + size_t len, /* length of buffer */ + loff_t *offset) /* offset in the file */ { size_t count = 0; size_t entry_offset; file_private_info_t *p_info; - p_info = ((file_private_info_t *) file->private_data); - if (*offset != p_info->offset) + p_info = (file_private_info_t *) file->private_data; + if (*offset != p_info->offset) return -EPIPE; - if(p_info->act_area >= p_info->debug_info_snap->nr_areas) + if (p_info->act_area >= p_info->debug_info_snap->nr_areas) return 0; entry_offset = p_info->act_entry_offset; - while(count < len){ - int formatted_line_size; + while (count < len) { int formatted_line_residue; + int formatted_line_size; int user_buf_residue; size_t copy_size; @@ -551,21 +521,21 @@ debug_output(struct file *file, /* file descriptor */ formatted_line_residue = formatted_line_size - entry_offset; user_buf_residue = len-count; copy_size = min(user_buf_residue, formatted_line_residue); - if(copy_size){ + if (copy_size) { if (copy_to_user(user_buf + count, p_info->temp_buf - + entry_offset, copy_size)) + + entry_offset, copy_size)) return -EFAULT; count += copy_size; entry_offset += copy_size; } - if(copy_size == formatted_line_residue){ + if (copy_size == formatted_line_residue) { entry_offset = 0; - if(debug_next_entry(p_info)) + if (debug_next_entry(p_info)) goto out; } } out: - p_info->offset = *offset + count; + p_info->offset = *offset + count; p_info->act_entry_offset = entry_offset; *offset = p_info->offset; return count; @@ -576,24 +546,23 @@ out: * - called for user write() * - calls input function of view */ - -static ssize_t -debug_input(struct file *file, const char __user *user_buf, size_t length, - loff_t *offset) +static ssize_t debug_input(struct file *file, const char __user *user_buf, + size_t length, loff_t *offset) { - int rc = 0; file_private_info_t *p_info; + int rc = 0; mutex_lock(&debug_mutex); p_info = ((file_private_info_t *) file->private_data); - if (p_info->view->input_proc) + if (p_info->view->input_proc) { rc = p_info->view->input_proc(p_info->debug_info_org, p_info->view, file, user_buf, length, offset); - else + } else { rc = -EPERM; + } mutex_unlock(&debug_mutex); - return rc; /* number of input characters */ + return rc; /* number of input 
characters */ } /* @@ -602,13 +571,11 @@ debug_input(struct file *file, const char __user *user_buf, size_t length, * - copies formated output to private_data area of the file * handle */ - -static int -debug_open(struct inode *inode, struct file *file) +static int debug_open(struct inode *inode, struct file *file) { - int i, rc = 0; - file_private_info_t *p_info; debug_info_t *debug_info, *debug_info_snapshot; + file_private_info_t *p_info; + int i, rc = 0; mutex_lock(&debug_mutex); debug_info = file_inode(file)->i_private; @@ -616,10 +583,8 @@ debug_open(struct inode *inode, struct file *file) for (i = 0; i < DEBUG_MAX_VIEWS; i++) { if (!debug_info->views[i]) continue; - else if (debug_info->debugfs_entries[i] == - file->f_path.dentry) { - goto found; /* found view ! */ - } + else if (debug_info->debugfs_entries[i] == file->f_path.dentry) + goto found; /* found view ! */ } /* no entry found */ rc = -EINVAL; @@ -627,31 +592,28 @@ debug_open(struct inode *inode, struct file *file) found: - /* Make snapshot of current debug areas to get it consistent. */ + /* Make snapshot of current debug areas to get it consistent. */ /* To copy all the areas is only needed, if we have a view which */ /* formats the debug areas. */ - if(!debug_info->views[i]->format_proc && - !debug_info->views[i]->header_proc){ + if (!debug_info->views[i]->format_proc && !debug_info->views[i]->header_proc) debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS); - } else { + else debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS); - } - if(!debug_info_snapshot){ + if (!debug_info_snapshot) { rc = -ENOMEM; goto out; } - p_info = kmalloc(sizeof(file_private_info_t), - GFP_KERNEL); - if(!p_info){ + p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL); + if (!p_info) { debug_info_free(debug_info_snapshot); rc = -ENOMEM; goto out; } p_info->offset = 0; p_info->debug_info_snap = debug_info_snapshot; - p_info->debug_info_org = debug_info; + p_info->debug_info_org = debug_info; p_info->view = debug_info->views[i]; p_info->act_area = 0; p_info->act_page = 0; @@ -670,17 +632,16 @@ out: * - called for user close() * - deletes private_data area of the file handle */ - -static int -debug_close(struct inode *inode, struct file *file) +static int debug_close(struct inode *inode, struct file *file) { file_private_info_t *p_info; + p_info = (file_private_info_t *) file->private_data; - if(p_info->debug_info_snap) + if (p_info->debug_info_snap) debug_info_free(p_info->debug_info_snap); debug_info_put(p_info->debug_info_org); kfree(file->private_data); - return 0; /* success */ + return 0; /* success */ } /* @@ -689,7 +650,6 @@ debug_close(struct inode *inode, struct file *file) * The mode parameter allows to specify access rights for the s390dbf files * - Returns handle for debug area */ - debug_info_t *debug_register_mode(const char *name, int pages_per_area, int nr_areas, int buf_size, umode_t mode, uid_t uid, gid_t gid) @@ -703,18 +663,16 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area, BUG_ON(!initialized); mutex_lock(&debug_mutex); - /* create new debug_info */ - + /* create new debug_info */ rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode); - if(!rc) + if (!rc) goto out; debug_register_view(rc, &debug_level_view); - debug_register_view(rc, &debug_flush_view); + debug_register_view(rc, &debug_flush_view); debug_register_view(rc, &debug_pages_view); out: - if (!rc){ + if (!rc) pr_err("Registering debug feature %s failed\n", name); - } mutex_unlock(&debug_mutex); return 
rc; } @@ -725,7 +683,6 @@ EXPORT_SYMBOL(debug_register_mode); * - creates and initializes debug area for the caller * - returns handle for debug area */ - debug_info_t *debug_register(const char *name, int pages_per_area, int nr_areas, int buf_size) { @@ -738,18 +695,13 @@ EXPORT_SYMBOL(debug_register); * debug_unregister: * - give back debug area */ - -void -debug_unregister(debug_info_t * id) +void debug_unregister(debug_info_t *id) { if (!id) - goto out; + return; mutex_lock(&debug_mutex); debug_info_put(id); mutex_unlock(&debug_mutex); - -out: - return; } EXPORT_SYMBOL(debug_unregister); @@ -757,18 +709,17 @@ EXPORT_SYMBOL(debug_unregister); * debug_set_size: * - set area size (number of pages) and number of areas */ -static int -debug_set_size(debug_info_t* id, int nr_areas, int pages_per_area) +static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area) { + debug_entry_t ***new_areas; unsigned long flags; - debug_entry_t *** new_areas; - int rc=0; + int rc = 0; - if(!id || (nr_areas <= 0) || (pages_per_area < 0)) + if (!id || (nr_areas <= 0) || (pages_per_area < 0)) return -EINVAL; - if(pages_per_area > 0){ + if (pages_per_area > 0) { new_areas = debug_areas_alloc(pages_per_area, nr_areas); - if(!new_areas) { + if (!new_areas) { pr_info("Allocating memory for %i pages failed\n", pages_per_area); rc = -ENOMEM; @@ -777,16 +728,16 @@ debug_set_size(debug_info_t* id, int nr_areas, int pages_per_area) } else { new_areas = NULL; } - spin_lock_irqsave(&id->lock,flags); + spin_lock_irqsave(&id->lock, flags); debug_areas_free(id); id->areas = new_areas; id->nr_areas = nr_areas; id->pages_per_area = pages_per_area; id->active_area = 0; - memset(id->active_entries,0,sizeof(int)*id->nr_areas); + memset(id->active_entries, 0, sizeof(int)*id->nr_areas); memset(id->active_pages, 0, sizeof(int)*id->nr_areas); - spin_unlock_irqrestore(&id->lock,flags); - pr_info("%s: set new size (%i pages)\n" ,id->name, pages_per_area); + spin_unlock_irqrestore(&id->lock, flags); + pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area); out: return rc; } @@ -795,24 +746,23 @@ out: * debug_set_level: * - set actual debug level */ - -void -debug_set_level(debug_info_t* id, int new_level) +void debug_set_level(debug_info_t *id, int new_level) { unsigned long flags; - if(!id) - return; - spin_lock_irqsave(&id->lock,flags); - if(new_level == DEBUG_OFF_LEVEL){ - id->level = DEBUG_OFF_LEVEL; - pr_info("%s: switched off\n",id->name); - } else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) { + + if (!id) + return; + spin_lock_irqsave(&id->lock, flags); + if (new_level == DEBUG_OFF_LEVEL) { + id->level = DEBUG_OFF_LEVEL; + pr_info("%s: switched off\n", id->name); + } else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) { pr_info("%s: level %i is out of range (%i - %i)\n", - id->name, new_level, 0, DEBUG_MAX_LEVEL); - } else { - id->level = new_level; - } - spin_unlock_irqrestore(&id->lock,flags); + id->name, new_level, 0, DEBUG_MAX_LEVEL); + } else { + id->level = new_level; + } + spin_unlock_irqrestore(&id->lock, flags); } EXPORT_SYMBOL(debug_set_level); @@ -820,12 +770,10 @@ EXPORT_SYMBOL(debug_set_level); * proceed_active_entry: * - set active entry to next in the ring buffer */ - -static inline void -proceed_active_entry(debug_info_t * id) +static inline void proceed_active_entry(debug_info_t *id) { if ((id->active_entries[id->active_area] += id->entry_size) - > (PAGE_SIZE - id->entry_size)){ + > (PAGE_SIZE - id->entry_size)) { id->active_entries[id->active_area] = 0; 
id->active_pages[id->active_area] = (id->active_pages[id->active_area] + 1) % @@ -837,9 +785,7 @@ proceed_active_entry(debug_info_t * id) * proceed_active_area: * - set active area to next in the ring buffer */ - -static inline void -proceed_active_area(debug_info_t * id) +static inline void proceed_active_area(debug_info_t *id) { id->active_area++; id->active_area = id->active_area % id->nr_areas; @@ -848,13 +794,11 @@ proceed_active_area(debug_info_t * id) /* * get_active_entry: */ - -static inline debug_entry_t* -get_active_entry(debug_info_t * id) +static inline debug_entry_t *get_active_entry(debug_info_t *id) { return (debug_entry_t *) (((char *) id->areas[id->active_area] - [id->active_pages[id->active_area]]) + - id->active_entries[id->active_area]); + [id->active_pages[id->active_area]]) + + id->active_entries[id->active_area]); } /* @@ -862,23 +806,22 @@ get_active_entry(debug_info_t * id) * - set timestamp, caller address, cpu number etc. */ -static inline void -debug_finish_entry(debug_info_t * id, debug_entry_t* active, int level, - int exception) +static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active, + int level, int exception) { active->id.stck = get_tod_clock_fast() - *(unsigned long long *) &tod_clock_base[1]; active->id.fields.cpuid = smp_processor_id(); active->caller = __builtin_return_address(0); active->id.fields.exception = exception; - active->id.fields.level = level; + active->id.fields.level = level; proceed_active_entry(id); - if(exception) + if (exception) proceed_active_area(id); } -static int debug_stoppable=1; -static int debug_active=1; +static int debug_stoppable = 1; +static int debug_active = 1; #define CTL_S390DBF_STOPPABLE 5678 #define CTL_S390DBF_ACTIVE 5679 @@ -888,9 +831,8 @@ static int debug_active=1; * always allow read, allow write only if debug_stoppable is set or * if debug_active is already off */ -static int -s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int s390dbf_procactive(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); @@ -898,39 +840,37 @@ s390dbf_procactive(struct ctl_table *table, int write, return 0; } - static struct ctl_table s390dbf_table[] = { { - .procname = "debug_stoppable", + .procname = "debug_stoppable", .data = &debug_stoppable, .maxlen = sizeof(int), - .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec, + .mode = S_IRUGO | S_IWUSR, + .proc_handler = proc_dointvec, }, - { - .procname = "debug_active", + { + .procname = "debug_active", .data = &debug_active, .maxlen = sizeof(int), - .mode = S_IRUGO | S_IWUSR, - .proc_handler = s390dbf_procactive, + .mode = S_IRUGO | S_IWUSR, + .proc_handler = s390dbf_procactive, }, { } }; static struct ctl_table s390dbf_dir_table[] = { { - .procname = "s390dbf", - .maxlen = 0, - .mode = S_IRUGO | S_IXUGO, - .child = s390dbf_table, + .procname = "s390dbf", + .maxlen = 0, + .mode = S_IRUGO | S_IXUGO, + .child = s390dbf_table, }, { } }; static struct ctl_table_header *s390dbf_sysctl_header; -void -debug_stop_all(void) +void debug_stop_all(void) { if (debug_stoppable) debug_active = 0; @@ -946,20 +886,20 @@ void debug_set_critical(void) * debug_event_common: * - write debug entry with given size */ - -debug_entry_t* -debug_event_common(debug_info_t * id, int level, const void *buf, int len) +debug_entry_t *debug_event_common(debug_info_t *id, int level, 
const void *buf, + int len) { - unsigned long flags; debug_entry_t *active; + unsigned long flags; if (!debug_active || !id->areas) return NULL; if (debug_critical) { if (!spin_trylock_irqsave(&id->lock, flags)) return NULL; - } else + } else { spin_lock_irqsave(&id->lock, flags); + } active = get_active_entry(id); memset(DEBUG_DATA(active), 0, id->buf_size); memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); @@ -974,20 +914,20 @@ EXPORT_SYMBOL(debug_event_common); * debug_exception_common: * - write debug entry with given size and switch to next debug area */ - -debug_entry_t -*debug_exception_common(debug_info_t * id, int level, const void *buf, int len) +debug_entry_t *debug_exception_common(debug_info_t *id, int level, + const void *buf, int len) { - unsigned long flags; debug_entry_t *active; + unsigned long flags; if (!debug_active || !id->areas) return NULL; if (debug_critical) { if (!spin_trylock_irqsave(&id->lock, flags)) return NULL; - } else + } else { spin_lock_irqsave(&id->lock, flags); + } active = get_active_entry(id); memset(DEBUG_DATA(active), 0, id->buf_size); memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); @@ -1001,47 +941,44 @@ EXPORT_SYMBOL(debug_exception_common); /* * counts arguments in format string for sprintf view */ - -static inline int -debug_count_numargs(char *string) +static inline int debug_count_numargs(char *string) { - int numargs=0; + int numargs = 0; - while(*string) { - if(*string++=='%') + while (*string) { + if (*string++ == '%') numargs++; } - return(numargs); + return numargs; } /* * debug_sprintf_event: */ - -debug_entry_t* -__debug_sprintf_event(debug_info_t *id, int level, char *string, ...) +debug_entry_t *__debug_sprintf_event(debug_info_t *id, int level, char *string, ...) { - va_list ap; - int numargs,idx; - unsigned long flags; debug_sprintf_entry_t *curr_event; debug_entry_t *active; + unsigned long flags; + int numargs, idx; + va_list ap; if (!debug_active || !id->areas) return NULL; - numargs=debug_count_numargs(string); + numargs = debug_count_numargs(string); if (debug_critical) { if (!spin_trylock_irqsave(&id->lock, flags)) return NULL; - } else + } else { spin_lock_irqsave(&id->lock, flags); + } active = get_active_entry(id); - curr_event=(debug_sprintf_entry_t *) DEBUG_DATA(active); - va_start(ap,string); - curr_event->string=string; - for(idx=0;idx<min(numargs,(int)(id->buf_size / sizeof(long))-1);idx++) - curr_event->args[idx]=va_arg(ap,long); + curr_event = (debug_sprintf_entry_t *) DEBUG_DATA(active); + va_start(ap, string); + curr_event->string = string; + for (idx = 0; idx < min(numargs, (int)(id->buf_size / sizeof(long)) - 1); idx++) + curr_event->args[idx] = va_arg(ap, long); va_end(ap); debug_finish_entry(id, active, level, 0); spin_unlock_irqrestore(&id->lock, flags); @@ -1053,32 +990,31 @@ EXPORT_SYMBOL(__debug_sprintf_event); /* * debug_sprintf_exception: */ - -debug_entry_t* -__debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) +debug_entry_t *__debug_sprintf_exception(debug_info_t *id, int level, char *string, ...)
{ - va_list ap; - int numargs,idx; - unsigned long flags; debug_sprintf_entry_t *curr_event; debug_entry_t *active; + unsigned long flags; + int numargs, idx; + va_list ap; if (!debug_active || !id->areas) return NULL; - numargs=debug_count_numargs(string); + numargs = debug_count_numargs(string); if (debug_critical) { if (!spin_trylock_irqsave(&id->lock, flags)) return NULL; - } else + } else { spin_lock_irqsave(&id->lock, flags); + } active = get_active_entry(id); - curr_event=(debug_sprintf_entry_t *)DEBUG_DATA(active); - va_start(ap,string); - curr_event->string=string; - for(idx=0;idx<min(numargs,(int)(id->buf_size / sizeof(long))-1);idx++) - curr_event->args[idx]=va_arg(ap,long); + curr_event = (debug_sprintf_entry_t *)DEBUG_DATA(active); + va_start(ap, string); + curr_event->string = string; + for (idx = 0; idx < min(numargs, (int)(id->buf_size / sizeof(long)) - 1); idx++) + curr_event->args[idx] = va_arg(ap, long); va_end(ap); debug_finish_entry(id, active, level, 1); spin_unlock_irqrestore(&id->lock, flags); @@ -1090,15 +1026,13 @@ EXPORT_SYMBOL(__debug_sprintf_exception); /* * debug_register_view: */ - -int -debug_register_view(debug_info_t * id, struct debug_view *view) +int debug_register_view(debug_info_t *id, struct debug_view *view) { + unsigned long flags; + struct dentry *pde; + umode_t mode; int rc = 0; int i; - unsigned long flags; - umode_t mode; - struct dentry *pde; if (!id) goto out; @@ -1108,10 +1042,10 @@ debug_register_view(debug_info_t * id, struct debug_view *view) if (!view->input_proc) mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); pde = debugfs_create_file(view->name, mode, id->debugfs_root_entry, - id , &debug_file_ops); - if (!pde){ + id, &debug_file_ops); + if (!pde) { pr_err("Registering view %s/%s failed due to out of " - "memory\n", id->name,view->name); + "memory\n", id->name, view->name); rc = -1; goto out; } @@ -1139,9 +1073,7 @@ EXPORT_SYMBOL(debug_register_view); /* * debug_unregister_view: */ - -int -debug_unregister_view(debug_info_t * id, struct debug_view *view) +int debug_unregister_view(debug_info_t *id, struct debug_view *view) { struct dentry *dentry = NULL; unsigned long flags; @@ -1154,9 +1086,9 @@ debug_unregister_view(debug_info_t * id, struct debug_view *view) if (id->views[i] == view) break; } - if (i == DEBUG_MAX_VIEWS) + if (i == DEBUG_MAX_VIEWS) { rc = -1; - else { + } else { dentry = id->debugfs_entries[i]; id->views[i] = NULL; id->debugfs_entries[i] = NULL; @@ -1168,10 +1100,10 @@ out: } EXPORT_SYMBOL(debug_unregister_view); -static inline char * -debug_get_user_string(const char __user *user_buf, size_t user_len) +static inline char *debug_get_user_string(const char __user *user_buf, + size_t user_len) { - char* buffer; + char *buffer; buffer = kmalloc(user_len + 1, GFP_KERNEL); if (!buffer) @@ -1185,19 +1117,17 @@ debug_get_user_string(const char __user *user_buf, size_t user_len) buffer[user_len - 1] = 0; else buffer[user_len] = 0; - return buffer; + return buffer; } -static inline int -debug_get_uint(char *buf) +static inline int debug_get_uint(char *buf) { int rc; buf = skip_spaces(buf); rc = simple_strtoul(buf, &buf, 10); - if(*buf){ + if (*buf) rc = -EINVAL; - } return rc; } @@ -1210,9 +1140,8 @@ debug_get_uint(char *buf) * prints out actual debug level */ -static int -debug_prolog_pages_fn(debug_info_t * id, - struct debug_view *view, char *out_buf) +static int debug_prolog_pages_fn(debug_info_t *id, struct debug_view *view, + char *out_buf) { return sprintf(out_buf, "%i\n", id->pages_per_area); } @@ -1221,32 +1150,31 @@
debug_prolog_pages_fn(debug_info_t * id, * reads new size (number of pages per debug area) */ -static int -debug_input_pages_fn(debug_info_t * id, struct debug_view *view, - struct file *file, const char __user *user_buf, - size_t user_len, loff_t * offset) +static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) { + int rc, new_pages; char *str; - int rc,new_pages; if (user_len > 0x10000) - user_len = 0x10000; - if (*offset != 0){ + user_len = 0x10000; + if (*offset != 0) { rc = -EPIPE; goto out; } - str = debug_get_user_string(user_buf,user_len); - if(IS_ERR(str)){ + str = debug_get_user_string(user_buf, user_len); + if (IS_ERR(str)) { rc = PTR_ERR(str); goto out; } new_pages = debug_get_uint(str); - if(new_pages < 0){ + if (new_pages < 0) { rc = -EINVAL; goto free_str; } - rc = debug_set_size(id,id->nr_areas, new_pages); - if(rc != 0){ + rc = debug_set_size(id, id->nr_areas, new_pages); + if (rc != 0) { rc = -EINVAL; goto free_str; } @@ -1261,52 +1189,47 @@ out: /* * prints out actual debug level */ - -static int -debug_prolog_level_fn(debug_info_t * id, struct debug_view *view, char *out_buf) +static int debug_prolog_level_fn(debug_info_t *id, struct debug_view *view, + char *out_buf) { int rc = 0; - if(id->level == DEBUG_OFF_LEVEL) { - rc = sprintf(out_buf,"-\n"); - } - else { + if (id->level == DEBUG_OFF_LEVEL) + rc = sprintf(out_buf, "-\n"); + else rc = sprintf(out_buf, "%i\n", id->level); - } return rc; } /* * reads new debug level */ - -static int -debug_input_level_fn(debug_info_t * id, struct debug_view *view, - struct file *file, const char __user *user_buf, - size_t user_len, loff_t * offset) +static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) { + int rc, new_level; char *str; - int rc,new_level; if (user_len > 0x10000) - user_len = 0x10000; - if (*offset != 0){ + user_len = 0x10000; + if (*offset != 0) { rc = -EPIPE; goto out; } - str = debug_get_user_string(user_buf,user_len); - if(IS_ERR(str)){ + str = debug_get_user_string(user_buf, user_len); + if (IS_ERR(str)) { rc = PTR_ERR(str); goto out; } - if(str[0] == '-'){ + if (str[0] == '-') { debug_set_level(id, DEBUG_OFF_LEVEL); rc = user_len; goto free_str; } else { new_level = debug_get_uint(str); } - if(new_level < 0) { + if (new_level < 0) { pr_warn("%s is not a valid level for a debug feature\n", str); rc = -EINVAL; } else { @@ -1320,99 +1243,90 @@ out: return rc; /* number of input characters */ } - /* * flushes debug areas */ - -static void debug_flush(debug_info_t* id, int area) +static void debug_flush(debug_info_t *id, int area) { - unsigned long flags; - int i,j; + unsigned long flags; + int i, j; - if(!id || !id->areas) - return; - spin_lock_irqsave(&id->lock,flags); - if(area == DEBUG_FLUSH_ALL){ - id->active_area = 0; - memset(id->active_entries, 0, id->nr_areas * sizeof(int)); - for (i = 0; i < id->nr_areas; i++) { + if (!id || !id->areas) + return; + spin_lock_irqsave(&id->lock, flags); + if (area == DEBUG_FLUSH_ALL) { + id->active_area = 0; + memset(id->active_entries, 0, id->nr_areas * sizeof(int)); + for (i = 0; i < id->nr_areas; i++) { id->active_pages[i] = 0; - for(j = 0; j < id->pages_per_area; j++) { - memset(id->areas[i][j], 0, PAGE_SIZE); - } + for (j = 0; j < id->pages_per_area; j++) + memset(id->areas[i][j], 0, PAGE_SIZE); } - } else if(area >= 0 && area < id->nr_areas) { - 
id->active_entries[area] = 0; + } else if (area >= 0 && area < id->nr_areas) { + id->active_entries[area] = 0; id->active_pages[area] = 0; - for(i = 0; i < id->pages_per_area; i++) { - memset(id->areas[area][i],0,PAGE_SIZE); - } - } - spin_unlock_irqrestore(&id->lock,flags); + for (i = 0; i < id->pages_per_area; i++) + memset(id->areas[area][i], 0, PAGE_SIZE); + } + spin_unlock_irqrestore(&id->lock, flags); } /* - * view function: flushes debug areas + * view function: flushes debug areas */ - -static int -debug_input_flush_fn(debug_info_t * id, struct debug_view *view, - struct file *file, const char __user *user_buf, - size_t user_len, loff_t * offset) +static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) { - char input_buf[1]; - int rc = user_len; + char input_buf[1]; + int rc = user_len; if (user_len > 0x10000) - user_len = 0x10000; - if (*offset != 0){ + user_len = 0x10000; + if (*offset != 0) { rc = -EPIPE; - goto out; + goto out; + } + if (copy_from_user(input_buf, user_buf, 1)) { + rc = -EFAULT; + goto out; + } + if (input_buf[0] == '-') { + debug_flush(id, DEBUG_FLUSH_ALL); + goto out; + } + if (isdigit(input_buf[0])) { + int area = ((int) input_buf[0] - (int) '0'); + + debug_flush(id, area); + goto out; } - if (copy_from_user(input_buf, user_buf, 1)){ - rc = -EFAULT; - goto out; - } - if(input_buf[0] == '-') { - debug_flush(id, DEBUG_FLUSH_ALL); - goto out; - } - if (isdigit(input_buf[0])) { - int area = ((int) input_buf[0] - (int) '0'); - debug_flush(id, area); - goto out; - } pr_info("Flushing debug data failed because %c is not a valid " "area\n", input_buf[0]); out: - *offset += user_len; - return rc; /* number of input characters */ + *offset += user_len; + return rc; /* number of input characters */ } /* * prints debug header in raw format */ - -static int -debug_raw_header_fn(debug_info_t * id, struct debug_view *view, - int area, debug_entry_t * entry, char *out_buf) +static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf) { - int rc; + int rc; rc = sizeof(debug_entry_t); - memcpy(out_buf,entry,sizeof(debug_entry_t)); - return rc; + memcpy(out_buf, entry, sizeof(debug_entry_t)); + return rc; } /* * prints debug data in raw format */ - -static int -debug_raw_format_fn(debug_info_t * id, struct debug_view *view, +static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view, char *out_buf, const char *in_buf) { int rc; @@ -1425,20 +1339,17 @@ debug_raw_format_fn(debug_info_t * id, struct debug_view *view, /* * prints debug data in hex/ascii format */ - -static int -debug_hex_ascii_format_fn(debug_info_t * id, struct debug_view *view, - char *out_buf, const char *in_buf) +static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *in_buf) { int i, rc = 0; - for (i = 0; i < id->buf_size; i++) { - rc += sprintf(out_buf + rc, "%02x ", - ((unsigned char *) in_buf)[i]); - } + for (i = 0; i < id->buf_size; i++) + rc += sprintf(out_buf + rc, "%02x ", ((unsigned char *) in_buf)[i]); rc += sprintf(out_buf + rc, "| "); for (i = 0; i < id->buf_size; i++) { unsigned char c = in_buf[i]; + if (isascii(c) && isprint(c)) rc += sprintf(out_buf + rc, "%c", c); else @@ -1451,16 +1362,14 @@ debug_hex_ascii_format_fn(debug_info_t * id, struct debug_view *view, /* * prints header for debug entry */ - -int -debug_dflt_header_fn(debug_info_t * id, struct debug_view 
*view, - int area, debug_entry_t * entry, char *out_buf) +int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf) { unsigned long base, sec, usec; - char *except_str; unsigned long caller; - int rc = 0; unsigned int level; + char *except_str; + int rc = 0; level = entry->id.fields.level; base = (*(unsigned long *) &tod_clock_base[0]) >> 4; @@ -1486,19 +1395,18 @@ EXPORT_SYMBOL(debug_dflt_header_fn); #define DEBUG_SPRINTF_MAX_ARGS 10 -static int -debug_sprintf_format_fn(debug_info_t * id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event) +static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, debug_sprintf_entry_t *curr_event) { - int num_longs, num_used_args = 0,i, rc = 0; + int num_longs, num_used_args = 0, i, rc = 0; int index[DEBUG_SPRINTF_MAX_ARGS]; /* count of longs fit into one entry */ - num_longs = id->buf_size / sizeof(long); + num_longs = id->buf_size / sizeof(long); - if(num_longs < 1) + if (num_longs < 1) goto out; /* bufsize of entry too small */ - if(num_longs == 1) { + if (num_longs == 1) { /* no args, we use only the string */ strcpy(out_buf, curr_event->string); rc = strlen(curr_event->string); @@ -1506,22 +1414,20 @@ debug_sprintf_format_fn(debug_info_t * id, struct debug_view *view, } /* number of arguments used for sprintf (without the format string) */ - num_used_args = min(DEBUG_SPRINTF_MAX_ARGS, (num_longs - 1)); + num_used_args = min(DEBUG_SPRINTF_MAX_ARGS, (num_longs - 1)); - memset(index,0, DEBUG_SPRINTF_MAX_ARGS * sizeof(int)); + memset(index, 0, DEBUG_SPRINTF_MAX_ARGS * sizeof(int)); - for(i = 0; i < num_used_args; i++) + for (i = 0; i < num_used_args; i++) index[i] = i; - rc = sprintf(out_buf, curr_event->string, curr_event->args[index[0]], - curr_event->args[index[1]], curr_event->args[index[2]], - curr_event->args[index[3]], curr_event->args[index[4]], - curr_event->args[index[5]], curr_event->args[index[6]], - curr_event->args[index[7]], curr_event->args[index[8]], - curr_event->args[index[9]]); - + rc = sprintf(out_buf, curr_event->string, curr_event->args[index[0]], + curr_event->args[index[1]], curr_event->args[index[2]], + curr_event->args[index[3]], curr_event->args[index[4]], + curr_event->args[index[5]], curr_event->args[index[6]], + curr_event->args[index[7]], curr_event->args[index[8]], + curr_event->args[index[9]]); out: - return rc; } From fe3af62553896d2a6b13c931137406681d76c3e4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 4 Jul 2017 07:40:04 +0200 Subject: [PATCH 0366/1085] s390: update defconfig Signed-off-by: Martin Schwidefsky --- arch/s390/configs/default_defconfig | 11 ++++------- arch/s390/configs/gcov_defconfig | 9 +++------ arch/s390/configs/performance_defconfig | 9 +++------ arch/s390/defconfig | 3 --- 4 files changed, 10 insertions(+), 22 deletions(-) diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 282072206df7..84eccc88c065 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -69,7 +69,6 @@ CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y -CONFIG_CMA=y CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y CONFIG_MEM_SOFT_DIRTY=y @@ -379,7 +378,6 @@ CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_OSD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_BLK_DEV_RAM_DAX=y @@ -416,7 +414,6 @@ CONFIG_SCSI_OSD_ULD=m CONFIG_MD=y 
CONFIG_BLK_DEV_MD=y CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m CONFIG_MD_MULTIPATH=m CONFIG_MD_FAULTY=m CONFIG_BLK_DEV_DM=m @@ -483,6 +480,8 @@ CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m +CONFIG_VFIO=m +CONFIG_VFIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y @@ -599,7 +598,6 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_WQ_WATCHDOG=y CONFIG_PANIC_ON_OOPS=y CONFIG_DEBUG_TIMEKEEPING=y -CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y CONFIG_PROVE_LOCKING=y CONFIG_LOCK_STAT=y @@ -629,10 +627,8 @@ CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENTS=y CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y -CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_LKDTM=m CONFIG_TEST_LIST_SORT=y CONFIG_TEST_SORT=y @@ -649,6 +645,7 @@ CONFIG_ENCRYPTED_KEYS=m CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_HARDENED_USERCOPY=y +CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 @@ -705,12 +702,12 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_ZCRYPT=m CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_CRC32_S390=y CONFIG_ASYMMETRIC_KEY_TYPE=y diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index 3c6b78189fbc..f7202358e6d7 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -70,7 +70,6 @@ CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y -CONFIG_CMA=y CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZSWAP=y CONFIG_ZBUD=m @@ -376,7 +375,6 @@ CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_OSD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_BLK_DEV_RAM_DAX=y @@ -412,7 +410,6 @@ CONFIG_SCSI_OSD_ULD=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m CONFIG_MD_MULTIPATH=m CONFIG_MD_FAULTY=m CONFIG_BLK_DEV_DM=m @@ -479,6 +476,8 @@ CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m +CONFIG_VFIO=m +CONFIG_VFIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y @@ -575,10 +574,8 @@ CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENTS=y CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y -CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y @@ -650,12 +647,12 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_ZCRYPT=m CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_CRC32_S390=y CONFIG_CRC7=m diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 653d72bcc007..03100fe74ea8 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -68,7 +68,6 @@ CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y -CONFIG_CMA=y CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZSWAP=y CONFIG_ZBUD=m @@ -374,7 +373,6 @@ CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m 
CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_OSD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_BLK_DEV_RAM_DAX=y @@ -410,7 +408,6 @@ CONFIG_SCSI_OSD_ULD=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m CONFIG_MD_MULTIPATH=m CONFIG_MD_FAULTY=m CONFIG_BLK_DEV_DM=m @@ -477,6 +474,8 @@ CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m +CONFIG_VFIO=m +CONFIG_VFIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y @@ -573,10 +572,8 @@ CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENTS=y CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y -CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y @@ -648,12 +645,12 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_ZCRYPT=m CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_CRC32_S390=y CONFIG_CRC7=m diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 20244a38c886..46a3178d8bc6 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -53,7 +53,6 @@ CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y -CONFIG_CMA=y CONFIG_ZSWAP=y CONFIG_ZBUD=m CONFIG_ZSMALLOC=m @@ -163,7 +162,6 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_PAGEALLOC=y CONFIG_DETECT_HUNG_TASK=y CONFIG_PANIC_ON_OOPS=y -CONFIG_DEBUG_RT_MUTEXES=y CONFIG_PROVE_LOCKING=y CONFIG_LOCK_STAT=y CONFIG_DEBUG_LOCKDEP=y @@ -179,7 +177,6 @@ CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_FUNCTION_PROFILER=y -CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_KPROBES_SANITY_TEST=y CONFIG_S390_PTDUMP=y CONFIG_CRYPTO_CRYPTD=m From 688c29533ffc969773bc860d07d3a5f1995f5878 Mon Sep 17 00:00:00 2001 From: Dong Jia Shi Date: Wed, 11 Oct 2017 04:38:21 +0200 Subject: [PATCH 0367/1085] vfio: ccw: bypass bad idaw address when fetching IDAL ccws We currently return the same error code (-EFAULT) to indicate two different error cases: 1. a bug in the vfio-ccw implementation has been found. 2. a buggy channel program has been detected. This makes it difficult for a userland program (specifically QEMU) to handle. Let's use -EFAULT to only indicate the first case. For the second case, we simply hand over the ccws to the lower level for further handling. Note: Once a bad idaw address is detected, the current behavior is to suppress the ssch. With this fix, the channel program will be accepted, and part of the channel program (the part ahead of the bad idaw) could possibly be executed by the device before I/O conclusion.
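To illustrate the resulting convention (a sketch, not part of the patch; the helper names are hypothetical and only the split of return codes is taken from this commit), a caller of the translation code such as cp_prefetch() can now treat the outcomes as:

	ret = cp_prefetch(&cp);
	if (ret == -EFAULT)
		report_driver_bug();	/* hypothetical: a vfio-ccw bug */
	else if (!ret)
		start_io(&cp);		/* hypothetical: a bad idaw, if any,
					 * now fails on the device instead */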
Suggested-by: Halil Pasic Reviewed-by: Pierre Morel Signed-off-by: Dong Jia Shi Message-Id: <20171011023822.42948-2-bjsdjshi@linux.vnet.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 5ccfdc80d0ec..722f8b8c7273 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -569,10 +569,6 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, for (i = 0; i < idaw_nr; i++) { idaw_iova = *(idaws + i); - if (IS_ERR_VALUE(idaw_iova)) { - ret = -EFAULT; - goto out_free_idaws; - } ret = pfn_array_alloc_pin(pat->pat_pa + i, cp->mdev, idaw_iova, 1); From 4cebc5d6a6ff41c0266d235aa4854b062d34ad09 Mon Sep 17 00:00:00 2001 From: Dong Jia Shi Date: Wed, 11 Oct 2017 04:38:22 +0200 Subject: [PATCH 0368/1085] vfio: ccw: validate the count field of a ccw before pinning If the count field of a ccw is zero, there is no need to try to pin page(s) for it. Let's check the count value before starting pinning operations. Reviewed-by: Pierre Morel Signed-off-by: Dong Jia Shi Message-Id: <20171011023822.42948-3-bjsdjshi@linux.vnet.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 722f8b8c7273..d8f98ad9b029 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -105,7 +105,10 @@ static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev, { int ret = 0; - if (!len || pa->pa_nr) + if (!len) + return 0; + + if (pa->pa_nr) return -EINVAL; pa->pa_iova = iova; @@ -501,6 +504,16 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, ccw = chain->ch_ccw + idx; + if (!ccw->count) { + /* + * We just want the translation result of any direct ccw + * to be an IDA ccw, so let's add the IDA flag for it. + * Although the flag will be ignored by firmware. + */ + ccw->flags |= CCW_FLAG_IDA; + return 0; + } + /* * Pin data page(s) in memory. * The number of pages actually is the count of the idaws which will be @@ -541,6 +554,9 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, ccw = chain->ch_ccw + idx; + if (!ccw->count) + return 0; + /* Calculate size of idaws. */ ret = copy_from_iova(cp->mdev, &idaw_iova, ccw->cda, sizeof(idaw_iova)); if (ret) From 62518c02f75ff9b19c07b53b8e13ed878211b795 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Mon, 16 Oct 2017 18:04:22 +0200 Subject: [PATCH 0369/1085] irqchip/irq-omap-intc: Remove omap3_init_irq() All mach-omap2 variants are device tree only now, so this function is dead code. Remove it. 
Signed-off-by: Ladislav Michl Signed-off-by: Thomas Gleixner Acked-by: Tony Lindgren Cc: Marc Zyngier Cc: linux-omap@vger.kernel.org Cc: Jason Cooper Link: https://lkml.kernel.org/r/20171016160422.uu2i7vvrgy7cc4aw@lenoch --- drivers/irqchip/irq-omap-intc.c | 12 ------------ include/linux/irqchip/irq-omap-intc.h | 2 -- 2 files changed, 14 deletions(-) diff --git a/drivers/irqchip/irq-omap-intc.c b/drivers/irqchip/irq-omap-intc.c index b04a8ac6e744..05f7f06dd711 100644 --- a/drivers/irqchip/irq-omap-intc.c +++ b/drivers/irqchip/irq-omap-intc.c @@ -25,10 +25,6 @@ #include <linux/irqchip/irq-omap-intc.h> -/* Define these here for now until we drop all board-files */ -#define OMAP24XX_IC_BASE 0x480fe000 -#define OMAP34XX_IC_BASE 0x48200000 - /* selected INTC register offsets */ #define INTC_REVISION 0x0000 @@ -364,14 +360,6 @@ omap_intc_handle_irq(struct pt_regs *regs) handle_domain_irq(domain, irqnr, regs); } -void __init omap3_init_irq(void) -{ - omap_nr_irqs = 96; - omap_nr_pending = 3; - omap_init_irq(OMAP34XX_IC_BASE, NULL); - set_handle_irq(omap_intc_handle_irq); -} - static int __init intc_of_init(struct device_node *node, struct device_node *parent) { diff --git a/include/linux/irqchip/irq-omap-intc.h b/include/linux/irqchip/irq-omap-intc.h index 2e3d1afeb674..f19ccee7749f 100644 --- a/include/linux/irqchip/irq-omap-intc.h +++ b/include/linux/irqchip/irq-omap-intc.h @@ -18,8 +18,6 @@ #ifndef __INCLUDE_LINUX_IRQCHIP_IRQ_OMAP_INTC_H #define __INCLUDE_LINUX_IRQCHIP_IRQ_OMAP_INTC_H -void omap3_init_irq(void); - int omap_irq_pending(void); void omap_intc_save_context(void); void omap_intc_restore_context(void); From 77c858fa343bb09cc3fe1456fda5710c974845b9 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Mon, 16 Oct 2017 18:13:03 +0200 Subject: [PATCH 0370/1085] irqchip/irq-omap-intc: Do not statically initialize variables The omap_nr_pending and omap_nr_irqs variables are initialized right at the beginning of the intc_of_init() function, so there is no need to initialize them statically. Signed-off-by: Ladislav Michl Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Tony Lindgren Cc: linux-omap@vger.kernel.org Cc: Jason Cooper Link: https://lkml.kernel.org/r/20171016161303.veumgcd3xom5c54r@lenoch --- drivers/irqchip/irq-omap-intc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-omap-intc.c b/drivers/irqchip/irq-omap-intc.c index 05f7f06dd711..d360a6eddd6d 100644 --- a/drivers/irqchip/irq-omap-intc.c +++ b/drivers/irqchip/irq-omap-intc.c @@ -66,8 +66,8 @@ static struct omap_intc_regs intc_context; static struct irq_domain *domain; static void __iomem *omap_irq_base; -static int omap_nr_pending = 3; -static int omap_nr_irqs = 96; +static int omap_nr_pending; +static int omap_nr_irqs; static void intc_writel(u32 reg, u32 val) { From c94fb639d5462027004ed8f5f71288955688a4ae Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2017 11:04:33 -0700 Subject: [PATCH 0371/1085] irqchip: Add Kconfig menu Add a menu for IRQ chip drivers. This makes the Device drivers menu more consistent (listing "subsystems" instead of specific options) and ensures the IRQCHIP options are listed in the expected places for 'make menu|xconfig'.
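Condensed, the change below has the standard Kconfig menu shape (a sketch showing only the first entry; the existing IRQCHIP entries in between are unchanged):

	menu "IRQ chip support"

	config IRQCHIP
		def_bool y
		depends on OF_IRQ

	# ... remaining irqchip entries ...

	endmenu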
Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Andrew Morton Cc: Jason Cooper Link: https://lkml.kernel.org/r/3db7385a-c6a1-5c93-0797-6f4b6b2b2cde@infradead.org --- drivers/irqchip/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 9d8a1dd2e2c2..77df38ed0050 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -1,3 +1,5 @@ +menu "IRQ chip support" + config IRQCHIP def_bool y depends on OF_IRQ @@ -321,3 +323,5 @@ config IRQ_UNIPHIER_AIDET select IRQ_DOMAIN_HIERARCHY help Support for the UniPhier AIDET (ARM Interrupt Detector). + +endmenu From 341102c3ef29c33611586072363cf9982a8bdb77 Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Thu, 12 Oct 2017 11:32:02 -0500 Subject: [PATCH 0372/1085] x86/tsc: Add option that TSC on Socket 0 being non-zero is valid Add a flag to indicate and process that TSC counters are on chassis that reset at different times during system startup. Therefore which TSC ADJUST values should be zero is not predictable. Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Reviewed-by: Dimitri Sivanich Reviewed-by: Russ Anderson Reviewed-by: Andrew Banman Reviewed-by: Peter Zijlstra Cc: Prarit Bhargava Cc: Andrew Banman Cc: Bin Gao Link: https://lkml.kernel.org/r/20171012163201.944370012@stormcage.americas.sgi.com --- arch/x86/include/asm/tsc.h | 2 ++ arch/x86/kernel/tsc_sync.c | 39 ++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index d0509c75e150..79125f3609c4 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -36,11 +36,13 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); +extern void mark_tsc_async_resets(char *reason); extern unsigned long native_calibrate_cpu(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; +extern bool tsc_async_resets; /* * Boot-time check whether the TSCs are synchronized across diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 7842371bc9e4..3873dcdc7d7b 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -30,6 +30,20 @@ struct tsc_adjust { static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); +/* + * TSC's on different sockets may be reset asynchronously. + * This may cause the TSC ADJUST value on socket 0 to be NOT 0. + */ +bool __read_mostly tsc_async_resets; + +void mark_tsc_async_resets(char *reason) +{ + if (tsc_async_resets) + return; + tsc_async_resets = true; + pr_info("tsc: Marking TSC async resets true due to %s\n", reason); +} + void tsc_verify_tsc_adjust(bool resume) { struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); @@ -71,12 +85,22 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, * non zero. We don't do that on non boot cpus because physical * hotplug should have set the ADJUST register to a value > 0 so * the TSC is in sync with the already running cpus. + * + * Also don't force the ADJUST value to zero if that is a valid value + * for socket 0 as determined by the system arch. This is required + * when multiple sockets are reset asynchronously with each other + * and socket 0 may not have an TSC ADJUST value of 0. 
*/ if (bootcpu && bootval != 0) { - pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, - bootval); - wrmsrl(MSR_IA32_TSC_ADJUST, 0); - bootval = 0; + if (likely(!tsc_async_resets)) { + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", + cpu, bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } else { + pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n", + cpu, bootval); + } } cur->adjusted = bootval; } @@ -117,6 +141,13 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) cur->nextcheck = jiffies + HZ; cur->warned = false; + /* + * If a non-zero TSC value for socket 0 may be valid then the default + * adjusted value cannot assumed to be zero either. + */ + if (tsc_async_resets) + cur->adjusted = bootval; + /* * Check whether this CPU is the first in a package to come up. In * this case do not check the boot value against another package From 9514ececa52e9f1436e7682e98c852d1338b699f Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Thu, 12 Oct 2017 11:32:03 -0500 Subject: [PATCH 0373/1085] x86/tsc: Skip TSC test and error messages if already unstable If the TSC has already been determined to be unstable, then checking TSC ADJUST values is a waste of time and generates unnecessary error messages. Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Reviewed-by: Dimitri Sivanich Reviewed-by: Russ Anderson Reviewed-by: Peter Zijlstra Cc: Prarit Bhargava Cc: Andrew Banman Cc: Bin Gao Link: https://lkml.kernel.org/r/20171012163202.060777495@stormcage.americas.sgi.com --- arch/x86/kernel/tsc_sync.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 3873dcdc7d7b..3bdb983fd11b 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -52,6 +52,10 @@ void tsc_verify_tsc_adjust(bool resume) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return; + /* Skip unnecessary error messages if TSC already unstable */ + if (check_tsc_unstable()) + return; + /* Rate limit the MSR check */ if (!resume && time_before(jiffies, adj->nextcheck)) return; @@ -114,6 +118,10 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return false; + /* Skip unnecessary error messages if TSC already unstable */ + if (check_tsc_unstable()) + return false; + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; From 41e7864ab5ce4ec36e89a9f55d8d9dfe19b0392c Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Thu, 12 Oct 2017 11:32:04 -0500 Subject: [PATCH 0374/1085] x86/tsc: Drastically reduce the number of firmware bug warnings Prior to the TSC ADJUST MSR being available, the method to set TSC's in sync with each other naturally caused a small skew between cpu threads. This was NOT a firmware bug at the time so introducing a whole avalanche of alarming warning messages might cause unnecessary concern and customer complaints. (Example: >3000 msgs in a 32 socket Skylake system.) Simply report the warning condition, if possible do the necessary fixes, and move on. 
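The message reduction relies on printk_once(), which emits a given message at most once per boot. A minimal sketch of that mechanism (the real macro in include/linux/printk.h differs in detail):

	#define printk_once_sketch(fmt, ...)		\
	({						\
		static bool __already_done;		\
							\
		if (!__already_done) {			\
			__already_done = true;		\
			printk(fmt, ##__VA_ARGS__);	\
		}					\
	})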
Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Reviewed-by: Dimitri Sivanich Reviewed-by: Russ Anderson Reviewed-by: Peter Zijlstra Cc: Prarit Bhargava Cc: Andrew Banman Cc: Bin Gao Link: https://lkml.kernel.org/r/20171012163202.175062400@stormcage.americas.sgi.com --- arch/x86/kernel/tsc_sync.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 3bdb983fd11b..26ba0530a8a7 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -177,10 +177,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) * Compare the boot value and complain if it differs in the * package. */ - if (bootval != ref->bootval) { - pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", - refcpu, ref->bootval, cpu, bootval); - } + if (bootval != ref->bootval) + printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n"); + /* * The TSC_ADJUST values in a package must be the same. If the boot * value on this newly upcoming CPU differs from the adjustment * adjusted value. */ if (bootval != ref->adjusted) { - pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n", - refcpu, ref->adjusted, cpu, bootval); cur->adjusted = ref->adjusted; wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); } From 6c66350d0a482892793b888b07c1177fc6d4b344 Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Thu, 12 Oct 2017 11:32:05 -0500 Subject: [PATCH 0375/1085] x86/tsc: Provide a means to disable TSC ART On systems where multiple chassis are reset asynchronously, and thus the TSC counters are started asynchronously, the offset needed to convert TSC to ART would be different. Disable ART in that case and rely on the TSC counters to supply the accurate time. Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Dimitri Sivanich Cc: Russ Anderson Cc: Andrew Banman Cc: Peter Zijlstra Cc: Bin Gao Link: https://lkml.kernel.org/r/20171012163202.289397994@stormcage.americas.sgi.com --- arch/x86/kernel/tsc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 896dbe31b407..f1326c0422c1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -962,10 +962,14 @@ static void detect_art(void) if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; - /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ + /* + * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, + * and the TSC counter resets must not occur asynchronously. + */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || + tsc_async_resets) return; cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, From 97d21003df3e7504c899b1701546f18ff475966f Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Thu, 12 Oct 2017 11:32:06 -0500 Subject: [PATCH 0376/1085] x86/platform/UV: Add check of TSC state set by UV BIOS Insert a check early in UV system startup that determines whether the BIOS was able to obtain satisfactory TSC sync stability. If not, the failure is usually caused by an error in the external TSC clock generation source. In this case the best fallback is to use the builtin hardware RTC, as the kernel will not be able to set an accurate TSC sync either.
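As a worked example of the decode implemented below: on hubs other than UV1 and UV2 the sync state is the two-bit field at bit 10 of the scratch MMR, so a raw value with bits 10 and 11 set gives (mmr >> 10) & UVH_TSC_SYNC_MASK == 3 == UVH_TSC_SYNC_VALID and the kernel only calls mark_tsc_async_resets(); a field value of 2 (UVH_TSC_SYNC_INVALID) instead leads to mark_tsc_unstable() and a fallback to another clocksource.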
Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Reviewed-by: Dimitri Sivanich Reviewed-by: Russ Anderson Reviewed-by: Andrew Banman Cc: Prarit Bhargava Cc: Andrew Banman Cc: Peter Zijlstra Cc: Bin Gao Link: https://lkml.kernel.org/r/20171012163202.406294490@stormcage.americas.sgi.com --- arch/x86/include/asm/uv/uv_hub.h | 23 ++++++++++++---- arch/x86/kernel/apic/x2apic_uv_x.c | 43 ++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 9cffb44a3cf5..036e26d63d9a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -776,23 +776,36 @@ static inline int uv_num_possible_blades(void) extern void uv_nmi_setup(void); extern void uv_nmi_setup_hubless(void); +/* BIOS/Kernel flags exchange MMR */ +#define UVH_BIOS_KERNEL_MMR UVH_SCRATCH5 +#define UVH_BIOS_KERNEL_MMR_ALIAS UVH_SCRATCH5_ALIAS +#define UVH_BIOS_KERNEL_MMR_ALIAS_2 UVH_SCRATCH5_ALIAS_2 + +/* TSC sync valid, set by BIOS */ +#define UVH_TSC_SYNC_MMR UVH_BIOS_KERNEL_MMR +#define UVH_TSC_SYNC_SHIFT 10 +#define UVH_TSC_SYNC_SHIFT_UV2K 16 /* UV2/3k have different bits */ +#define UVH_TSC_SYNC_MASK 3 /* 0011 */ +#define UVH_TSC_SYNC_VALID 3 /* 0011 */ +#define UVH_TSC_SYNC_INVALID 2 /* 0010 */ + /* BMC sets a bit this MMR non-zero before sending an NMI */ -#define UVH_NMI_MMR UVH_SCRATCH5 -#define UVH_NMI_MMR_CLEAR UVH_SCRATCH5_ALIAS +#define UVH_NMI_MMR UVH_BIOS_KERNEL_MMR +#define UVH_NMI_MMR_CLEAR UVH_BIOS_KERNEL_MMR_ALIAS #define UVH_NMI_MMR_SHIFT 63 -#define UVH_NMI_MMR_TYPE "SCRATCH5" +#define UVH_NMI_MMR_TYPE "SCRATCH5" /* Newer SMM NMI handler, not present in all systems */ #define UVH_NMI_MMRX UVH_EVENT_OCCURRED0 #define UVH_NMI_MMRX_CLEAR UVH_EVENT_OCCURRED0_ALIAS #define UVH_NMI_MMRX_SHIFT UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT -#define UVH_NMI_MMRX_TYPE "EXTIO_INT0" +#define UVH_NMI_MMRX_TYPE "EXTIO_INT0" /* Non-zero indicates newer SMM NMI handler present */ #define UVH_NMI_MMRX_SUPPORTED UVH_EXTIO_INT0_BROADCAST /* Indicates to BIOS that we want to use the newer SMM NMI handler */ -#define UVH_NMI_MMRX_REQ UVH_SCRATCH5_ALIAS_2 +#define UVH_NMI_MMRX_REQ UVH_BIOS_KERNEL_MMR_ALIAS_2 #define UVH_NMI_MMRX_REQ_SHIFT 62 struct uv_hub_nmi_s { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0d57bb9079c9..440825478bbd 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -154,6 +154,48 @@ static int __init early_get_pnodeid(void) return pnode; } +static void uv_tsc_check_sync(void) +{ + u64 mmr; + int sync_state; + int mmr_shift; + char *state; + bool valid; + + /* Accommodate different UV arch BIOSes */ + mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); + mmr_shift = + is_uv1_hub() ? 0 : + is_uv2_hub() ? 
UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; + if (mmr_shift) + sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; + else + sync_state = 0; + + switch (sync_state) { + case UVH_TSC_SYNC_VALID: + state = "in sync"; + valid = true; + break; + + case UVH_TSC_SYNC_INVALID: + state = "unstable"; + valid = false; + break; + default: + state = "unknown: assuming valid"; + valid = true; + break; + } + pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state); + + /* Mark flag that says TSC != 0 is valid for socket 0 */ + if (valid) + mark_tsc_async_resets("UV BIOS"); + else + mark_tsc_unstable("UV BIOS"); +} + /* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ #define SMT_LEVEL 0 /* Leaf 0xb SMT level */ @@ -288,6 +330,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) } pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); + uv_tsc_check_sync(); return uv_apic; From c0fc9b1350a317da22b310d68117b0d01cb9065e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Oct 2017 08:49:37 +0200 Subject: [PATCH 0377/1085] x86/tsc: Make CONFIG_X86_TSC=n build work again tsc_async_resets is only available when CONFIG_X86_TSC=y. So a build with CONFIG_X86_TSC=n breaks: arch/x86/kernel/tsc.o: In function `tsc_init': (.init.text+0x87b): undefined reference to `tsc_async_resets' Add a stub define for the TSC=n case. Side note: This config switch should simply be removed. Reported-by: kbuild test robot Fixes: 341102c3ef29 ("x86/tsc: Add option that TSC on Socket 0 being non-zero is valid") Signed-off-by: Thomas Gleixner Cc: Mike Travis --- arch/x86/include/asm/tsc.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 79125f3609c4..c016167720b1 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -42,7 +42,11 @@ extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; +#ifdef CONFIG_X86_TSC extern bool tsc_async_resets; +#else +# define tsc_async_resets false +#endif /* * Boot-time check whether the TSCs are synchronized across From 0696d059f23c05f2dbc3b19ef50e5bdd175b782b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Oct 2017 16:16:19 +0200 Subject: [PATCH 0378/1085] x86/vector: Use correct per cpu variable in free_moved_vector() free_moved_vector() accesses the per cpu vector array with this_cpu_write() to clear the vector. The function has two call sites: 1) The vector cleanup IPI 2) The force_complete_move() code path For #1 this_cpu_write() is correct as it runs on the CPU on which the vector needs to be freed. For #2 this_cpu_write() is wrong because the function is called from an outgoing CPU which is not necessarily the CPU on which the previous vector needs to be freed. As a result it sets the vector on the outgoing CPU to NULL, which is pointless as that CPU does not handle interrupts anymore. What's worse is that it leaves the vector on the previous target CPU in place which later on triggers the BUG_ON(vector) in the vector allocation code when the vector gets reused. That's possible because the bitmap allocator entry of that CPU is freed correctly. Always use the CPU to which the vector was associated and clear the vector entry on that CPU. Fixup the tracepoint as well so it tracks on which CPU the vector gets removed. 
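A toy model (plain C, not kernel code; the names are made up) of why the write must target the owning CPU's table rather than the CPU executing the cleanup:

	#define NR_CPUS_MODEL	8
	#define NR_VECTORS	256

	static void *vector_table[NR_CPUS_MODEL][NR_VECTORS]; /* one table per CPU */

	static void free_vector(unsigned int owner_cpu, unsigned int vector)
	{
		/*
		 * Buggy variant: vector_table[smp_processor_id()][vector] = 0;
		 * clears the wrong table when run on an outgoing CPU. The fix
		 * indexes by the CPU the vector was associated with, which is
		 * what per_cpu(vector_irq, cpu)[vector] does below:
		 */
		vector_table[owner_cpu][vector] = 0;
	}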
Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") Reported-by: Petri Latvala Signed-off-by: Thomas Gleixner Cc: Juergen Gross Cc: Tony Luck Cc: Len Brown Cc: Marc Zyngier Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Rui Zhang Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Boris Ostrovsky Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Alok Kataria Cc: Dan Williams Cc: Yu Chen Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710161614430.1973@nanos --- arch/x86/include/asm/trace/irq_vectors.h | 12 ++++++++---- arch/x86/kernel/apic/vector.c | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index bc09c5cf6390..bfd480b827f5 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -360,24 +360,28 @@ TRACE_EVENT(vector_setup, TRACE_EVENT(vector_free_moved, - TP_PROTO(unsigned int irq, unsigned int vector, bool is_managed), + TP_PROTO(unsigned int irq, unsigned int cpu, unsigned int vector, + bool is_managed), - TP_ARGS(irq, vector, is_managed), + TP_ARGS(irq, cpu, vector, is_managed), TP_STRUCT__entry( __field( unsigned int, irq ) + __field( unsigned int, cpu ) __field( unsigned int, vector ) __field( bool, is_managed ) ), TP_fast_assign( __entry->irq = irq; + __entry->cpu = cpu; __entry->vector = vector; __entry->is_managed = is_managed; ), - TP_printk("irq=%u vector=%u is_managed=%d", - __entry->irq, __entry->vector, __entry->is_managed) + TP_printk("irq=%u cpu=%u vector=%u is_managed=%d", + __entry->irq, __entry->cpu, __entry->vector, + __entry->is_managed) ); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 573538e0981e..05c85e693a5d 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -797,9 +797,9 @@ static void free_moved_vector(struct apic_chip_data *apicd) */ WARN_ON_ONCE(managed); - trace_vector_free_moved(apicd->irq, vector, managed); + trace_vector_free_moved(apicd->irq, cpu, vector, managed); irq_matrix_free(vector_matrix, cpu, vector, managed); - __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; hlist_del_init(&apicd->clist); apicd->prev_vector = 0; apicd->move_in_progress = 0; From cbe96375025e14fc76f9ed42ee5225120d7210f8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:41 -0700 Subject: [PATCH 0379/1085] bitops: Add clear/set_bit32() to linux/bitops.h Add two simple wrappers around set_bit/clear_bit() that accept the common case of a u32 array. This avoids writing casts in all callers. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-2-andi@firstfloor.org Signed-off-by: Ingo Molnar --- include/linux/bitops.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 8fbe259b197c..36794f058ba6 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -227,6 +227,32 @@ static inline unsigned long __ffs64(u64 word) return __ffs((unsigned long)word); } +/* + * clear_bit32 - Clear a bit in memory for u32 array + * @nr: Bit to clear + * @addr: u32 * address of bitmap + * + * Same as clear_bit, but avoids needing casts for u32 arrays.
+ */ + +static __always_inline void clear_bit32(long nr, volatile u32 *addr) +{ + clear_bit(nr, (volatile unsigned long *)addr); +} + +/* + * set_bit32 - Set a bit in memory for u32 array + * @nr: Bit to set + * @addr: u32 * address of bitmap + * + * Same as set_bit, but avoids needing casts for u32 arrays. + */ + +static __always_inline void set_bit32(long nr, volatile u32 *addr) +{ + set_bit(nr, (volatile unsigned long *)addr); +} + #ifdef __KERNEL__ #ifndef set_mask_bits From 0b00de857a648dafe7020878c7a27cf776f5edf4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:42 -0700 Subject: [PATCH 0380/1085] x86/cpuid: Add generic table for CPUID dependencies Some CPUID features depend on other features. Currently it's possible to clear dependent features, but not clear the base features, which can cause various interesting problems. This patch implements a generic table to describe dependencies between CPUID features, to be used by all code that clears CPUID. Some subsystems (like XSAVE) had their own implementation of this, but it's better to do it all in a single place for everyone. Then clear_cpu_cap and setup_clear_cpu_cap always look up this table and clear all dependencies too. This is intended to be a practical table: only for features that make sense to clear. If someone for example clears FPU, or other features that are essentially part of the required base feature set, not much is going to work. Handling that is right now out of scope. We're only handling features which can be usefully cleared. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Jonathan McDowell Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-3-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 9 +-- arch/x86/include/asm/cpufeatures.h | 5 ++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 113 +++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 arch/x86/kernel/cpu/cpuid-deps.c diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d59c15c3defd..225fd8374fae 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) -#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) -#define setup_clear_cpu_cap(bit) do { \ - clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cpu_caps_cleared); \ -} while (0) + +extern void setup_clear_cpu_cap(unsigned int bit); +extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); + #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2519c6c801c9..401a70992060 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -21,6 +21,11 @@ * this feature bit is not displayed in /proc/cpuinfo at all.
*/ +/* + * When adding new features here that depend on other features, + * please update the table in kernel/cpu/cpuid-deps.c + */ + /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e17942c131c8..de260fae1017 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -22,6 +22,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c new file mode 100644 index 000000000000..e48eb7313120 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,113 @@ +/* Declare dependencies between CPUIDs */ +#include +#include +#include +#include + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. + */ +const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + {} +}; + +static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) +{ + clear_bit32(bit, c->x86_capability); +} + +static inline void __setup_clear_cpu_cap(unsigned int bit) +{ + clear_cpu_cap(&boot_cpu_data, bit); + set_bit32(bit, cpu_caps_cleared); +} + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + if (!c) + __setup_clear_cpu_cap(feature); + else + __clear_cpu_cap(c, feature); +} + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int 
feature) +{ + bool changed; + DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); + const struct cpuid_dep *d; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} From 0c2a3913d6f50503f7c59d83a6219e39508cc898 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:43 -0700 Subject: [PATCH 0381/1085] x86/fpu: Parse clearcpuid= as early XSAVE argument With a follow-on patch we want to make clearcpuid affect the XSAVE configuration. But xsave is currently initialized before arguments are parsed. Move the clearcpuid= parsing into the special early xsave argument parsing code. Since clearcpuid= contains a = we need to keep the old __setup around as a dummy, otherwise it would end up as an environment variable in init's environment. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-4-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 16 +++++++--------- arch/x86/kernel/fpu/init.c | 11 +++++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9176bae7fd8..03bb004bb15e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1301,18 +1301,16 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(")\n"); } -static __init int setup_disablecpuid(char *arg) +/* + * clearcpuid= was already parsed in fpu__init_parse_early_param. + * But we need to keep a dummy __setup around otherwise it would + * show up as an environment variable for init.
+ */ +static __init int setup_clearcpuid(char *arg) { - int bit; - - if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; } -__setup("clearcpuid=", setup_disablecpuid); +__setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(union irq_stack_union, diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7affb7e3d9a5..6abd83572b01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { + char arg[32]; + char *argptr = arg; + int bit; + if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option(boot_command_line, "clearcpuid", arg, + sizeof(arg)) && + get_option(&argptr, &bit) && + bit >= 0 && + bit < NCAPINTS * 32) + setup_clear_cpu_cap(bit); } /* From ccb18db2ab9d923df07e7495123fe5fb02329713 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:44 -0700 Subject: [PATCH 0382/1085] x86/fpu: Make XSAVE check the base CPUID features before enabling Before enabling XSAVE, check not only the XSAVE-specific CPUID bits, but also the base CPUID features of the respective XSAVE feature. This allows disabling individual XSAVE states using the existing clearcpuid= option, which can be useful for performance testing and debugging, and also in general avoids inconsistencies. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-5-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f1d5476c9022..fb581292975b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -15,6 +15,7 @@ #include #include +#include /* * Although we spell it out in here, the Processor Trace @@ -36,6 +37,19 @@ static const char *xfeature_names[] = "unknown xstate feature" , }; +static short xsave_cpuid_features[] __initdata = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, +}; + /* * Mask of xstate features supported by the CPU and the kernel: */ @@ -726,6 +740,7 @@ void __init fpu__init_system_xstate(void) unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; int err; + int i; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -759,6 +774,14 @@ void __init fpu__init_system_xstate(void) goto out_disable; } + /* + * Clear XSAVE features that are disabled in the normal CPUID.
+ */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!boot_cpu_has(xsave_cpuid_features[i])) + xfeatures_mask &= ~BIT(i); + } + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ From 73e3a7d2a7c3be29a5a22b85026f6cfa5664267f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:45 -0700 Subject: [PATCH 0383/1085] x86/fpu: Remove the explicit clearing of XSAVE dependent features Clearing a CPU feature with setup_clear_cpu_cap() clears all features which depend on it. Expressing feature dependencies in one place is easier to maintain than keeping functions like fpu__xstate_clear_all_cpu_caps() up to date. The features which depend on XSAVE have their dependency expressed in the dependency table, so it's sufficient to clear X86_FEATURE_XSAVE. Remove the explicit clearing of XSAVE dependent features. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-6-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fb581292975b..87a57b7642d3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -73,26 +73,6 @@ unsigned int fpu_user_xstate_size; void fpu__xstate_clear_all_cpu_caps(void) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); - setup_clear_cpu_cap(X86_FEATURE_AVX512BW); - setup_clear_cpu_cap(X86_FEATURE_AVX512VL); - setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_XGETBV1); - setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); - setup_clear_cpu_cap(X86_FEATURE_PKU); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); - setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* From fe460423438b62eb7440d994ab19a9f444e6280d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 20:29:38 +0200 Subject: [PATCH 0384/1085] posix-stubs: Use get_timespec64() and put_timespec64() This is a follow-up to commit 5c4994102fb5 ("posix-timers: Use get_timespec64() and put_timespec64()"), which left two system calls using copy_from_user()/copy_to_user(). Change them as well for consistency.
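For context, the helper replaces the open-coded copy-and-convert sequence; its effect is roughly this (simplified sketch with a made-up name, the real helper lives in the time core and also covers compat layouts):

	static int get_timespec64_sketch(struct timespec64 *ts,
					 const struct timespec __user *uts)
	{
		struct timespec kts;

		if (copy_from_user(&kts, uts, sizeof(kts)))
			return -EFAULT;

		ts->tv_sec = kts.tv_sec;
		ts->tv_nsec = kts.tv_nsec;
		return 0;
	}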
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Acked-by: Nicolas Pitre Cc: y2038@lists.linaro.org Cc: John Stultz Cc: Al Viro Cc: Deepa Dinamani Link: https://lkml.kernel.org/r/20171013183009.3442318-1-arnd@arndb.de --- kernel/time/posix-stubs.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 06f34feb635e..b258bee13b02 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -117,8 +117,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct timespec64 t64; - struct timespec t; + struct timespec64 t; switch (which_clock) { case CLOCK_REALTIME: @@ -129,16 +128,15 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; } - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + if (get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -203,8 +201,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { - struct timespec64 t64; - struct timespec t; + struct timespec64 t; switch (which_clock) { case CLOCK_REALTIME: @@ -215,16 +212,15 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; } - if (compat_get_timespec(&t, rqtp)) + if (compat_get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } From 4eb1bca1793385b8caff4b2e1f19b31a013dd1ec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 20:34:35 +0200 Subject: [PATCH 0385/1085] time: Use do_settimeofday64() internally do_settimeofday() is a wrapper around do_settimeofday64(), so that function can be called directly. The wrapper can be removed once the last user is gone. 
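The wrapper being bypassed is essentially this (simplified from the timekeeping headers of this era):

	static inline int do_settimeofday(const struct timespec *ts)
	{
		struct timespec64 ts64 = timespec_to_timespec64(*ts);

		return do_settimeofday64(&ts64);
	}

Calling do_settimeofday64() directly lets these syscalls use the year-2038-safe struct timespec64 end to end.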
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: y2038@lists.linaro.org Cc: Frederic Weisbecker Cc: Stephen Boyd Cc: John Stultz Cc: Al Viro Cc: Deepa Dinamani Link: https://lkml.kernel.org/r/20171013183452.3635956-1-arnd@arndb.de --- kernel/time/time.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/time/time.c b/kernel/time/time.c index 44a8c1402133..cfe3d3e4679f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -82,7 +82,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc) SYSCALL_DEFINE1(stime, time_t __user *, tptr) { - struct timespec tv; + struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) @@ -90,11 +90,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) tv.tv_nsec = 0; - err = security_settime(&tv, NULL); + err = security_settime64(&tv, NULL); if (err) return err; - do_settimeofday(&tv); + do_settimeofday64(&tv); return 0; } @@ -122,7 +122,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) { - struct timespec tv; + struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) @@ -130,11 +130,11 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) tv.tv_nsec = 0; - err = security_settime(&tv, NULL); + err = security_settime64(&tv, NULL); if (err) return err; - do_settimeofday(&tv); + do_settimeofday64(&tv); return 0; } From 3c557df67257c114401f18ee412f0b74091c3c6f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Oct 2017 17:10:32 -0700 Subject: [PATCH 0386/1085] timer: Remove meaningless .data/.function assignments Several timer users needlessly reset their .function/.data fields during their timer callback, but nothing else changes them. Some users do not use their .data field at all. Each instance is removed here. 
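For illustration, the idiom these drivers are left with: setup_timer() fixes .function/.data once, so a periodic callback only has to re-arm itself (sketch with made-up names, in the pre-timer_setup() style used here):

	static struct timer_list poll_timer;

	static void poll_fn(unsigned long data)
	{
		/* ... do the periodic work ...; no .function/.data writes needed */
		mod_timer(&poll_timer, jiffies + HZ);
	}

	static void poll_start(void)
	{
		setup_timer(&poll_timer, poll_fn, 0);
		mod_timer(&poll_timer, jiffies + HZ);
	}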
Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: Greg Kroah-Hartman # for staging Acked-by: Krzysztof Halasa # for wan/hdlc* Acked-by: Jens Axboe # for amiflop Cc: devel@driverdev.osuosl.org Cc: netdev@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: Jens Axboe Cc: Ganesh Krishna Cc: Aditya Shankar Link: https://lkml.kernel.org/r/20171010001032.GA119829@beast --- drivers/block/amiflop.c | 3 +-- drivers/net/wan/hdlc_cisco.c | 2 -- drivers/net/wan/hdlc_fr.c | 2 -- drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 4 +--- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 49908c74bfcb..4e3fb9f104af 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -323,7 +323,7 @@ static void fd_deselect (int drive) } -static void motor_on_callback(unsigned long nr) +static void motor_on_callback(unsigned long ignored) { if (!(ciaa.pra & DSKRDY) || --on_attempts == 0) { complete_all(&motor_on_completion); @@ -344,7 +344,6 @@ static int fd_motor_on(int nr) fd_select(nr); reinit_completion(&motor_on_completion); - motor_on_timer.data = nr; mod_timer(&motor_on_timer, jiffies + HZ/2); on_attempts = 10; diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c index a408abc25512..f4b0ab34f048 100644 --- a/drivers/net/wan/hdlc_cisco.c +++ b/drivers/net/wan/hdlc_cisco.c @@ -276,8 +276,6 @@ static void cisco_timer(unsigned long arg) spin_unlock(&st->lock); st->timer.expires = jiffies + st->settings.interval * HZ; - st->timer.function = cisco_timer; - st->timer.data = arg; add_timer(&st->timer); } diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index 78596e42a3f3..07f265fa2826 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -644,8 +644,6 @@ static void fr_timer(unsigned long arg) state(hdlc)->settings.t391 * HZ; } - state(hdlc)->timer.function = fr_timer; - state(hdlc)->timer.data = arg; add_timer(&state(hdlc)->timer); } diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c index ac5aaafa461c..60f088babf27 100644 --- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c +++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c @@ -266,7 +266,7 @@ static void update_scan_time(void) last_scanned_shadow[i].time_scan = jiffies; } -static void remove_network_from_shadow(unsigned long arg) +static void remove_network_from_shadow(unsigned long unused) { unsigned long now = jiffies; int i, j; @@ -287,7 +287,6 @@ static void remove_network_from_shadow(unsigned long arg) } if (last_scanned_cnt != 0) { - hAgingTimer.data = arg; mod_timer(&hAgingTimer, jiffies + msecs_to_jiffies(AGING_TIME)); } } @@ -304,7 +303,6 @@ static int is_network_in_shadow(struct network_info *pstrNetworkInfo, int i; if (last_scanned_cnt == 0) { - hAgingTimer.data = (unsigned long)user_void; mod_timer(&hAgingTimer, jiffies + msecs_to_jiffies(AGING_TIME)); state = -1; } else { From b93ab338f7f0e39321b282d694a52736fdab172b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 14:56:42 -0700 Subject: [PATCH 0387/1085] libata: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
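The general shape of such a conversion, on a hypothetical driver structure (the timer must be embedded so from_timer() can recover its container):

	struct foo_port {			/* hypothetical, like ata_port above */
		struct timer_list timer;
		int pending;
	};

	static void foo_timer_fn(struct timer_list *t)
	{
		/* from_timer() is container_of() keyed on the timer member */
		struct foo_port *fp = from_timer(fp, t, timer);

		fp->pending = 0;
	}

	static void foo_port_init(struct foo_port *fp)
	{
		timer_setup(&fp->timer, foo_timer_fn, 0);
		mod_timer(&fp->timer, jiffies + HZ);
	}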
Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: Tejun Heo Cc: linux-ide@vger.kernel.org Link: https://lkml.kernel.org/r/20171005004842.GA23011@beast --- drivers/ata/libata-core.c | 5 ++--- drivers/ata/libata-eh.c | 4 ++-- drivers/ata/libata.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index ee4c1ec9dca0..b8ac4902d312 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5979,9 +5979,8 @@ struct ata_port *ata_port_alloc(struct ata_host *host) INIT_LIST_HEAD(&ap->eh_done_q); init_waitqueue_head(&ap->eh_wait_q); init_completion(&ap->park_req_pending); - setup_deferrable_timer(&ap->fastdrain_timer, - ata_eh_fastdrain_timerfn, - (unsigned long)ap); + timer_setup(&ap->fastdrain_timer, ata_eh_fastdrain_timerfn, + TIMER_DEFERRABLE); ap->cbl = ATA_CBL_NONE; diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index e4effef0c83f..ece6fd91a947 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -879,9 +879,9 @@ static int ata_eh_nr_in_flight(struct ata_port *ap) return nr; } -void ata_eh_fastdrain_timerfn(unsigned long arg) +void ata_eh_fastdrain_timerfn(struct timer_list *t) { - struct ata_port *ap = (void *)arg; + struct ata_port *ap = from_timer(ap, t, fastdrain_timer); unsigned long flags; int cnt; diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 839d487394b7..08a245b76417 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -154,7 +154,7 @@ extern void ata_internal_cmd_timed_out(struct ata_device *dev, u8 cmd); extern void ata_eh_acquire(struct ata_port *ap); extern void ata_eh_release(struct ata_port *ap); extern void ata_scsi_error(struct Scsi_Host *host); -extern void ata_eh_fastdrain_timerfn(unsigned long arg); +extern void ata_eh_fastdrain_timerfn(struct timer_list *t); extern void ata_qc_schedule_eh(struct ata_queued_cmd *qc); extern void ata_dev_disable(struct ata_device *dev); extern void ata_eh_detach_dev(struct ata_device *dev); From 376f3bcebdc999cc737d9052109cc33b573b3a8b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 16:22:31 -0700 Subject: [PATCH 0388/1085] x86/platform/UV: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Dimitri Sivanich Cc: Russ Anderson Cc: Mike Travis Link: https://lkml.kernel.org/r/20171016232231.GA100493@beast --- arch/x86/kernel/apic/x2apic_uv_x.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0d57bb9079c9..c0b694810ff4 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -920,9 +920,8 @@ static __init void uv_rtc_init(void) /* * percpu heartbeat timer */ -static void uv_heartbeat(unsigned long ignored) +static void uv_heartbeat(struct timer_list *timer) { - struct timer_list *timer = &uv_scir_info->timer; unsigned char bits = uv_scir_info->state; /* Flip heartbeat bit: */ @@ -947,7 +946,7 @@ static int uv_heartbeat_enable(unsigned int cpu) struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - setup_pinned_timer(timer, uv_heartbeat, cpu); + timer_setup(timer, uv_heartbeat, TIMER_PINNED); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; From ad92ceaf35822ebc2643d65ebba92d6dff18ae33 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sun, 15 Oct 2017 17:03:12 +0800 Subject: [PATCH 0389/1085] regulator: axp20x: Simplify axp20x_is_polyphase_slave implementation The code to handle the AXP803_ID and AXP813_ID cases is exactly the same. Make the switch-case fall through to avoid duplicate code. Signed-off-by: Axel Lin Signed-off-by: Mark Brown --- drivers/regulator/axp20x-regulator.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index e1761df4cbfd..181622b2813d 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -657,6 +657,7 @@ static bool axp20x_is_polyphase_slave(struct axp20x_dev *axp20x, int id) */ switch (axp20x->variant) { case AXP803_ID: + case AXP813_ID: regmap_read(axp20x->regmap, AXP803_POLYPHASE_CTRL, &reg); switch (id) { @@ -681,17 +682,6 @@ static bool axp20x_is_polyphase_slave(struct axp20x_dev *axp20x, int id) } break; - case AXP813_ID: - regmap_read(axp20x->regmap, AXP803_POLYPHASE_CTRL, &reg); - - switch (id) { - case AXP803_DCDC3: - return !!(reg & BIT(6)); - case AXP803_DCDC6: - return !!(reg & BIT(5)); - } - break; - default: return false; } From 7c3eaaa3917d8b5491f58ea263bf6e719fd3155f Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Tue, 17 Oct 2017 12:28:08 +0200 Subject: [PATCH 0390/1085] s390/kexec: Fix checksum validation return code for kdump Before kexec boots to a crash kernel it checks whether the image in memory changed after load. This is done by the function kdump_csum_valid, which returns true, i.e. an int != 0, on success and 0 otherwise. In other words when kdump_csum_valid returns an error code it means that the validation succeeded. This is not only counterintuitive but also produces the wrong result if the kernel was built without CONFIG_CRASH_DUMP. Fix this by making kdump_csum_valid return a bool.
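The inverted contract is easiest to see from a call site (hypothetical caller; the convention is that nonzero/true means the checksums are valid):

	if (image->type == KEXEC_TYPE_CRASH && !kdump_csum_valid(image))
		return;		/* refuse to run a modified crash kernel */

The old !CONFIG_CRASH_DUMP stub returned -EINVAL, which is nonzero and therefore read as "valid" by such a check; returning a bool makes the stub's false unambiguous.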
Signed-off-by: Philipp Rudo Acked-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/machine_kexec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 3d0b14afa232..51e8c63705a9 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -144,7 +144,7 @@ static noinline void __machine_kdump(void *image) /* * Check if kdump checksums are valid: We call purgatory with parameter "0" */ -static int kdump_csum_valid(struct kimage *image) +static bool kdump_csum_valid(struct kimage *image) { #ifdef CONFIG_CRASH_DUMP int (*start_kdump)(int) = (void *)image->start; @@ -153,9 +153,9 @@ static int kdump_csum_valid(struct kimage *image) __arch_local_irq_stnsm(0xfb); /* disable DAT */ rc = start_kdump(0); __arch_local_irq_stosm(0x04); /* enable DAT */ - return rc ? 0 : -EINVAL; + return rc == 0; #else - return -EINVAL; + return false; #endif } From 94158e544fd60c6a94af348790dae76578ed8dae Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 9 Oct 2017 17:49:38 +0200 Subject: [PATCH 0391/1085] s390/debug: improve debug_event debug_event currently truncates the data if used with a size larger than the buf_size of the debug feature. For lots of callers of this function, wrappers have been implemented that loop until all data is handled. Move that functionality into debug_event_common and get rid of the wrappers. Signed-off-by: Sebastian Ott Acked-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci_debug.h | 6 +----- arch/s390/kernel/debug.c | 28 ++++++++++++++++++---------- drivers/s390/block/scm_blk.h | 8 +------- drivers/s390/cio/chsc_sch.c | 6 +----- drivers/s390/cio/cio_debug.h | 8 +------- drivers/s390/cio/eadm_sch.c | 8 +------- drivers/s390/cio/qdio_debug.h | 18 +++--------------- 7 files changed, 26 insertions(+), 56 deletions(-) diff --git a/arch/s390/include/asm/pci_debug.h b/arch/s390/include/asm/pci_debug.h index ac24b26fc065..773ff1352a96 100644 --- a/arch/s390/include/asm/pci_debug.h +++ b/arch/s390/include/asm/pci_debug.h @@ -18,11 +18,7 @@ extern debug_info_t *pci_debug_err_id; static inline void zpci_err_hex(void *addr, int len) { - while (len > 0) { - debug_event(pci_debug_err_id, 0, (void *) addr, len); - len -= pci_debug_err_id->buf_size; - addr += pci_debug_err_id->buf_size; - } + debug_event(pci_debug_err_id, 0, addr, len); } #endif diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index c960797c8a6f..df916738fbd8 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -900,12 +900,16 @@ debug_entry_t *debug_event_common(debug_info_t *id, int level, const void *buf, } else { spin_lock_irqsave(&id->lock, flags); } - active = get_active_entry(id); - memset(DEBUG_DATA(active), 0, id->buf_size); - memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); - debug_finish_entry(id, active, level, 0); - spin_unlock_irqrestore(&id->lock, flags); + do { + active = get_active_entry(id); + memset(DEBUG_DATA(active), 0, id->buf_size); + memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); + debug_finish_entry(id, active, level, 0); + len -= id->buf_size; + buf += id->buf_size; + } while (len > 0); + spin_unlock_irqrestore(&id->lock, flags); return active; } EXPORT_SYMBOL(debug_event_common); @@ -928,12 +932,16 @@ debug_entry_t *debug_exception_common(debug_info_t *id, int level, } else { spin_lock_irqsave(&id->lock, flags); } - active = get_active_entry(id); - 
memset(DEBUG_DATA(active), 0, id->buf_size); - memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); - debug_finish_entry(id, active, level, 1); - spin_unlock_irqrestore(&id->lock, flags); + do { + active = get_active_entry(id); + memset(DEBUG_DATA(active), 0, id->buf_size); + memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); + debug_finish_entry(id, active, level, len <= id->buf_size); + len -= id->buf_size; + buf += id->buf_size; + } while (len > 0); + spin_unlock_irqrestore(&id->lock, flags); return active; } EXPORT_SYMBOL(debug_exception_common); diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index 71288dd9dd7f..f9338d20858b 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -55,13 +55,7 @@ extern debug_info_t *scm_debug; static inline void SCM_LOG_HEX(int level, void *data, int length) { - if (!debug_level_enabled(scm_debug, level)) - return; - while (length > 0) { - debug_event(scm_debug, level, data, length); - length -= scm_debug->buf_size; - data += scm_debug->buf_size; - } + debug_event(scm_debug, level, data, length); } static inline void SCM_LOG_STATE(int level, struct scm_device *scmdev) diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c index 735052ecd3e5..8e7e19b9e92c 100644 --- a/drivers/s390/cio/chsc_sch.c +++ b/drivers/s390/cio/chsc_sch.c @@ -43,11 +43,7 @@ static DEFINE_MUTEX(on_close_mutex); static void CHSC_LOG_HEX(int level, void *data, int length) { - while (length > 0) { - debug_event(chsc_debug_log_id, level, data, length); - length -= chsc_debug_log_id->buf_size; - data += chsc_debug_log_id->buf_size; - } + debug_event(chsc_debug_log_id, level, data, length); } MODULE_AUTHOR("IBM Corporation"); diff --git a/drivers/s390/cio/cio_debug.h b/drivers/s390/cio/cio_debug.h index e64e8278c42e..9fead866163f 100644 --- a/drivers/s390/cio/cio_debug.h +++ b/drivers/s390/cio/cio_debug.h @@ -22,13 +22,7 @@ extern debug_info_t *cio_debug_crw_id; static inline void CIO_HEX_EVENT(int level, void *data, int length) { - if (unlikely(!cio_debug_trace_id)) - return; - while (length > 0) { - debug_event(cio_debug_trace_id, level, data, length); - length -= cio_debug_trace_id->buf_size; - data += cio_debug_trace_id->buf_size; - } + debug_event(cio_debug_trace_id, level, data, length); } #endif diff --git a/drivers/s390/cio/eadm_sch.c b/drivers/s390/cio/eadm_sch.c index 0f11f3bcac82..d14795f7110b 100644 --- a/drivers/s390/cio/eadm_sch.c +++ b/drivers/s390/cio/eadm_sch.c @@ -43,13 +43,7 @@ static debug_info_t *eadm_debug; static void EADM_LOG_HEX(int level, void *data, int length) { - if (!debug_level_enabled(eadm_debug, level)) - return; - while (length > 0) { - debug_event(eadm_debug, level, data, length); - length -= eadm_debug->buf_size; - data += eadm_debug->buf_size; - } + debug_event(eadm_debug, level, data, length); } static void orb_init(union orb *orb) diff --git a/drivers/s390/cio/qdio_debug.h b/drivers/s390/cio/qdio_debug.h index 1d595d17bf11..be402da10778 100644 --- a/drivers/s390/cio/qdio_debug.h +++ b/drivers/s390/cio/qdio_debug.h @@ -33,11 +33,7 @@ extern debug_info_t *qdio_dbf_error; static inline void DBF_HEX(void *addr, int len) { - while (len > 0) { - debug_event(qdio_dbf_setup, DBF_ERR, addr, len); - len -= qdio_dbf_setup->buf_size; - addr += qdio_dbf_setup->buf_size; - } + debug_event(qdio_dbf_setup, DBF_ERR, addr, len); } #define DBF_ERROR(text...) 
\ @@ -49,11 +45,7 @@ static inline void DBF_HEX(void *addr, int len) static inline void DBF_ERROR_HEX(void *addr, int len) { - while (len > 0) { - debug_event(qdio_dbf_error, DBF_ERR, addr, len); - len -= qdio_dbf_error->buf_size; - addr += qdio_dbf_error->buf_size; - } + debug_event(qdio_dbf_error, DBF_ERR, addr, len); } #define DBF_DEV_EVENT(level, device, text...) \ @@ -68,11 +60,7 @@ static inline void DBF_ERROR_HEX(void *addr, int len) static inline void DBF_DEV_HEX(struct qdio_irq *dev, void *addr, int len, int level) { - while (len > 0) { - debug_event(dev->debug_area, level, addr, len); - len -= dev->debug_area->buf_size; - addr += dev->debug_area->buf_size; - } + debug_event(dev->debug_area, level, addr, len); } int qdio_allocate_dbf(struct qdio_initialize *init_data, From 0dcd91a9e6cc0a401416bbdd34b0a255cde0aee1 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 12 Oct 2017 13:57:26 +0200 Subject: [PATCH 0392/1085] s390/debug: only write data once debug_event_common memsets the active debug entry with zeros to prevent stale data leakage. This is overwritten with the actual debug data in the next step. Only write zeros to that part of the debug entry that's not used by new debug data. Micro benchmarks show a 2-10% reduction of cpu cycles with this approach. Signed-off-by: Sebastian Ott Acked-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/debug.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index df916738fbd8..2df679d75454 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -902,8 +902,9 @@ debug_entry_t *debug_event_common(debug_info_t *id, int level, const void *buf, } do { active = get_active_entry(id); - memset(DEBUG_DATA(active), 0, id->buf_size); memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); + if (len < id->buf_size) + memset((DEBUG_DATA(active)) + len, 0, id->buf_size - len); debug_finish_entry(id, active, level, 0); len -= id->buf_size; buf += id->buf_size; @@ -934,8 +935,9 @@ debug_entry_t *debug_exception_common(debug_info_t *id, int level, } do { active = get_active_entry(id); - memset(DEBUG_DATA(active), 0, id->buf_size); memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); + if (len < id->buf_size) + memset((DEBUG_DATA(active)) + len, 0, id->buf_size - len); debug_finish_entry(id, active, level, len <= id->buf_size); len -= id->buf_size; buf += id->buf_size; From 76b3b62ade4baa13b270bafb96be730ebff913f0 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 12 Oct 2017 13:42:54 +0200 Subject: [PATCH 0393/1085] s390/dasd: remove unused debug macros Get rid of unused wrapper macros around debug_sprintf_exception. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd_int.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index db470bd10175..99a45859aee6 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -95,14 +95,6 @@ do { \ d_data); \ } while(0) -#define DBF_DEV_EXC(d_level, d_device, d_str, d_data...) 
\ -do { \ - debug_sprintf_exception(d_device->debug_area, \ - d_level, \ - d_str "\n", \ - d_data); \ -} while(0) - #define DBF_EVENT(d_level, d_str, d_data...)\ do { \ debug_sprintf_event(dasd_debug_area, \ @@ -121,14 +113,6 @@ do { \ __dev_id.ssid, __dev_id.devno, d_data); \ } while (0) -#define DBF_EXC(d_level, d_str, d_data...)\ -do { \ - debug_sprintf_exception(dasd_debug_area, \ - d_level,\ - d_str "\n", \ - d_data); \ -} while(0) - /* limit size for an errorstring */ #define ERRORLENGTH 30 From 686140a1a9c41d85a4212a1c26d671139b76404b Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 12 Oct 2017 13:01:47 +0200 Subject: [PATCH 0394/1085] s390: introduce CPU alternatives Implement CPU alternatives, which allows optionally patching newer instructions at runtime, based on CPU facility availability. A new kernel boot parameter "noaltinstr" disables patching. The current implementation is derived from x86 alternatives, although ideal instruction padding (when altinstr is longer than oldinstr) is added at compile time, so no oldinstr nop optimization has to be done at runtime. A couple of compile time sanity checks are also done: 1. oldinstr and altinstr must be <= 254 bytes long, 2. oldinstr and altinstr must not have an odd length. alternative(oldinstr, altinstr, facility); alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2); Both compile time and runtime padding consists of either 6/4/2 bytes nop or a jump (brcl) + 2 bytes nop filler if padding is longer than 6 bytes. .altinstructions and .altinstr_replacement sections are part of __init_begin : __init_end region and are freed after initialization. Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- .../admin-guide/kernel-parameters.txt | 3 + arch/s390/Kconfig | 17 ++ arch/s390/include/asm/alternative.h | 163 ++++++++++++++++++ arch/s390/kernel/Makefile | 1 + arch/s390/kernel/alternative.c | 110 ++++++++++++ arch/s390/kernel/module.c | 17 ++ arch/s390/kernel/setup.c | 3 + arch/s390/kernel/vmlinux.lds.S | 23 +++ 8 files changed, 337 insertions(+) create mode 100644 arch/s390/include/asm/alternative.h create mode 100644 arch/s390/kernel/alternative.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05496622b4ef..464f60c50c36 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2548,6 +2548,9 @@ noalign [KNL,ARM] + noaltinstr [S390] Disables alternative instructions patching + (CPU alternatives feature). + noapic [SMP,APIC] Tells the kernel to not make use of any IOAPICs that may be present in the system. diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5fd4a67c93b3..080a3fcc3cb8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -537,6 +537,22 @@ config ARCH_RANDOM If unsure, say Y. +config ALTERNATIVES + def_bool y + prompt "Patch optimized instructions for running CPU type" + help + When enabled the kernel code is compiled with additional + alternative instruction blocks optimized for newer CPU types. + These alternative instruction blocks are patched at kernel boot + time when the running CPU supports them. This mechanism is used to + optimize some critical code paths (e.g. spinlocks) for newer CPUs + even if the kernel is built to support older machine generations. + + This mechanism can be disabled by appending the "noaltinstr" + option to the kernel command line. + + If unsure, say Y.
+ endmenu menu "Memory setup" @@ -811,6 +827,7 @@ config PFAULT config SHARED_KERNEL bool "VM shared kernel support" depends on !JUMP_LABEL + depends on !ALTERNATIVES help Select this option, if you want to share the text segment of the Linux kernel between different VM guests. This reduces memory diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h new file mode 100644 index 000000000000..6c268f6a51d3 --- /dev/null +++ b/arch/s390/include/asm/alternative.h @@ -0,0 +1,163 @@ +#ifndef _ASM_S390_ALTERNATIVE_H +#define _ASM_S390_ALTERNATIVE_H + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +struct alt_instr { + s32 instr_offset; /* original instruction */ + s32 repl_offset; /* offset to replacement instruction */ + u16 facility; /* facility bit set for replacement */ + u8 instrlen; /* length of original instruction */ + u8 replacementlen; /* length of new instruction */ +} __packed; + +#ifdef CONFIG_ALTERNATIVES +extern void apply_alternative_instructions(void); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +#else +static inline void apply_alternative_instructions(void) {}; +static inline void apply_alternatives(struct alt_instr *start, + struct alt_instr *end) {}; +#endif +/* + * |661: |662: |6620 |663: + * +-----------+---------------------+ + * | oldinstr | oldinstr_padding | + * | +----------+----------+ + * | | | | + * | | >6 bytes |6/4/2 nops| + * | |6 bytes jg-----------> + * +-----------+---------------------+ + * ^^ static padding ^^ + * + * .altinstr_replacement section + * +---------------------+-----------+ + * |6641: |6651: + * | alternative instr 1 | + * +-----------+---------+- - - - - -+ + * |6642: |6652: | + * | alternative instr 2 | padding + * +---------------------+- - - - - -+ + * ^ runtime ^ + * + * .altinstructions section + * +---------------------------------+ + * | alt_instr entries for each | + * | alternative instr | + * +---------------------------------+ + */ + +#define b_altinstr(num) "664"#num +#define e_altinstr(num) "665"#num + +#define e_oldinstr_pad_end "663" +#define oldinstr_len "662b-661b" +#define oldinstr_total_len e_oldinstr_pad_end"b-661b" +#define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b" +#define oldinstr_pad_len(num) \ + "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \ + "((" altinstr_len(num) ")-(" oldinstr_len "))" + +#define INSTR_LEN_SANITY_CHECK(len) \ + ".if " len " > 254\n" \ + "\t.error \"cpu alternatives does not support instructions " \ + "blocks > 254 bytes\"\n" \ + ".endif\n" \ + ".if (" len ") %% 2\n" \ + "\t.error \"cpu alternatives instructions length is odd\"\n" \ + ".endif\n" + +#define OLDINSTR_PADDING(oldinstr, num) \ + ".if " oldinstr_pad_len(num) " > 6\n" \ + "\tjg " e_oldinstr_pad_end "f\n" \ + "6620:\n" \ + "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \ + ".else\n" \ + "\t.fill " oldinstr_pad_len(num) " / 6, 6, 0xc0040000\n" \ + "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \ + "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \ + ".endif\n" + +#define OLDINSTR(oldinstr, num) \ + "661:\n\t" oldinstr "\n662:\n" \ + OLDINSTR_PADDING(oldinstr, num) \ + e_oldinstr_pad_end ":\n" \ + INSTR_LEN_SANITY_CHECK(oldinstr_len) + +#define OLDINSTR_2(oldinstr, num1, num2) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \ + OLDINSTR_PADDING(oldinstr, num2) \ + ".else\n" \ + OLDINSTR_PADDING(oldinstr, num1) \ + ".endif\n" \ + 
e_oldinstr_pad_end ":\n" \ + INSTR_LEN_SANITY_CHECK(oldinstr_len) + +#define ALTINSTR_ENTRY(facility, num) \ + "\t.long 661b - .\n" /* old instruction */ \ + "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ + "\t.word " __stringify(facility) "\n" /* facility bit */ \ + "\t.byte " oldinstr_total_len "\n" /* source len */ \ + "\t.byte " altinstr_len(num) "\n" /* alt instruction len */ + +#define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \ + b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \ + INSTR_LEN_SANITY_CHECK(altinstr_len(num)) + +#ifdef CONFIG_ALTERNATIVES +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, altinstr, facility) \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(altinstr, 1) \ + ".popsection\n" \ + OLDINSTR(oldinstr, 1) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(facility, 1) \ + ".popsection\n" + +#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(altinstr1, 1) \ + ALTINSTR_REPLACEMENT(altinstr2, 2) \ + ".popsection\n" \ + OLDINSTR_2(oldinstr, 1, 2) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(facility1, 1) \ + ALTINSTR_ENTRY(facility2, 2) \ + ".popsection\n" +#else +/* Alternative instructions are disabled, let's put just oldinstr in */ +#define ALTERNATIVE(oldinstr, altinstr, facility) \ + oldinstr "\n" + +#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \ + oldinstr "\n" +#endif + +/* + * Alternative instructions for different CPU types or capabilities. + * + * This allows to use optimized instructions even on generic binary + * kernels. + * + * oldinstr is padded with jump and nops at compile time if altinstr is + * longer. altinstr is padded with jump and nops at run-time during patching. + * + * For non barrier like inlines please define new variants + * without volatile and memory clobber. 
+ */ +#define alternative(oldinstr, altinstr, facility) \ + asm volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory") + +#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \ + asm volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \ + altinstr2, facility2) ::: "memory") + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_ALTERNATIVE_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 7b742ffac7d3..addf61024153 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_ALTERNATIVES) += alternative.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c new file mode 100644 index 000000000000..315986a06cf5 --- /dev/null +++ b/arch/s390/kernel/alternative.c @@ -0,0 +1,110 @@ +#include +#include +#include + +#define MAX_PATCH_LEN (255 - 1) + +static int __initdata_or_module alt_instr_disabled; + +static int __init disable_alternative_instructions(char *str) +{ + alt_instr_disabled = 1; + return 0; +} + +early_param("noaltinstr", disable_alternative_instructions); + +struct brcl_insn { + u16 opc; + s32 disp; +} __packed; + +static u16 __initdata_or_module nop16 = 0x0700; +static u32 __initdata_or_module nop32 = 0x47000000; +static struct brcl_insn __initdata_or_module nop48 = { + 0xc004, 0 +}; + +static const void *nops[] __initdata_or_module = { + &nop16, + &nop32, + &nop48 +}; + +static void __init_or_module add_jump_padding(void *insns, unsigned int len) +{ + struct brcl_insn brcl = { + 0xc0f4, + len / 2 + }; + + memcpy(insns, &brcl, sizeof(brcl)); + insns += sizeof(brcl); + len -= sizeof(brcl); + + while (len > 0) { + memcpy(insns, &nop16, 2); + insns += 2; + len -= 2; + } +} + +static void __init_or_module add_padding(void *insns, unsigned int len) +{ + if (len > 6) + add_jump_padding(insns, len); + else if (len >= 2) + memcpy(insns, nops[len / 2 - 1], len); +} + +static void __init_or_module __apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + struct alt_instr *a; + u8 *instr, *replacement; + u8 insnbuf[MAX_PATCH_LEN]; + + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite previously scanned alternative code. 
+ */ + for (a = start; a < end; a++) { + int insnbuf_sz = 0; + + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; + + if (!test_facility(a->facility)) + continue; + + if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) { + WARN_ONCE(1, "cpu alternatives instructions length is " + "odd, skipping patching\n"); + continue; + } + + memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; + + if (a->instrlen > a->replacementlen) { + add_padding(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + + s390_kernel_write(instr, insnbuf, insnbuf_sz); + } +} + +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + if (!alt_instr_disabled) + __apply_alternatives(start, end); +} + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +void __init apply_alternative_instructions(void) +{ + apply_alternatives(__alt_instructions, __alt_instructions_end); +} diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 1a27f307a920..6d9f73bb4142 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -31,6 +31,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -429,6 +430,22 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { + const Elf_Shdr *s; + char *secstrings; + + if (IS_ENABLED(CONFIG_ALTERNATIVES)) { + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".altinstructions", + secstrings + s->sh_name)) { + /* patch .altinstructions */ + void *aseg = (void *)s->sh_addr; + + apply_alternatives(aseg, aseg + s->sh_size); + } + } + } + jump_label_apply_nops(me); return 0; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b2c9af9b88d5..c07e6d6a91cc 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "entry.h" /* @@ -957,6 +958,8 @@ void __init setup_arch(char **cmdline_p) conmode_default(); set_preferred_console(); + apply_alternative_instructions(); + /* Setup zfcpdump support */ setup_zfcpdump(); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 6e2c42bd1c3b..7c9fcf7cb43d 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -104,6 +104,29 @@ SECTIONS EXIT_DATA } + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + * Note, that it is a part of __init region. + */ + . = ALIGN(8); + .altinstructions : { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + * Note, that it is a part of __init region. + */ + .altinstr_replacement : { + *(.altinstr_replacement) + } + /* early.c uses stsi, which requires page aligned data. */ . 
= ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) From f554be42fd0fd8dd14680c67f2db26b3e7de9670 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 12 Oct 2017 13:01:47 +0200 Subject: [PATCH 0395/1085] s390/spinlock: use cpu alternatives to enable niai instruction Enable niai instruction in the spinlock code at run-time for machines on which facility 49 is available (zEC12 and newer). Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/spinlock.h | 5 ++--- arch/s390/lib/spinlock.c | 9 +++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 09e783d83d5d..709cb6c01db9 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -13,6 +13,7 @@ #include #include #include +#include #define SPINLOCK_LOCKVAL (S390_lowcore.spinlock_lockval) @@ -86,9 +87,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) { typecheck(int, lp->lock); asm volatile( -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - " .long 0xb2fa0070\n" /* NIAI 7 */ -#endif + ALTERNATIVE("", ".long 0xb2fa0070", 49) /* NIAI 7 */ " sth %1,%0\n" : "=Q" (((unsigned short *) &lp->lock)[1]) : "d" (0) : "cc", "memory"); diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 43b0d46c3786..920503174252 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -12,6 +12,7 @@ #include #include #include +#include #include int spin_retry = -1; @@ -73,9 +74,7 @@ static inline int arch_load_niai4(int *lock) int owner; asm volatile( -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - " .long 0xb2fa0040\n" /* NIAI 4 */ -#endif + ALTERNATIVE("", ".long 0xb2fa0040", 49) /* NIAI 4 */ " l %0,%1\n" : "=d" (owner) : "Q" (*lock) : "memory"); return owner; @@ -86,9 +85,7 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new) int expected = old; asm volatile( -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - " .long 0xb2fa0080\n" /* NIAI 8 */ -#endif + ALTERNATIVE("", ".long 0xb2fa0080", 49) /* NIAI 8 */ " cs %0,%3,%1\n" : "=d" (old), "=Q" (*lock) : "0" (old), "d" (new), "Q" (*lock) From 608796ffe13855bb066bebbd58d8b86a49cb5c27 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 13 Oct 2017 12:59:22 +0200 Subject: [PATCH 0396/1085] s390/vdso: move boot_vdso_data to vdso.c The boot_vdso_data variable is related to the vdso code, the magic of the initial vdso area for the early boot and the replacement of it in vdso_init should all be put into vdso.c. 
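The pattern behind this move generalizes: a statically allocated object satisfies early references until a properly allocated replacement is installed. Below is a minimal user-space sketch of that hand-over; it is an illustration only, and all names in it are invented rather than taken from the kernel.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct vdso_data_model { long clock_snapshot; };

    /* static instance: a valid pointer before any allocator is up */
    static struct vdso_data_model boot_vdso_model;
    static struct vdso_data_model *vdso_model = &boot_vdso_model;

    /* late init: allocate the real instance and switch consumers over */
    static int vdso_init_model(void)
    {
        struct vdso_data_model *p = malloc(sizeof(*p));

        if (!p)
            return -1;
        memcpy(p, vdso_model, sizeof(*p));  /* carry over early state */
        vdso_model = p;                     /* consumers now see the real one */
        return 0;
    }

    int main(void)
    {
        vdso_model->clock_snapshot = 42;    /* "early boot" writes */
        if (vdso_init_model())
            return 1;
        printf("clock_snapshot=%ld\n", vdso_model->clock_snapshot);
        free(vdso_model);
        return 0;
    }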
Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/vdso.h | 1 + arch/s390/kernel/setup.c | 2 +- arch/s390/kernel/vdso.c | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index 88bdc477a843..ac7bf9806a9d 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -46,6 +46,7 @@ struct vdso_per_cpu_data { extern struct vdso_data *vdso_data; +void vdso_alloc_boot_cpu(struct lowcore *lowcore); int vdso_alloc_per_cpu(struct lowcore *lowcore); void vdso_free_per_cpu(struct lowcore *lowcore); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index c07e6d6a91cc..bf139f9e120e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -349,7 +349,7 @@ static void __init setup_lowcore(void) if (MACHINE_HAS_GS) lc->mcesad |= bits; } - lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0]; + vdso_alloc_boot_cpu(lc); lc->sync_enter_timer = S390_lowcore.sync_enter_timer; lc->async_enter_timer = S390_lowcore.async_enter_timer; lc->exit_timer = S390_lowcore.exit_timer; diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index aaf8f77a636e..0520854a4dab 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -140,6 +140,20 @@ static void __init vdso_init_data(struct vdso_data *vd) */ #define SEGMENT_ORDER 2 +/* + * The initial vdso_data structure for the boot CPU. Eventually + * it is replaced with a properly allocated structure in vdso_init. + * This is necessary because a valid S390_lowcore.vdso_per_cpu_data + * pointer is required to be able to return from an interrupt or + * program check. See the exit paths in entry.S. + */ +struct vdso_data boot_vdso_data __initdata; + +void __init vdso_alloc_boot_cpu(struct lowcore *lowcore) +{ + lowcore->vdso_per_cpu_data = (unsigned long) &boot_vdso_data; +} + int vdso_alloc_per_cpu(struct lowcore *lowcore) { unsigned long segment_table, page_table, page_frame; From 58788a9b6060890e481c8111fac43d065560ebcb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 18 Oct 2017 12:51:09 +0100 Subject: [PATCH 0397/1085] locking/arch, powerpc/rtas: Use arch_spin_lock() instead of arch_spin_lock_flags() arch_spin_lock_flags() is an internal part of the spinlock implementation and is no longer available when SMP=n and DEBUG_SPINLOCK=y, so the PPC RTAS code fails to compile in this configuration: arch/powerpc/kernel/rtas.c: In function 'lock_rtas': >> arch/powerpc/kernel/rtas.c:81:2: error: implicit declaration of function 'arch_spin_lock_flags' [-Werror=implicit-function-declaration] arch_spin_lock_flags(&rtas.lock, flags); ^~~~~~~~~~~~~~~~~~~~ Since there's no good reason to use arch_spin_lock_flags() here (the code in question already calls local_irq_save(flags)), switch it over to arch_spin_lock and get things building again. 
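The shape of the fix is easy to model outside the kernel: when the caller already disables interrupts and saves the flags itself, the lock acquisition needs no flags-aware variant. A sketch with pthread spinlocks standing in for arch_spin_lock(), and the irq helpers mocked (all names here are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_spinlock_t rtas_lock_model;

    /* mocks for local_irq_save()/local_irq_restore() */
    static unsigned long local_irq_save_model(void) { return 0xf1a6; }
    static void local_irq_restore_model(unsigned long flags) { (void)flags; }

    static unsigned long lock_rtas_model(void)
    {
        unsigned long flags = local_irq_save_model(); /* irqs already off */

        pthread_spin_lock(&rtas_lock_model);  /* a plain lock suffices */
        return flags;
    }

    static void unlock_rtas_model(unsigned long flags)
    {
        pthread_spin_unlock(&rtas_lock_model);
        local_irq_restore_model(flags);
    }

    int main(void)
    {
        unsigned long flags;

        pthread_spin_init(&rtas_lock_model, PTHREAD_PROCESS_PRIVATE);
        flags = lock_rtas_model();
        puts("critical section");
        unlock_rtas_model(flags);
        pthread_spin_destroy(&rtas_lock_model);
        return 0;
    }

Build with -lpthread; the point is only that the flags travel through the caller, never through the lock.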
Reported-by: kbuild test robot Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508327469-20231-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/rtas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 1643e9e53655..3f1c4fcbe0aa 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -78,7 +78,7 @@ static unsigned long lock_rtas(void) local_irq_save(flags); preempt_disable(); - arch_spin_lock_flags(&rtas.lock, flags); + arch_spin_lock(&rtas.lock); return flags; } From 4f3a871443669c6b4d458a60ac8d8ca5eedc3f97 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Tue, 17 Oct 2017 13:48:34 +0530 Subject: [PATCH 0398/1085] Revert "kprobes: Warn if optprobe handler tries to change execution path" This reverts commit: e863d539614641 ("kprobes: Warn if optprobe handler tries to change execution path") On PowerPC, we place a probe at kretprobe_trampoline to catch function returns and with CONFIG_OPTPROBES=y, this probe gets optimized. This works for us due to the way we handle the optprobe as described in commit: 762df10bad6954 ("powerpc/kprobes: Optimize kprobe in kretprobe_trampoline()") With the above commit, we end up with a warning. As such, revert this change. Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171017081834.3629-1-naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d28377a0e32..15fba7fe57c8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -387,10 +387,7 @@ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) list_for_each_entry_rcu(kp, &p->list, list) { if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); - if (kp->pre_handler(kp, regs)) { - if (WARN_ON_ONCE(1)) - pr_err("Optprobe ignores instruction pointer changing.(%pF)\n", p->addr); - } + kp->pre_handler(kp, regs); } reset_kprobe_instance(); } From 6a93bb7e4a7d6670677d5b0eb980936eb9cc5d2e Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Sat, 14 Oct 2017 20:17:54 +0530 Subject: [PATCH 0399/1085] objtool: Print top level commands on incorrect usage Print the top-level objtool commands, along with the error, on incorrect command line usage. The objtool command line parser exits with code 129 on incorrect usage. Convert the cmd_usage() exit code as well, to maintain consistency across objtool. After the patch: $ ./objtool -j Unknown option: -j usage: objtool COMMAND [ARGS] Commands: check Perform stack metadata validation on an object file orc Generate in-place ORC unwind tables for an object file $ echo $?
129 Signed-off-by: Kamalesh Babulal Acked-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1507992474-16142-1-git-send-email-kamalesh@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- tools/objtool/objtool.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 31e0f9143840..07f329919828 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -70,7 +70,7 @@ static void cmd_usage(void) printf("\n"); - exit(1); + exit(129); } static void handle_options(int *argc, const char ***argv) @@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv) break; } else { fprintf(stderr, "Unknown option: %s\n", cmd); - fprintf(stderr, "\n Usage: %s\n", - objtool_usage_string); - exit(1); + cmd_usage(); } (*argv)++; From c310ce4dcb9df9b2f1be82caff7dae609fe53f72 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sun, 8 Oct 2017 20:55:59 -0700 Subject: [PATCH 0400/1085] timers: Avoid an unnecessary iteration in __run_timers() If the base clock is behind jiffies in the soft irq expiry code then the next timer is retrieved by get_next_timer_interrupt() to avoid incrementing the base clock one by one. If the next timer interrupt is past current jiffies then the base clock is set to jiffies - 1. At the call site this is incremented and another iteration through the expiry loop is executed, which checks empty hash buckets. That's a pointless exercise because it's already known that the next timer is past jiffies. Set the base clock in that case to jiffies directly so it gets incremented to jiffies + 1 at the call site, resulting in immediate termination of the expiry loop. [ tglx: Massaged changelog and added comment to the code ] Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Acked-by: Anna-Maria Gleixner Cc: Joe Jin Cc: sboyd@codeaurora.org Cc: Srinivas Reddy Eeda Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/7086a857-f90c-4616-bbe8-f7696f21626c@default --- kernel/time/timer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 38613ced2324..ee1a88d8afb2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1560,8 +1560,11 @@ static int collect_expired_timers(struct timer_base *base, * jiffies, otherwise forward to the next expiry time: */ if (time_after(next, jiffies)) { - /* The call site will increment clock! */ - base->clk = jiffies - 1; + /* + * The call site will increment base->clk and then + * terminate the expiry loop immediately. + */ + base->clk = jiffies; return 0; } base->clk = next; From 2b5175c4fa974b6aa05bbd2ee8d443a8036a1714 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Oct 2017 09:54:57 +0200 Subject: [PATCH 0401/1085] genirq: Add config option for reservation mode The interrupt reservation mode requires reactivation of PCI/MSI interrupts. Create a config option, so the PCI code can set the corresponding flag when required.
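How such an invisible Kconfig symbol gets consumed can be shown in plain C: IS_ENABLED() collapses to a compile-time constant, so the consuming code carries no #ifdef. A simplified stand-in follows; CONFIG_RESV_MODE_MODEL, IS_ENABLED_MODEL and the flag bits are invented for the sketch.

    #include <stdio.h>

    /* stands in for CONFIG_GENERIC_IRQ_RESERVATION_MODE=y */
    #define CONFIG_RESV_MODE_MODEL 1
    #define IS_ENABLED_MODEL(opt)  (opt)

    #define FLAG_ACTIVATE_EARLY   (1u << 0)
    #define FLAG_MUST_REACTIVATE  (1u << 1)

    struct msi_info_model { unsigned int flags; };

    static void create_domain_model(struct msi_info_model *info)
    {
        info->flags |= FLAG_ACTIVATE_EARLY;
        /* one central test instead of a flag set in every driver */
        if (IS_ENABLED_MODEL(CONFIG_RESV_MODE_MODEL))
            info->flags |= FLAG_MUST_REACTIVATE;
    }

    int main(void)
    {
        struct msi_info_model info = { 0 };

        create_domain_model(&info);
        printf("flags=%#x\n", info.flags);
        return 0;
    }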
Signed-off-by: Thomas Gleixner Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Link: https://lkml.kernel.org/r/20171017075600.369375409@linutronix.de --- kernel/irq/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index ac1a3e29d3b9..89e355866450 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -100,6 +100,9 @@ config IRQ_TIMINGS config GENERIC_IRQ_MATRIX_ALLOCATOR bool +config GENERIC_IRQ_RESERVATION_MODE + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS From 25e960efc63852b84d1c3739aef586285b177395 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Oct 2017 09:54:58 +0200 Subject: [PATCH 0402/1085] PCI/MSI: Set MSI_FLAG_MUST_REACTIVATE in core code If interrupt reservation mode is enabled then the PCI/MSI interrupts must be reactivated after early activation. Make sure that all callers of pci_msi_create_irq_domain() have the MSI_FLAG_MUST_REACTIVATE flag set when reservation mode is enabled. Signed-off-by: Thomas Gleixner Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Link: https://lkml.kernel.org/r/20171017075600.448649905@linutronix.de --- drivers/pci/msi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 496ed9130600..e06607167858 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1441,6 +1441,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, pci_msi_domain_update_chip_ops(info); info->flags |= MSI_FLAG_ACTIVATE_EARLY; + if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) + info->flags |= MSI_FLAG_MUST_REACTIVATE; domain = msi_create_irq_domain(fwnode, info, parent); if (!domain) From c201c91799d687c0a6d8c3272950f51aad5ffebe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Oct 2017 09:54:59 +0200 Subject: [PATCH 0403/1085] x86/vector/msi: Select CONFIG_GENERIC_IRQ_RESERVATION_MODE Select CONFIG_GENERIC_IRQ_RESERVATION_MODE so PCI/MSI domains get the MSI_FLAG_MUST_REACTIVATE flag set in pci_msi_create_irq_domain(). Remove the explicit setters of this flag in the apic/msi code as they are no longer required.
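Why the interrupt must be reactivated at all: in reservation mode, early activation only reserves a placeholder vector, and a real vector is assigned when the interrupt is actually requested. A toy state machine of that contract, with states and names invented purely for illustration:

    #include <stdio.h>

    enum vec_state_model { VEC_UNUSED, VEC_RESERVED, VEC_ASSIGNED };

    struct irq_model {
        enum vec_state_model state;
        int must_reactivate;
    };

    /* early activation: a cheap reservation only */
    static void activate_early_model(struct irq_model *irq)
    {
        irq->state = VEC_RESERVED;
    }

    /* request time: upgrade the reservation to a real vector */
    static int request_irq_model(struct irq_model *irq)
    {
        if (irq->must_reactivate && irq->state == VEC_RESERVED)
            irq->state = VEC_ASSIGNED;
        return irq->state == VEC_ASSIGNED ? 0 : -1;
    }

    int main(void)
    {
        struct irq_model irq = { VEC_UNUSED, 1 };

        activate_early_model(&irq);
        printf("request: %d, state: %d\n", request_irq_model(&irq), irq.state);
        return 0;
    }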
Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode") Reported-and-tested-by: Dexuan Cui Signed-off-by: Thomas Gleixner Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Link: https://lkml.kernel.org/r/20171017075600.527569354@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/kernel/apic/msi.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 64e99d3c5169..ea4bedaba4b8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -95,6 +95,7 @@ config X86 select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC select GENERIC_IRQ_MIGRATION if SMP select GENERIC_IRQ_PROBE + select GENERIC_IRQ_RESERVATION_MODE select GENERIC_IRQ_SHOW select GENERIC_PENDING_IRQ if SMP select GENERIC_SMP_IDLE_THREAD diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5b6dd1a85ec4..9b18be764422 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -129,7 +129,7 @@ static struct msi_domain_ops pci_msi_domain_ops = { static struct msi_domain_info pci_msi_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX | MSI_FLAG_MUST_REACTIVATE, + MSI_FLAG_PCI_MSIX, .ops = &pci_msi_domain_ops, .chip = &pci_msi_controller, .handler = handle_edge_irq, @@ -167,8 +167,7 @@ static struct irq_chip pci_msi_ir_controller = { static struct msi_domain_info pci_msi_ir_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX | - MSI_FLAG_MUST_REACTIVATE, + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, .ops = &pci_msi_domain_ops, .chip = &pci_msi_ir_controller, .handler = handle_edge_irq, From 32a6c7233c41216f5dd41fc7bf100eedb1063dfc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 15:58:25 -0700 Subject: [PATCH 0404/1085] workqueue: Convert timers to use timer_setup() (part 2) In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. (The prior workqueue patch missed a few timers.) 
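The from_timer() conversion hinges on container_of() arithmetic: the callback receives a pointer to the embedded timer and recovers the enclosing structure from the field offset, instead of being handed an opaque unsigned long. A compilable sketch of the idiom (typeof is a GCC/Clang extension, as in the kernel; all names here are illustrative):

    #include <stdio.h>
    #include <stddef.h>

    struct timer_model {
        void (*function)(struct timer_model *);
    };

    #define container_of_model(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
    #define from_timer_model(var, t, field) \
        container_of_model(t, typeof(*(var)), field)

    struct pool_model {
        int id;
        struct timer_model idle_timer;
    };

    static void idle_timeout_model(struct timer_model *t)
    {
        /* recover the pool from the embedded timer, no cast from long */
        struct pool_model *pool = from_timer_model(pool, t, idle_timer);

        printf("timeout for pool %d\n", pool->id);
    }

    int main(void)
    {
        struct pool_model pool = { .id = 7 };

        pool.idle_timer.function = idle_timeout_model; /* timer_setup() */
        pool.idle_timer.function(&pool.idle_timer);    /* simulate expiry */
        return 0;
    }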
Signed-off-by: Kees Cook Acked-by: Tejun Heo Cc: Lai Jiangshan Link: https://lkml.kernel.org/r/20171016225825.GA99101@beast Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c77fdf6bf24f..6e5eed58f215 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1831,9 +1831,9 @@ static void destroy_worker(struct worker *worker) wake_up_process(worker->task); } -static void idle_worker_timeout(unsigned long __pool) +static void idle_worker_timeout(struct timer_list *t) { - struct worker_pool *pool = (void *)__pool; + struct worker_pool *pool = from_timer(pool, t, idle_timer); spin_lock_irq(&pool->lock); @@ -1879,9 +1879,9 @@ static void send_mayday(struct work_struct *work) } } -static void pool_mayday_timeout(unsigned long __pool) +static void pool_mayday_timeout(struct timer_list *t) { - struct worker_pool *pool = (void *)__pool; + struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; spin_lock_irq(&pool->lock); @@ -3241,11 +3241,9 @@ static int init_worker_pool(struct worker_pool *pool) INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); - setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout, - (unsigned long)pool); + timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); + timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); mutex_init(&pool->manager_arb); mutex_init(&pool->attach_mutex); From ba16490eac146ebb178017e5de3d61c645552fab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Oct 2017 16:10:19 +0200 Subject: [PATCH 0405/1085] timer: Convert stub timer to timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Signed-off-by: Thomas Gleixner Cc: Kees Cook --- kernel/time/timer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ee1a88d8afb2..fbb1f85327bf 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -610,7 +610,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) } /* Stub timer callback for improperly used timers. */ -static void stub_timer(unsigned long data) +static void stub_timer(struct timer_list *unused) { WARN_ON(1); } @@ -626,7 +626,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - setup_timer(timer, stub_timer, 0); + timer_setup(timer, stub_timer, 0); return true; case ODEBUG_STATE_ACTIVE: @@ -665,7 +665,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - setup_timer(timer, stub_timer, 0); + timer_setup(timer, stub_timer, 0); return true; default: return false; From 0bbc931a074a741cf8e6279e8045cf7118586780 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 25 Aug 2017 17:45:05 +0100 Subject: [PATCH 0406/1085] tpm_tis: make array cmd_getticks static const to shrink object code size Don't populate array cmd_getticks on the stack, instead make it static const. 
Makes the object code smaller by over 160 bytes: Before: text data bss dec hex filename 18813 3152 128 22093 564d drivers/char/tpm/tpm_tis_core.o After: text data bss dec hex filename 18554 3248 128 21930 55aa drivers/char/tpm/tpm_tis_core.o Cc: stable@vger.kernel.org Signed-off-by: Colin Ian King Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 63bc6c3b949e..1e957e923d21 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -445,7 +445,7 @@ static int probe_itpm(struct tpm_chip *chip) { struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); int rc = 0; - u8 cmd_getticks[] = { + static const u8 cmd_getticks[] = { 0x00, 0xc1, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0xf1 }; From c37fbc09bd4977736f6bc4050c6f099c587052a7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Sep 2017 15:30:45 +0200 Subject: [PATCH 0407/1085] tpm: constify transmit data pointers Making cmd_getticks 'const' introduced a couple of harmless warnings: drivers/char/tpm/tpm_tis_core.c: In function 'probe_itpm': drivers/char/tpm/tpm_tis_core.c:469:31: error: passing argument 2 of 'tpm_tis_send_data' discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] rc = tpm_tis_send_data(chip, cmd_getticks, len); drivers/char/tpm/tpm_tis_core.c:477:31: error: passing argument 2 of 'tpm_tis_send_data' discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] rc = tpm_tis_send_data(chip, cmd_getticks, len); drivers/char/tpm/tpm_tis_core.c:255:12: note: expected 'u8 * {aka unsigned char *}' but argument is of type 'const u8 * {aka const unsigned char *}' static int tpm_tis_send_data(struct tpm_chip *chip, u8 *buf, size_t len) This changes the related functions to all take 'const' pointers so that gcc can see this as being correct. I had to slightly modify the logic around tpm_tis_spi_transfer() for this to work without introducing ugly casts. 
Cc: stable@vger.kernel.org Fixes: 5e35bd8e06b9 ("tpm_tis: make array cmd_getticks static const to shink object code size") Signed-off-by: Arnd Bergmann Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 2 +- drivers/char/tpm/tpm_tis_core.c | 4 ++-- drivers/char/tpm/tpm_tis_core.h | 4 ++-- drivers/char/tpm/tpm_tis_spi.c | 25 +++++++++++-------------- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 7e55aa9ce680..ebd0e75a3e4d 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -223,7 +223,7 @@ static int tpm_tcg_read_bytes(struct tpm_tis_data *data, u32 addr, u16 len, } static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, - u8 *value) + const u8 *value) { struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 1e957e923d21..fdde971bc810 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -252,7 +252,7 @@ out: * tpm.c can skip polling for the data to be available as the interrupt is * waited for here */ -static int tpm_tis_send_data(struct tpm_chip *chip, u8 *buf, size_t len) +static int tpm_tis_send_data(struct tpm_chip *chip, const u8 *buf, size_t len) { struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); int rc, status, burstcnt; @@ -343,7 +343,7 @@ static void disable_interrupts(struct tpm_chip *chip) * tpm.c can skip polling for the data to be available as the interrupt is * waited for here */ -static int tpm_tis_send_main(struct tpm_chip *chip, u8 *buf, size_t len) +static int tpm_tis_send_main(struct tpm_chip *chip, const u8 *buf, size_t len) { struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); int rc; diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h index e2212f021a02..6bbac319ff3b 100644 --- a/drivers/char/tpm/tpm_tis_core.h +++ b/drivers/char/tpm/tpm_tis_core.h @@ -98,7 +98,7 @@ struct tpm_tis_phy_ops { int (*read_bytes)(struct tpm_tis_data *data, u32 addr, u16 len, u8 *result); int (*write_bytes)(struct tpm_tis_data *data, u32 addr, u16 len, - u8 *value); + const u8 *value); int (*read16)(struct tpm_tis_data *data, u32 addr, u16 *result); int (*read32)(struct tpm_tis_data *data, u32 addr, u32 *result); int (*write32)(struct tpm_tis_data *data, u32 addr, u32 src); @@ -128,7 +128,7 @@ static inline int tpm_tis_read32(struct tpm_tis_data *data, u32 addr, } static inline int tpm_tis_write_bytes(struct tpm_tis_data *data, u32 addr, - u16 len, u8 *value) + u16 len, const u8 *value) { return data->phy_ops->write_bytes(data, addr, len, value); } diff --git a/drivers/char/tpm/tpm_tis_spi.c b/drivers/char/tpm/tpm_tis_spi.c index 88fe72ae967f..e49f5b9b739a 100644 --- a/drivers/char/tpm/tpm_tis_spi.c +++ b/drivers/char/tpm/tpm_tis_spi.c @@ -57,7 +57,7 @@ static inline struct tpm_tis_spi_phy *to_tpm_tis_spi_phy(struct tpm_tis_data *da } static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, - u8 *buffer, u8 direction) + u8 *in, const u8 *out) { struct tpm_tis_spi_phy *phy = to_tpm_tis_spi_phy(data); int ret = 0; @@ -71,7 +71,7 @@ static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, while (len) { transfer_len = min_t(u16, len, MAX_SPI_FRAMESIZE); - phy->tx_buf[0] = direction | (transfer_len - 1); + phy->tx_buf[0] = (in ? 
0x80 : 0) | (transfer_len - 1); phy->tx_buf[1] = 0xd4; phy->tx_buf[2] = addr >> 8; phy->tx_buf[3] = addr; @@ -112,14 +112,8 @@ static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, spi_xfer.cs_change = 0; spi_xfer.len = transfer_len; spi_xfer.delay_usecs = 5; - - if (direction) { - spi_xfer.tx_buf = NULL; - spi_xfer.rx_buf = buffer; - } else { - spi_xfer.tx_buf = buffer; - spi_xfer.rx_buf = NULL; - } + spi_xfer.tx_buf = out; + spi_xfer.rx_buf = in; spi_message_init(&m); spi_message_add_tail(&spi_xfer, &m); @@ -128,7 +122,10 @@ static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, goto exit; len -= transfer_len; - buffer += transfer_len; + if (in) + in += transfer_len; + if (out) + out += transfer_len; } exit: @@ -139,13 +136,13 @@ exit: static int tpm_tis_spi_read_bytes(struct tpm_tis_data *data, u32 addr, u16 len, u8 *result) { - return tpm_tis_spi_transfer(data, addr, len, result, 0x80); + return tpm_tis_spi_transfer(data, addr, len, result, NULL); } static int tpm_tis_spi_write_bytes(struct tpm_tis_data *data, u32 addr, - u16 len, u8 *value) + u16 len, const u8 *value) { - return tpm_tis_spi_transfer(data, addr, len, value, 0); + return tpm_tis_spi_transfer(data, addr, len, NULL, value); } static int tpm_tis_spi_read16(struct tpm_tis_data *data, u32 addr, u16 *result) From f5357413dbaadd82361903f3c389cb1d5763a85e Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Fri, 25 Aug 2017 18:28:55 -0500 Subject: [PATCH 0408/1085] tpm/tpm_crb: Use start method value from ACPI table directly This patch gets rid of dealing with the intermediate flags for the start method and makes the code use the start method value from the ACPI table directly. For ARM64, the locality is handled by the Trust Zone in FW. The layout does not have crb_regs_head, so it hits the following line: dev_warn(dev, FW_BUG "Bad ACPI memory layout"); The current code excludes CRB_FL_ACPI_START from this check. Now that ARM64 support for TPM CRB has been added, CRB_FL_CRB_SMC_START should also be excluded from this check. For goIdle and cmdReady, where the code was excluding only CRB_FL_ACPI_START (do nothing for the ACPI start method), CRB_FL_CRB_SMC_START was also excluded, as the ARM64 SMC start method does not have TPM_CRB_CTRL_REQ. However, with the special PPT workaround requiring CRB_FL_CRB_START to be set in addition to CRB_FL_ACPI_START, and the additional flag for the SMC start method, CRB_FL_CRB_SMC_START, the code has become difficult to maintain and understand. It is better to make the code deal with the start method value from the ACPI table directly. Signed-off-by: Jiandi An Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_crb.c | 59 +++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index 8f0a98dea327..7b3c2a8aa9de 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -92,14 +92,9 @@ enum crb_status { CRB_DRV_STS_COMPLETE = BIT(0), }; -enum crb_flags { - CRB_FL_ACPI_START = BIT(0), - CRB_FL_CRB_START = BIT(1), - CRB_FL_CRB_SMC_START = BIT(2), -}; - struct crb_priv { - unsigned int flags; + u32 sm; + const char *hid; void __iomem *iobase; struct crb_regs_head __iomem *regs_h; struct crb_regs_tail __iomem *regs_t; @@ -128,14 +123,16 @@ struct tpm2_crb_smc { * Anyhow, we do not wait here as a consequent CMD_READY request * will be handled correctly even if idle was not completed. * - * The function does nothing for devices with ACPI-start method.
+ * The function does nothing for devices with ACPI-start method + * or SMC-start method. * * Return: 0 always */ static int __maybe_unused crb_go_idle(struct device *dev, struct crb_priv *priv) { - if ((priv->flags & CRB_FL_ACPI_START) || - (priv->flags & CRB_FL_CRB_SMC_START)) + if ((priv->sm == ACPI_TPM2_START_METHOD) || + (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) || + (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC)) return 0; iowrite32(CRB_CTRL_REQ_GO_IDLE, &priv->regs_t->ctrl_req); @@ -174,14 +171,16 @@ static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value, * The device should respond within TIMEOUT_C. * * The function does nothing for devices with ACPI-start method + * or SMC-start method. * * Return: 0 on success -ETIME on timeout; */ static int __maybe_unused crb_cmd_ready(struct device *dev, struct crb_priv *priv) { - if ((priv->flags & CRB_FL_ACPI_START) || - (priv->flags & CRB_FL_CRB_SMC_START)) + if ((priv->sm == ACPI_TPM2_START_METHOD) || + (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) || + (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC)) return 0; iowrite32(CRB_CTRL_REQ_CMD_READY, &priv->regs_t->ctrl_req); @@ -325,13 +324,20 @@ static int crb_send(struct tpm_chip *chip, u8 *buf, size_t len) /* Make sure that cmd is populated before issuing start. */ wmb(); - if (priv->flags & CRB_FL_CRB_START) + /* The reason for the extra quirk is that the PTT in 4th Gen Core CPUs + * report only ACPI start but in practice seems to require both + * CRB start, hence invoking CRB start method if hid == MSFT0101. + */ + if ((priv->sm == ACPI_TPM2_COMMAND_BUFFER) || + (priv->sm == ACPI_TPM2_MEMORY_MAPPED) || + (!strcmp(priv->hid, "MSFT0101"))) iowrite32(CRB_START_INVOKE, &priv->regs_t->ctrl_start); - if (priv->flags & CRB_FL_ACPI_START) + if ((priv->sm == ACPI_TPM2_START_METHOD) || + (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD)) rc = crb_do_acpi_start(chip); - if (priv->flags & CRB_FL_CRB_SMC_START) { + if (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC) { iowrite32(CRB_START_INVOKE, &priv->regs_t->ctrl_start); rc = tpm_crb_smc_start(&chip->dev, priv->smc_func_id); } @@ -345,7 +351,9 @@ static void crb_cancel(struct tpm_chip *chip) iowrite32(CRB_CANCEL_INVOKE, &priv->regs_t->ctrl_cancel); - if ((priv->flags & CRB_FL_ACPI_START) && crb_do_acpi_start(chip)) + if (((priv->sm == ACPI_TPM2_START_METHOD) || + (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD)) && + crb_do_acpi_start(chip)) dev_err(&chip->dev, "ACPI Start failed\n"); } @@ -458,7 +466,8 @@ static int crb_map_io(struct acpi_device *device, struct crb_priv *priv, * the control area, as one nice sane region except for some older * stuff that puts the control area outside the ACPI IO region. */ - if (!(priv->flags & CRB_FL_ACPI_START)) { + if ((priv->sm == ACPI_TPM2_COMMAND_BUFFER) || + (priv->sm == ACPI_TPM2_MEMORY_MAPPED)) { if (buf->control_address == io_res.start + sizeof(*priv->regs_h)) priv->regs_h = priv->iobase; @@ -552,18 +561,6 @@ static int crb_acpi_add(struct acpi_device *device) if (!priv) return -ENOMEM; - /* The reason for the extra quirk is that the PTT in 4th Gen Core CPUs - * report only ACPI start but in practice seems to require both - * ACPI start and CRB start. 
- */ - if (sm == ACPI_TPM2_COMMAND_BUFFER || sm == ACPI_TPM2_MEMORY_MAPPED || - !strcmp(acpi_device_hid(device), "MSFT0101")) - priv->flags |= CRB_FL_CRB_START; - - if (sm == ACPI_TPM2_START_METHOD || - sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) - priv->flags |= CRB_FL_ACPI_START; - if (sm == ACPI_TPM2_COMMAND_BUFFER_WITH_ARM_SMC) { if (buf->header.length < (sizeof(*buf) + sizeof(*crb_smc))) { dev_err(dev, @@ -574,9 +571,11 @@ static int crb_acpi_add(struct acpi_device *device) } crb_smc = ACPI_ADD_PTR(struct tpm2_crb_smc, buf, sizeof(*buf)); priv->smc_func_id = crb_smc->smc_func_id; - priv->flags |= CRB_FL_CRB_SMC_START; } + priv->sm = sm; + priv->hid = acpi_device_hid(device); + rc = crb_map_io(device, priv, buf); if (rc) return rc; From 6b3a13173f23e798e1ba213dd4a2c065a3b8d751 Mon Sep 17 00:00:00 2001 From: Alexander Steffen Date: Mon, 11 Sep 2017 12:26:52 +0200 Subject: [PATCH 0409/1085] tpm_tis_spi: Use DMA-safe memory for SPI transfers The buffers used as tx_buf/rx_buf in a SPI transfer need to be DMA-safe. This cannot be guaranteed for the buffers passed to tpm_tis_spi_read_bytes and tpm_tis_spi_write_bytes. Therefore, we need to use our own DMA-safe buffer and copy the data to/from it. The buffer needs to be allocated separately, to ensure that it is cacheline-aligned and not shared with other data, so that DMA can work correctly. Fixes: 0edbfea537d1 ("tpm/tpm_tis_spi: Add support for spi phy") Cc: stable@vger.kernel.org Reviewed-by: Jarkko Sakkinen Signed-off-by: Alexander Steffen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_spi.c | 45 ++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_spi.c b/drivers/char/tpm/tpm_tis_spi.c index e49f5b9b739a..8ab0bd8445f6 100644 --- a/drivers/char/tpm/tpm_tis_spi.c +++ b/drivers/char/tpm/tpm_tis_spi.c @@ -46,9 +46,7 @@ struct tpm_tis_spi_phy { struct tpm_tis_data priv; struct spi_device *spi_device; - - u8 tx_buf[4]; - u8 rx_buf[4]; + u8 *iobuf; }; static inline struct tpm_tis_spi_phy *to_tpm_tis_spi_phy(struct tpm_tis_data *data) @@ -71,14 +69,14 @@ static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, while (len) { transfer_len = min_t(u16, len, MAX_SPI_FRAMESIZE); - phy->tx_buf[0] = (in ? 0x80 : 0) | (transfer_len - 1); - phy->tx_buf[1] = 0xd4; - phy->tx_buf[2] = addr >> 8; - phy->tx_buf[3] = addr; + phy->iobuf[0] = (in ? 
0x80 : 0) | (transfer_len - 1); + phy->iobuf[1] = 0xd4; + phy->iobuf[2] = addr >> 8; + phy->iobuf[3] = addr; memset(&spi_xfer, 0, sizeof(spi_xfer)); - spi_xfer.tx_buf = phy->tx_buf; - spi_xfer.rx_buf = phy->rx_buf; + spi_xfer.tx_buf = phy->iobuf; + spi_xfer.rx_buf = phy->iobuf; spi_xfer.len = 4; spi_xfer.cs_change = 1; @@ -88,9 +86,9 @@ static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, if (ret < 0) goto exit; - if ((phy->rx_buf[3] & 0x01) == 0) { + if ((phy->iobuf[3] & 0x01) == 0) { // handle SPI wait states - phy->tx_buf[0] = 0; + phy->iobuf[0] = 0; for (i = 0; i < TPM_RETRY; i++) { spi_xfer.len = 1; @@ -99,7 +97,7 @@ static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, ret = spi_sync_locked(phy->spi_device, &m); if (ret < 0) goto exit; - if (phy->rx_buf[0] & 0x01) + if (phy->iobuf[0] & 0x01) break; } @@ -112,8 +110,14 @@ static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, spi_xfer.cs_change = 0; spi_xfer.len = transfer_len; spi_xfer.delay_usecs = 5; - spi_xfer.tx_buf = out; - spi_xfer.rx_buf = in; + + if (in) { + spi_xfer.tx_buf = NULL; + } else if (out) { + spi_xfer.rx_buf = NULL; + memcpy(phy->iobuf, out, transfer_len); + out += transfer_len; + } spi_message_init(&m); spi_message_add_tail(&spi_xfer, &m); @@ -121,11 +125,12 @@ static int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, if (ret < 0) goto exit; - len -= transfer_len; - if (in) + if (in) { + memcpy(in, phy->iobuf, transfer_len); in += transfer_len; - if (out) - out += transfer_len; + } + + len -= transfer_len; } exit: @@ -191,6 +196,10 @@ static int tpm_tis_spi_probe(struct spi_device *dev) phy->spi_device = dev; + phy->iobuf = devm_kmalloc(&dev->dev, MAX_SPI_FRAMESIZE, GFP_KERNEL); + if (!phy->iobuf) + return -ENOMEM; + return tpm_tis_core_init(&dev->dev, &phy->priv, -1, &tpm_spi_phy_ops, NULL); } From 2482b1bba5122b1d5516c909832bdd282015b8e9 Mon Sep 17 00:00:00 2001 From: Alexander Steffen Date: Thu, 31 Aug 2017 19:18:56 +0200 Subject: [PATCH 0410/1085] tpm: Trigger only missing TPM 2.0 self tests tpm2_do_selftest is only used during initialization of the TPM to ensure that the device functions correctly. Therefore, it is sufficient to request only missing self tests (parameter full_test=0), not a reexecution of all self tests, as was done before. This allows for a faster execution of this command. 
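The semantics of the full_test parameter can be modeled in a few lines of C: with full_test=0 the device reruns only the tests that have not passed yet, so a second request is nearly free. A toy model follows, with four imaginary test slots standing in for the TPM's internal test set:

    #include <stdio.h>
    #include <stdbool.h>

    #define NR_TESTS_MODEL 4

    static bool passed[NR_TESTS_MODEL];

    static int self_test_model(bool full_test)
    {
        int ran = 0;

        for (int i = 0; i < NR_TESTS_MODEL; i++) {
            if (full_test || !passed[i]) {
                passed[i] = true;   /* pretend the test passes */
                ran++;
            }
        }
        return ran;
    }

    int main(void)
    {
        printf("first call ran %d tests\n", self_test_model(false));
        printf("second call ran %d tests\n", self_test_model(false));
        printf("forced full run ran %d tests\n", self_test_model(true));
        return 0;
    }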
Signed-off-by: Alexander Steffen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index e1a41b788f08..8e940a530df8 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -865,7 +865,7 @@ static int tpm2_start_selftest(struct tpm_chip *chip, bool full) } /** - * tpm2_do_selftest() - run a full self test + * tpm2_do_selftest() - ensure that all self tests have passed * * @chip: TPM chip to use * @@ -886,7 +886,7 @@ static int tpm2_do_selftest(struct tpm_chip *chip) loops = jiffies_to_msecs(duration) / delay_msec; - rc = tpm2_start_selftest(chip, true); + rc = tpm2_start_selftest(chip, false); if (rc) return rc; From 87434f58be31a96d72b5ddbb98d53307300e0024 Mon Sep 17 00:00:00 2001 From: Alexander Steffen Date: Thu, 31 Aug 2017 19:18:57 +0200 Subject: [PATCH 0411/1085] tpm: Use dynamic delay to wait for TPM 2.0 self test result In order to avoid delaying the code longer than necessary while still giving the TPM enough time to execute the self tests asynchronously, start with a small delay between two polls and increase it each round. Signed-off-by: Alexander Steffen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-cmd.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 8e940a530df8..2178437e541a 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -877,20 +877,17 @@ static int tpm2_start_selftest(struct tpm_chip *chip, bool full) static int tpm2_do_selftest(struct tpm_chip *chip) { int rc; - unsigned int loops; - unsigned int delay_msec = 100; - unsigned long duration; - int i; + unsigned int delay_msec = 20; + long duration; - duration = tpm2_calc_ordinal_duration(chip, TPM2_CC_SELF_TEST); - - loops = jiffies_to_msecs(duration) / delay_msec; + duration = jiffies_to_msecs( + tpm2_calc_ordinal_duration(chip, TPM2_CC_SELF_TEST)); rc = tpm2_start_selftest(chip, false); if (rc) return rc; - for (i = 0; i < loops; i++) { + while (duration > 0) { /* Attempt to read a PCR value */ rc = tpm2_pcr_read(chip, 0, NULL); if (rc < 0) @@ -900,6 +897,10 @@ static int tpm2_do_selftest(struct tpm_chip *chip) break; tpm_msleep(delay_msec); + duration -= delay_msec; + + /* wait longer the next round */ + delay_msec *= 2; } return rc; From 125a2210541079e8e7c69e629ad06cabed788f8c Mon Sep 17 00:00:00 2001 From: Alexander Steffen Date: Thu, 31 Aug 2017 19:18:58 +0200 Subject: [PATCH 0412/1085] tpm: React correctly to RC_TESTING from TPM 2.0 self tests The TPM can choose one of two ways to react to the TPM2_SelfTest command. It can either run all self tests synchronously and then return RC_SUCCESS once all tests were successful. Or it can choose to run the tests asynchronously and return RC_TESTING immediately while the self tests still execute in the background. The previous implementation apparently was not aware of those possibilities and attributed RC_TESTING to some prototype chips instead. With this change the return code of TPM2_SelfTest is interpreted correctly, i.e. the self test result is polled if and only if RC_TESTING is received. Unfortunately, the polling cannot be done in the most straightforward way. 
If RC_TESTING is received, ideally the code should now poll the selfTestDone bit in the STS register, as this avoids sending more commands, that might interrupt self tests executing in the background and thus prevent them from ever completing. But it cannot be guaranteed that this bit is correctly implemented for all devices, so the next best thing would be to use TPM2_GetTestResult to query the test result. But the response to that command can be very long, and the code currently lacks the capabilities for efficient unmarshalling, so it is difficult to execute this command. Therefore, we simply run the TPM2_SelfTest command in a loop, which should complete eventually, since we only request the execution of self tests that have not yet been done. Signed-off-by: Alexander Steffen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-cmd.c | 52 ++++++++----------------------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 2178437e541a..70ee32816c48 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -833,37 +833,6 @@ static const struct tpm_input_header tpm2_selftest_header = { .ordinal = cpu_to_be32(TPM2_CC_SELF_TEST) }; -/** - * tpm2_continue_selftest() - start a self test - * - * @chip: TPM chip to use - * @full: test all commands instead of testing only those that were not - * previously tested. - * - * Return: Same as with tpm_transmit_cmd with exception of RC_TESTING. - */ -static int tpm2_start_selftest(struct tpm_chip *chip, bool full) -{ - int rc; - struct tpm2_cmd cmd; - - cmd.header.in = tpm2_selftest_header; - cmd.params.selftest_in.full_test = full; - - rc = tpm_transmit_cmd(chip, NULL, &cmd, TPM2_SELF_TEST_IN_SIZE, 0, 0, - "continue selftest"); - - /* At least some prototype chips seem to give RC_TESTING error - * immediately. This is a workaround for that. - */ - if (rc == TPM2_RC_TESTING) { - dev_warn(&chip->dev, "Got RC_TESTING, ignoring\n"); - rc = 0; - } - - return rc; -} - /** * tpm2_do_selftest() - ensure that all self tests have passed * @@ -871,27 +840,28 @@ static int tpm2_start_selftest(struct tpm_chip *chip, bool full) * * Return: Same as with tpm_transmit_cmd. * - * During the self test TPM2 commands return with the error code RC_TESTING. - * Waiting is done by issuing PCR read until it executes successfully. + * The TPM can either run all self tests synchronously and then return + * RC_SUCCESS once all tests were successful. Or it can choose to run the tests + * asynchronously and return RC_TESTING immediately while the self tests still + * execute in the background. This function handles both cases and waits until + * all tests have completed. 
*/ static int tpm2_do_selftest(struct tpm_chip *chip) { int rc; unsigned int delay_msec = 20; long duration; + struct tpm2_cmd cmd; duration = jiffies_to_msecs( tpm2_calc_ordinal_duration(chip, TPM2_CC_SELF_TEST)); - rc = tpm2_start_selftest(chip, false); - if (rc) - return rc; - while (duration > 0) { - /* Attempt to read a PCR value */ - rc = tpm2_pcr_read(chip, 0, NULL); - if (rc < 0) - break; + cmd.header.in = tpm2_selftest_header; + cmd.params.selftest_in.full_test = 0; + + rc = tpm_transmit_cmd(chip, NULL, &cmd, TPM2_SELF_TEST_IN_SIZE, + 0, 0, "continue selftest"); if (rc != TPM2_RC_TESTING) break; From ee70bc1e7b63ac8023c9ff9475d8741e397316e7 Mon Sep 17 00:00:00 2001 From: Alexander Steffen Date: Fri, 8 Sep 2017 17:21:32 +0200 Subject: [PATCH 0413/1085] tpm-dev-common: Reject too short writes tpm_transmit() does not offer an explicit interface to indicate the number of valid bytes in the communication buffer. Instead, it relies on the commandSize field in the TPM header that is encoded within the buffer. Therefore, ensure that a) enough data has been written to the buffer, so that the commandSize field is present and b) the commandSize field does not announce more data than has been written to the buffer. This should have been fixed with CVE-2011-1161 long ago, but apparently a correct version of that patch never made it into the kernel. Cc: stable@vger.kernel.org Signed-off-by: Alexander Steffen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-dev-common.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c index 610638a80383..461bf0b8a094 100644 --- a/drivers/char/tpm/tpm-dev-common.c +++ b/drivers/char/tpm/tpm-dev-common.c @@ -110,6 +110,12 @@ ssize_t tpm_common_write(struct file *file, const char __user *buf, return -EFAULT; } + if (in_size < 6 || + in_size < be32_to_cpu(*((__be32 *) (priv->data_buffer + 2)))) { + mutex_unlock(&priv->buffer_mutex); + return -EINVAL; + } + /* atomic tpm command send and result receive. We only hold the ops * lock during this period so that the tpm can be unregistered even if * the char dev is held open. From 171360d7800c19622dbdaf202ed6f48ff24a5ae2 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 13 Sep 2017 09:58:49 -0700 Subject: [PATCH 0414/1085] tpm: fix type of a local variable in tpm2_get_cc_attrs_tbl() The local variable 'attrs' should have the type __be32 instead of u32. 
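The point of the __be32 type is that wire-format and CPU-format values must never mix silently; with the real __bitwise annotation, sparse flags any use of a __be32 that lacks an explicit be32_to_cpu(). A user-space sketch of the discipline, with ntohl() standing in for be32_to_cpu() and the typedef merely documenting intent (it does not enforce anything the way sparse does):

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    typedef uint32_t be32_model;    /* big-endian on the wire */

    /* convert exactly once, at the boundary */
    static uint32_t read_handle_model(const be32_model *wire)
    {
        return ntohl(*wire);
    }

    int main(void)
    {
        be32_model wire = htonl(0x80000001);  /* transient-handle style value */
        uint32_t cpu = read_handle_model(&wire);

        printf("cpu value %#x, handle type byte %#x\n", cpu, cpu >> 24);
        return 0;
    }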
Fixes: 58472f5cd4f6 ("tpm: validate TPM 2.0 commands") Signed-off-by: Jarkko Sakkinen Reviewed-by: Jason Gunthorpe --- drivers/char/tpm/tpm2-cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 70ee32816c48..f40d20671a78 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -980,7 +980,7 @@ static int tpm2_get_cc_attrs_tbl(struct tpm_chip *chip) { struct tpm_buf buf; u32 nr_commands; - u32 *attrs; + __be32 *attrs; u32 cc; int i; int rc; @@ -1020,7 +1020,7 @@ static int tpm2_get_cc_attrs_tbl(struct tpm_chip *chip) chip->nr_commands = nr_commands; - attrs = (u32 *)&buf.data[TPM_HEADER_SIZE + 9]; + attrs = (__be32 *)&buf.data[TPM_HEADER_SIZE + 9]; for (i = 0; i < nr_commands; i++, attrs++) { chip->cc_attrs_tbl[i] = be32_to_cpup(attrs); cc = chip->cc_attrs_tbl[i] & 0xFFFF; From 4557d4bedc9c4c51201a8e802a6671ed8dfc0f41 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 13 Sep 2017 10:04:35 -0700 Subject: [PATCH 0415/1085] tpm: fix type of a local variable in tpm2_map_command() The local variable 'handle' should have the type __be32 instead of u32. Fixes: 745b361e989a ("tpm: infrastructure for TPM spaces") Signed-off-by: Jarkko Sakkinen Reviewed-by: Jason Gunthorpe --- drivers/char/tpm/tpm2-space.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/tpm/tpm2-space.c b/drivers/char/tpm/tpm2-space.c index e2e059d8ffec..4e4014eabdb9 100644 --- a/drivers/char/tpm/tpm2-space.c +++ b/drivers/char/tpm/tpm2-space.c @@ -242,7 +242,7 @@ static int tpm2_map_command(struct tpm_chip *chip, u32 cc, u8 *cmd) struct tpm_space *space = &chip->work_space; unsigned int nr_handles; u32 attrs; - u32 *handle; + __be32 *handle; int i; i = tpm2_find_cc(chip, cc); @@ -252,7 +252,7 @@ static int tpm2_map_command(struct tpm_chip *chip, u32 cc, u8 *cmd) attrs = chip->cc_attrs_tbl[i]; nr_handles = (attrs >> TPM2_CC_ATTR_CHANDLES) & GENMASK(2, 0); - handle = (u32 *)&cmd[TPM_HEADER_SIZE]; + handle = (__be32 *)&cmd[TPM_HEADER_SIZE]; for (i = 0; i < nr_handles; i++, handle++) { if ((be32_to_cpu(*handle) & 0xFF000000) == TPM2_HT_TRANSIENT) { if (!tpm2_map_to_phandle(space, handle)) From 1a7a9b26c69d92582de53061b3a18740043bef29 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 13 Sep 2017 10:17:25 -0700 Subject: [PATCH 0416/1085] tpm: fix type of a local variables in tpm_tis_spi.c Use __le32 type for data in that format. 
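The mechanics of this kind of fix are worth spelling out: the raw bus bytes are read into a local variable of the correct wire type and converted once, instead of converting a value in place through a mistyped pointer. A sketch using glibc's le32toh()/htole32() in place of le32_to_cpu()/cpu_to_le32(), with uint32_t standing in for __le32:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <endian.h>     /* le32toh/htole32 (glibc) */

    static int read32_model(const uint8_t *bus_bytes, uint32_t *result)
    {
        uint32_t result_le;     /* stands in for __le32 */

        memcpy(&result_le, bus_bytes, sizeof(result_le)); /* raw bus read */
        *result = le32toh(result_le);   /* single explicit conversion */
        return 0;
    }

    int main(void)
    {
        uint32_t wire = htole32(0xdeadbeef);
        uint32_t value;

        read32_model((const uint8_t *)&wire, &value);
        printf("value=%#x\n", value);
        return 0;
    }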
Fixes: 0edbfea537d1 ("tpm/tpm_tis_spi: Add support for spi phy") Signed-off-by: Jarkko Sakkinen Reviewed-by: Jason Gunthorpe --- drivers/char/tpm/tpm_tis_spi.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_spi.c b/drivers/char/tpm/tpm_tis_spi.c index 8ab0bd8445f6..424ff2fde1f2 100644 --- a/drivers/char/tpm/tpm_tis_spi.c +++ b/drivers/char/tpm/tpm_tis_spi.c @@ -152,29 +152,40 @@ static int tpm_tis_spi_write_bytes(struct tpm_tis_data *data, u32 addr, static int tpm_tis_spi_read16(struct tpm_tis_data *data, u32 addr, u16 *result) { + __le16 result_le; int rc; - rc = data->phy_ops->read_bytes(data, addr, sizeof(u16), (u8 *)result); + rc = data->phy_ops->read_bytes(data, addr, sizeof(u16), + (u8 *)&result_le); if (!rc) - *result = le16_to_cpu(*result); + *result = le16_to_cpu(result_le); + return rc; } static int tpm_tis_spi_read32(struct tpm_tis_data *data, u32 addr, u32 *result) { + __le32 result_le; int rc; - rc = data->phy_ops->read_bytes(data, addr, sizeof(u32), (u8 *)result); + rc = data->phy_ops->read_bytes(data, addr, sizeof(u32), + (u8 *)&result_le); if (!rc) - *result = le32_to_cpu(*result); + *result = le32_to_cpu(result_le); + return rc; } static int tpm_tis_spi_write32(struct tpm_tis_data *data, u32 addr, u32 value) { - value = cpu_to_le32(value); - return data->phy_ops->write_bytes(data, addr, sizeof(u32), - (u8 *)&value); + __le32 value_le; + int rc; + + value_le = cpu_to_le32(value); + rc = data->phy_ops->write_bytes(data, addr, sizeof(u32), + (u8 *)&value_le); + + return rc; } static const struct tpm_tis_phy_ops tpm_spi_phy_ops = { From 2d56c71835acc9d69b402f7027091920306daf83 Mon Sep 17 00:00:00 2001 From: Ruben Roy Date: Tue, 26 Sep 2017 13:58:57 +0000 Subject: [PATCH 0417/1085] tpm: fix duplicate inline declaration specifier This commit fixes the duplicate inline declaration specifier in tpm2_rc_value which caused a warning Signed-off-by: Ruben Roy Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index b50e92fbca31..528cffbd49d3 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -544,7 +544,7 @@ static inline void tpm_add_ppi(struct tpm_chip *chip) } #endif -static inline inline u32 tpm2_rc_value(u32 rc) +static inline u32 tpm2_rc_value(u32 rc) { return (rc & BIT(7)) ? 
rc & 0xff : rc; } From 33957a104c86985f087b08a363590a948c9ff08c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Lefaure?= Date: Sun, 1 Oct 2017 15:30:51 -0400 Subject: [PATCH 0418/1085] tpm, tpm_tis: use ARRAY_SIZE() to define TIS_HID_USR_IDX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jérémy Lefaure Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index ebd0e75a3e4d..e2d1055fb814 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "tpm.h" #include "tpm_tis_core.h" @@ -365,7 +366,7 @@ static struct pnp_driver tis_pnp_driver = { }, }; -#define TIS_HID_USR_IDX sizeof(tpm_pnp_tbl)/sizeof(struct pnp_device_id) -2 +#define TIS_HID_USR_IDX (ARRAY_SIZE(tpm_pnp_tbl) - 2) module_param_string(hid, tpm_pnp_tbl[TIS_HID_USR_IDX].id, sizeof(tpm_pnp_tbl[TIS_HID_USR_IDX].id), 0444); MODULE_PARM_DESC(hid, "Set additional specific HID for this driver to probe"); From 57b8b1a1856adaa849d02d547411a553a531022b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Oct 2017 19:39:35 +0200 Subject: [PATCH 0419/1085] x86/cpuid: Prevent out of bound access in do_clear_cpu_cap() do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature dependencies. That bitmap is sized NCAPINTS * sizeof(u32) * 8 bits. The possible 'features' which can be handed in are larger than this, because after the capabilities the bug 'feature' bits occupy another 32 bits. Not really obvious... So clearing any of the misfeature bits, as 32bit does for the F00F bug, accesses that bitmap out of bounds, thereby corrupting the stack. Size the bitmap properly and add a sanity check to catch accidental out-of-bounds access.
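The shape of the fix in stand-alone form: size the bitmap for every bit that can legally be handed in (capability words plus bug words) and refuse anything beyond that range instead of writing past the array. All constants below are illustrative, not the kernel's actual values:

    #include <stdio.h>

    #define NCAPINTS_MODEL          19  /* capability words, illustrative */
    #define NBUGINTS_MODEL          1   /* bug-bit words */
    #define MAX_FEATURE_BITS_MODEL  ((NCAPINTS_MODEL + NBUGINTS_MODEL) * 32)

    static unsigned long disable_map[(MAX_FEATURE_BITS_MODEL + 63) / 64];

    static int clear_cpu_cap_model(unsigned int feature)
    {
        if (feature >= MAX_FEATURE_BITS_MODEL) {
            fprintf(stderr, "feature %u out of range\n", feature);
            return -1;  /* would have corrupted the stack before the fix */
        }
        disable_map[feature / 64] |= 1UL << (feature % 64);
        return 0;
    }

    int main(void)
    {
        /* a "bug" bit above the capability words now fits the map */
        clear_cpu_cap_model(NCAPINTS_MODEL * 32 + 5);
        /* and a wild value is rejected instead of scribbling */
        clear_cpu_cap_model(MAX_FEATURE_BITS_MODEL + 1);
        return 0;
    }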
Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop --- arch/x86/kernel/cpu/cpuid-deps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e48eb7313120..c1d49842a411 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) __clear_cpu_cap(c, feature); } +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) { - bool changed; - DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; clear_feature(c, feature); From 5cdda5117e125e0dbb020425cc55a4c143c6febc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Oct 2017 17:24:28 +0200 Subject: [PATCH 0420/1085] locking/static_keys: Improve uninitialized key warning Right now it says: static_key_disable_cpuslocked used before call to jump_label_init ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:161 static_key_disable_cpuslocked+0x68/0x70 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.14.0-rc5+ #1 Hardware name: SGI.COM C2112-4GP3/X10DRT-P-Series, BIOS 2.0a 05/09/2016 task: ffffffff81c0e480 task.stack: ffffffff81c00000 RIP: 0010:static_key_disable_cpuslocked+0x68/0x70 RSP: 0000:ffffffff81c03ef0 EFLAGS: 00010096 ORIG_RAX: 0000000000000000 RAX: 0000000000000041 RBX: ffffffff81c32680 RCX: ffffffff81c5cbf8 RDX: 0000000000000001 RSI: 0000000000000092 RDI: 0000000000000002 RBP: ffff88807fffd240 R08: 726f666562206465 R09: 0000000000000136 R10: 0000000000000000 R11: 696e695f6c656261 R12: ffffffff82158900 R13: ffffffff8215f760 R14: 0000000000000001 R15: 0000000000000008 FS: 0000000000000000(0000) GS:ffff883f7f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff88807ffff000 CR3: 0000000001c09000 CR4: 00000000000606b0 Call Trace: static_key_disable+0x16/0x20 start_kernel+0x15a/0x45d ? load_ucode_intel_bsp+0x11/0x2d secondary_startup_64+0xa5/0xb0 Code: 48 c7 c7 a0 15 cf 81 e9 47 53 4b 00 48 89 df e8 5f fc ff ff eb e8 48 c7 c6 \ c0 97 83 81 48 c7 c7 d0 ff a2 81 31 c0 e8 c5 9d f5 ff <0f> ff eb a7 0f ff eb \ b0 e8 eb a2 4b 00 53 48 89 fb e8 42 0e f0 but it doesn't tell me which key it is. So dump the key's name too: static_key_disable_cpuslocked(): static key 'virt_spin_lock_key' used before call to jump_label_init() And that makes pinpointing which key is causing that a lot easier. 
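For reference, '%pS' asks printk to resolve the pointer through kallsyms, so handing the key's address to WARN() is all that is needed to name it. A hypothetical trigger (key name made up for illustration):

    static DEFINE_STATIC_KEY_TRUE(example_key);    /* hypothetical */

    void too_early(void)
    {
        /* Runs before jump_label_init(): the WARN now reads
         * "static_key_disable(): static key 'example_key' used
         * before call to jump_label_init()"
         */
        static_key_disable(&example_key.key);
    }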
Signed-off-by: Borislav Petkov Reviewed-by: Steven Rostedt (VMware) Cc: Boris Ostrovsky Cc: Hannes Frederic Sowa Cc: Jason Baron Cc: Juergen Gross Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171018152428.ffjgak4o25f7ept6@pd.tnic Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 14 +++++++------- include/linux/jump_label_ratelimit.h | 6 +++--- kernel/jump_label.c | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index cd5861651b17..979a2f2d529b 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -81,9 +81,9 @@ extern bool static_key_initialized; -#define STATIC_KEY_CHECK_USE() WARN(!static_key_initialized, \ - "%s used before call to jump_label_init", \ - __func__) +#define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized, \ + "%s(): static key '%pS' used before call to jump_label_init()", \ + __func__, (key)) #ifdef HAVE_JUMP_LABEL @@ -211,13 +211,13 @@ static __always_inline bool static_key_true(struct static_key *key) static inline void static_key_slow_inc(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); atomic_inc(&key->enabled); } static inline void static_key_slow_dec(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); atomic_dec(&key->enabled); } @@ -236,7 +236,7 @@ static inline int jump_label_apply_nops(struct module *mod) static inline void static_key_enable(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); @@ -247,7 +247,7 @@ static inline void static_key_enable(struct static_key *key) static inline void static_key_disable(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h index 23da3af459fe..93086df0a847 100644 --- a/include/linux/jump_label_ratelimit.h +++ b/include/linux/jump_label_ratelimit.h @@ -24,18 +24,18 @@ struct static_key_deferred { }; static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); static_key_slow_dec(&key->key); } static inline void static_key_deferred_flush(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); } static inline void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); } #endif /* HAVE_JUMP_LABEL */ #endif /* _LINUX_JUMP_LABEL_RATELIMIT_H */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0bf2e8f5244a..8ff4ca4665ff 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -83,7 +83,7 @@ static void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); /* * Careful if we get concurrent static_key_slow_inc() calls; @@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) >
0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); @@ -224,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work) void static_key_slow_dec(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key, 0, NULL); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_deferred(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(&key->key, key->timeout, &key->work); } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); void static_key_deferred_flush(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); flush_delayed_work(&key->work); } EXPORT_SYMBOL_GPL(static_key_deferred_flush); @@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } From 7339605aa1b5fc551eb23a9563b8b84ba33632f0 Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Thu, 19 Oct 2017 08:41:29 +0200 Subject: [PATCH 0421/1085] MAINTAINERS: Split Cavium EDAC entry and add myself Split the Cavium EDAC entry into MIPS and ARM drivers because they have different maintainers and mailing lists. Add myself as additional maintainer to the ThunderX driver. Signed-off-by: Jan Glauber Cc: David Daney Cc: Ralf Baechle Cc: Sergey Temerkhanov Cc: linux-edac Link: http://lkml.kernel.org/r/20171019064129.5064-1-jglauber@cavium.com Signed-off-by: Borislav Petkov --- MAINTAINERS | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..0eedf0bd1b7d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4895,13 +4895,19 @@ L: linux-edac@vger.kernel.org S: Maintained F: drivers/edac/highbank* -EDAC-CAVIUM +EDAC-CAVIUM OCTEON M: Ralf Baechle M: David Daney L: linux-edac@vger.kernel.org L: linux-mips@linux-mips.org S: Supported F: drivers/edac/octeon_edac* + +EDAC-CAVIUM THUNDERX +M: David Daney +M: Jan Glauber +L: linux-edac@vger.kernel.org +S: Supported F: drivers/edac/thunderx_edac* EDAC-CORE From a8e9b186f153a44690ad0363a56716e7077ad28c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 16 Oct 2017 12:40:29 -0500 Subject: [PATCH 0422/1085] EDAC, sb_edac: Fix missing break in switch Add missing break statement in order to prevent the code from falling through. Signed-off-by: Gustavo A. R. 
Silva Cc: Qiuxu Zhuo Cc: linux-edac Link: http://lkml.kernel.org/r/20171016174029.GA19757@embeddedor.com Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 72b98a081d2b..f34430f99fd8 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -2485,6 +2485,7 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci, case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA: case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA: pvt->pci_ta = pdev; + break; case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS: case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS: pvt->pci_ras = pdev; From f1d783585486c7c612f277c2a6f0c9bb5a67e463 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 Oct 2017 10:44:54 +0900 Subject: [PATCH 0423/1085] irqdomain: Move revmap_trees_mutex to struct irq_domain The revmap_trees_mutex protects domain->revmap_tree. There is no need to make it global because the revmap_trees of two different domains can safely be modified concurrently. Having said that, this would not be an actual bottleneck because the interrupt map/unmap does not occur very often. Rather, the motivation is to tidy up the code from a data structure point of view. Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 2 ++ kernel/irq/irqdomain.c | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 7d0c6c144708..df162f7a4aad 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -32,6 +32,7 @@ #include #include #include +#include <linux/mutex.h> #include struct device_node; @@ -176,6 +177,7 @@ struct irq_domain { unsigned int revmap_direct_max_irq; unsigned int revmap_size; struct radix_tree_root revmap_tree; + struct mutex revmap_tree_mutex; unsigned int linear_revmap[]; }; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b50f737574ae..31d0f9ff4f00 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -21,7 +21,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; static void irq_domain_check_hierarchy(struct irq_domain *domain); @@ -211,6 +210,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); + mutex_init(&domain->revmap_tree_mutex); domain->ops = ops; domain->host_data = host_data; domain->hwirq_max = hwirq_max; @@ -462,9 +462,9 @@ static void irq_domain_clear_mapping(struct irq_domain *domain, if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { - mutex_lock(&revmap_trees_mutex); + mutex_lock(&domain->revmap_tree_mutex); radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&domain->revmap_tree_mutex); } } @@ -475,9 +475,9 @@ static void irq_domain_set_mapping(struct irq_domain *domain, if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = irq_data->irq; } else { - mutex_lock(&revmap_trees_mutex); + mutex_lock(&domain->revmap_tree_mutex); radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&domain->revmap_tree_mutex); } } @@ -1459,11 +1459,11 @@ static void irq_domain_fix_revmap(struct irq_data *d) return; /* Not using radix tree. */ /* Fix up the revmap.
*/ - mutex_lock(&revmap_trees_mutex); + mutex_lock(&d->domain->revmap_tree_mutex); slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&d->domain->revmap_tree_mutex); } /** From d03cc2d8aed3bfbdfc1f52e79054e0013b5721ca Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Sep 2017 21:20:41 +0900 Subject: [PATCH 0424/1085] irqdomain: Add __rcu annotations to radix tree slot Fix the 'different address spaces' warnings reported by sparse. kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces) kernel/irq/irqdomain.c:1463:14: expected void **slot kernel/irq/irqdomain.c:1463:14: got void [noderef] ** kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces) kernel/irq/irqdomain.c:1465:66: expected void [noderef] **slot kernel/irq/irqdomain.c:1465:66: got void **slot Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 31d0f9ff4f00..fbbf34293b17 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void **slot; + void __rcu **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -1453,7 +1453,7 @@ out_free_desc: /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { - void **slot; + void __rcu **slot; if (d->hwirq < d->domain->revmap_size) return; /* Not using radix tree. */ From bea173e5acd73d536dd234b34328cd52c1cadaab Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 6 Oct 2017 13:51:32 +0200 Subject: [PATCH 0425/1085] dt-bindings: irqchip: renesas-irqc: Document R-Car M3-W, V3M, D3 support Document support for the Interrupt Controller for External Devices (INTC-EX) in the Renesas R-Car M3-W (r8a7796), V3M (r8a77970), and D3 (r8a77995) SoCs. No driver update is needed. Reviewed-by: Simon Horman Signed-off-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier --- .../devicetree/bindings/interrupt-controller/renesas,irqc.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt index e3f052d8c11a..33c9a10fdc91 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt @@ -13,6 +13,9 @@ Required properties: - "renesas,irqc-r8a7793" (R-Car M2-N) - "renesas,irqc-r8a7794" (R-Car E2) - "renesas,intc-ex-r8a7795" (R-Car H3) + - "renesas,intc-ex-r8a7796" (R-Car M3-W) + - "renesas,intc-ex-r8a77970" (R-Car V3M) + - "renesas,intc-ex-r8a77995" (R-Car D3) - #interrupt-cells: has to be <2>: an interrupt index and flags, as defined in interrupts.txt in this directory - clocks: Must contain a reference to the functional clock. From 42a5968c0ae8f19906e16fa34ea9bdb6f5095166 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 4 Oct 2017 14:17:58 +0200 Subject: [PATCH 0426/1085] irqchip/renesas-intc-irqpin: Use of_device_get_match_data() helper Use the of_device_get_match_data() helper instead of open coding.
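For context, the helper folds the usual two-step table lookup into a single call; a generic before/after sketch (the 'my_dt_ids' table name is illustrative):

    /* open coded */
    const struct of_device_id *of_id = of_match_device(my_dt_ids, dev);
    const void *data = of_id ? of_id->data : NULL;

    /* with the helper */
    const void *data = of_device_get_match_data(dev);

Both forms yield NULL when there is no match (or no match data), so a single NULL check on 'data' covers all cases.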
Acked-by: Simon Horman Acked-by: Rob Herring Signed-off-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-renesas-intc-irqpin.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c index 713177d97c7a..06f29cf5018a 100644 --- a/drivers/irqchip/irq-renesas-intc-irqpin.c +++ b/drivers/irqchip/irq-renesas-intc-irqpin.c @@ -389,9 +389,8 @@ MODULE_DEVICE_TABLE(of, intc_irqpin_dt_ids); static int intc_irqpin_probe(struct platform_device *pdev) { - const struct intc_irqpin_config *config = NULL; + const struct intc_irqpin_config *config; struct device *dev = &pdev->dev; - const struct of_device_id *of_id; struct intc_irqpin_priv *p; struct intc_irqpin_iomem *i; struct resource *io[INTC_IRQPIN_REG_NR]; @@ -422,11 +421,9 @@ static int intc_irqpin_probe(struct platform_device *pdev) p->pdev = pdev; platform_set_drvdata(pdev, p); - of_id = of_match_device(intc_irqpin_dt_ids, dev); - if (of_id && of_id->data) { - config = of_id->data; + config = of_device_get_match_data(dev); + if (config) p->needs_clk = config->needs_clk; - } p->clk = devm_clk_get(dev, NULL); if (IS_ERR(p->clk)) { From 49aa6ef0b43912ead9cc5525dca182534d1d6676 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Mon, 18 Sep 2017 17:59:58 -0700 Subject: [PATCH 0427/1085] irqchip/brcmstb-l2: Remove some processing from the handler Saving the generic chip pointer in the brcmstb_l2_intc_data prevents the need to call irq_get_domain_generic_chip(). Also don't need to save parent_irq and base there since local variables in the brcmstb_l2_intc_of_init() function are just as good. The handle_edge_irq flow or chained_irq_enter takes care of the acknowledgment of the interrupt so it is redundant to clear it in brcmstb_l2_intc_irq_handle(). irq_linear_revmap() is a fast path equivalent of irq_find_mapping() that is appropriate to use for domain controllers of this type. Defining irq_mask_ack is slightly more efficient than just implementing irq_mask and irq_ack separately. Reviewed-by: Florian Fainelli Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-brcmstb-l2.c | 72 +++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index b009b916a292..48bd1a36c7d4 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -1,7 +1,7 @@ /* * Generic Broadcom Set Top Box Level 2 Interrupt controller driver * - * Copyright (C) 2014 Broadcom Corporation + * Copyright (C) 2014-2017 Broadcom * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -41,25 +41,49 @@ /* L2 intc private data structure */ struct brcmstb_l2_intc_data { - int parent_irq; - void __iomem *base; struct irq_domain *domain; + struct irq_chip_generic *gc; bool can_wake; u32 saved_mask; /* for suspend/resume */ }; +/** + * brcmstb_l2_mask_and_ack - Mask and ack pending interrupt + * @d: irq_data + * + * Chip has separate enable/disable registers instead of a single mask + * register and pending interrupt is acknowledged by setting a bit. + * + * Note: This function is generic and could easily be added to the + * generic irqchip implementation if there ever becomes a will to do so. + * Perhaps with a name like irq_gc_mask_disable_and_ack_set(). 
+ * + * e.g.: https://patchwork.kernel.org/patch/9831047/ + */ +static void brcmstb_l2_mask_and_ack(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; + + irq_gc_lock(gc); + irq_reg_writel(gc, mask, ct->regs.disable); + *ct->mask_cache &= ~mask; + irq_reg_writel(gc, mask, ct->regs.ack); + irq_gc_unlock(gc); +} + static void brcmstb_l2_intc_irq_handle(struct irq_desc *desc) { struct brcmstb_l2_intc_data *b = irq_desc_get_handler_data(desc); - struct irq_chip_generic *gc = irq_get_domain_generic_chip(b->domain, 0); struct irq_chip *chip = irq_desc_get_chip(desc); unsigned int irq; u32 status; chained_irq_enter(chip, desc); - status = irq_reg_readl(gc, CPU_STATUS) & - ~(irq_reg_readl(gc, CPU_MASK_STATUS)); + status = irq_reg_readl(b->gc, CPU_STATUS) & + ~(irq_reg_readl(b->gc, CPU_MASK_STATUS)); if (status == 0) { raw_spin_lock(&desc->lock); @@ -70,10 +94,8 @@ static void brcmstb_l2_intc_irq_handle(struct irq_desc *desc) do { irq = ffs(status) - 1; - /* ack at our level */ - irq_reg_writel(gc, 1 << irq, CPU_CLEAR); status &= ~(1 << irq); - generic_handle_irq(irq_find_mapping(b->domain, irq)); + generic_handle_irq(irq_linear_revmap(b->domain, irq)); } while (status); out: chained_irq_exit(chip, desc); @@ -116,32 +138,33 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; struct brcmstb_l2_intc_data *data; - struct irq_chip_generic *gc; struct irq_chip_type *ct; int ret; unsigned int flags; + int parent_irq; + void __iomem *base; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; - data->base = of_iomap(np, 0); - if (!data->base) { + base = of_iomap(np, 0); + if (!base) { pr_err("failed to remap intc L2 registers\n"); ret = -ENOMEM; goto out_free; } /* Disable all interrupts by default */ - writel(0xffffffff, data->base + CPU_MASK_SET); + writel(0xffffffff, base + CPU_MASK_SET); /* Wakeup interrupts may be retained from S5 (cold boot) */ data->can_wake = of_property_read_bool(np, "brcm,irq-can-wake"); if (!data->can_wake) - writel(0xffffffff, data->base + CPU_CLEAR); + writel(0xffffffff, base + CPU_CLEAR); - data->parent_irq = irq_of_parse_and_map(np, 0); - if (!data->parent_irq) { + parent_irq = irq_of_parse_and_map(np, 0); + if (!parent_irq) { pr_err("failed to find parent interrupt\n"); ret = -EINVAL; goto out_unmap; @@ -170,18 +193,19 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, } /* Set the IRQ chaining logic */ - irq_set_chained_handler_and_data(data->parent_irq, + irq_set_chained_handler_and_data(parent_irq, brcmstb_l2_intc_irq_handle, data); - gc = irq_get_domain_generic_chip(data->domain, 0); - gc->reg_base = data->base; - gc->private = data; - ct = gc->chip_types; + data->gc = irq_get_domain_generic_chip(data->domain, 0); + data->gc->reg_base = base; + data->gc->private = data; + ct = data->gc->chip_types; ct->chip.irq_ack = irq_gc_ack_set_bit; ct->regs.ack = CPU_CLEAR; ct->chip.irq_mask = irq_gc_mask_disable_reg; + ct->chip.irq_mask_ack = brcmstb_l2_mask_and_ack; ct->regs.disable = CPU_MASK_SET; ct->chip.irq_unmask = irq_gc_unmask_enable_reg; @@ -195,19 +219,19 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, /* This IRQ chip can wake the system, set all child interrupts * in wake_enabled mask */ - gc->wake_enabled = 0xffffffff; + data->gc->wake_enabled = 0xffffffff; ct->chip.irq_set_wake = irq_gc_set_wake; } pr_info("registered L2 
intc (mem: 0x%p, parent irq: %d)\n", - data->base, data->parent_irq); + base, parent_irq); return 0; out_free_domain: irq_domain_remove(data->domain); out_unmap: - iounmap(data->base); + iounmap(base); out_free: kfree(data); return ret; From 8480ca477e916e748c908a19e88d0c52a8f4d8fa Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Mon, 18 Sep 2017 17:59:59 -0700 Subject: [PATCH 0428/1085] irqchip/brcmstb-l2: Abstract register accesses Added register block offsets to the brcmstb_l2_intc_data structure for the status and mask registers to support reading the active interrupts in an abstracted way. It seems like an irq_chip method should have been provided for this, but it's not there yet. Abstracted the implementation of the handler, suspend, and resume functions to not use any hard-coded register offsets. Reviewed-by: Florian Fainelli Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-brcmstb-l2.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index 48bd1a36c7d4..8d54cd7a090d 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -43,6 +43,8 @@ struct brcmstb_l2_intc_data { struct irq_domain *domain; struct irq_chip_generic *gc; + int status_offset; + int mask_offset; bool can_wake; u32 saved_mask; /* for suspend/resume */ }; @@ -82,8 +84,8 @@ static void brcmstb_l2_intc_irq_handle(struct irq_desc *desc) chained_irq_enter(chip, desc); - status = irq_reg_readl(b->gc, CPU_STATUS) & - ~(irq_reg_readl(b->gc, CPU_MASK_STATUS)); + status = irq_reg_readl(b->gc, b->status_offset) & + ~(irq_reg_readl(b->gc, b->mask_offset)); if (status == 0) { raw_spin_lock(&desc->lock); @@ -104,16 +106,17 @@ out: static void brcmstb_l2_intc_suspend(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); struct brcmstb_l2_intc_data *b = gc->private; irq_gc_lock(gc); /* Save the current mask */ - b->saved_mask = irq_reg_readl(gc, CPU_MASK_STATUS); + b->saved_mask = irq_reg_readl(gc, ct->regs.mask); if (b->can_wake) { /* Program the wakeup mask */ - irq_reg_writel(gc, ~gc->wake_active, CPU_MASK_SET); - irq_reg_writel(gc, gc->wake_active, CPU_MASK_CLEAR); + irq_reg_writel(gc, ~gc->wake_active, ct->regs.disable); + irq_reg_writel(gc, gc->wake_active, ct->regs.enable); } irq_gc_unlock(gc); } @@ -121,15 +124,19 @@ static void brcmstb_l2_intc_suspend(struct irq_data *d) static void brcmstb_l2_intc_resume(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); struct brcmstb_l2_intc_data *b = gc->private; irq_gc_lock(gc); - /* Clear unmasked non-wakeup interrupts */ - irq_reg_writel(gc, ~b->saved_mask & ~gc->wake_active, CPU_CLEAR); + if (ct->chip.irq_ack != irq_gc_noop) { + /* Clear unmasked non-wakeup interrupts */ + irq_reg_writel(gc, ~b->saved_mask & ~gc->wake_active, + ct->regs.ack); + } /* Restore the saved mask */ - irq_reg_writel(gc, b->saved_mask, CPU_MASK_SET); - irq_reg_writel(gc, ~b->saved_mask, CPU_MASK_CLEAR); + irq_reg_writel(gc, b->saved_mask, ct->regs.disable); + irq_reg_writel(gc, ~b->saved_mask, ct->regs.enable); irq_gc_unlock(gc); } @@ -199,6 +206,9 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, data->gc = irq_get_domain_generic_chip(data->domain, 0); data->gc->reg_base = base; data->gc->private = data; + data->status_offset = CPU_STATUS;
data->mask_offset = CPU_MASK_STATUS; + ct = data->gc->chip_types; ct->chip.irq_ack = irq_gc_ack_set_bit; @@ -207,6 +217,7 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, ct->chip.irq_mask = irq_gc_mask_disable_reg; ct->chip.irq_mask_ack = brcmstb_l2_mask_and_ack; ct->regs.disable = CPU_MASK_SET; + ct->regs.mask = CPU_MASK_STATUS; ct->chip.irq_unmask = irq_gc_unmask_enable_reg; ct->regs.enable = CPU_MASK_CLEAR; From c0ca7262088ebab67452a39f27979d3faa4762f0 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Mon, 18 Sep 2017 18:00:00 -0700 Subject: [PATCH 0429/1085] irqchip/brcmstb-l2: Add support for the BCM7271 L2 controller Add the initialization of the generic irq chip for the BCM7271 L2 interrupt controller. This controller only supports level interrupts and uses the "brcm,bcm7271-l2-intc" compatibility string. Acked-by: Rob Herring Reviewed-by: Florian Fainelli Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- .../interrupt-controller/brcm,l2-intc.txt | 3 +- drivers/irqchip/irq-brcmstb-l2.c | 86 ++++++++++++++----- 2 files changed, 66 insertions(+), 23 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt index 448273a30a11..36df06c5c567 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt @@ -2,7 +2,8 @@ Broadcom Generic Level 2 Interrupt Controller Required properties: -- compatible: should be "brcm,l2-intc" +- compatible: should be "brcm,l2-intc" for latched interrupt controllers + should be "brcm,bcm7271-l2-intc" for level interrupt controllers - reg: specifies the base physical address and size of the registers - interrupt-controller: identifies the node as an interrupt controller - #interrupt-cells: specifies the number of cells needed to encode an diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index 8d54cd7a090d..691d20eb0bec 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -31,13 +31,34 @@ #include #include -/* Register offsets in the L2 interrupt controller */ -#define CPU_STATUS 0x00 -#define CPU_SET 0x04 -#define CPU_CLEAR 0x08 -#define CPU_MASK_STATUS 0x0c -#define CPU_MASK_SET 0x10 -#define CPU_MASK_CLEAR 0x14 +struct brcmstb_intc_init_params { + irq_flow_handler_t handler; + int cpu_status; + int cpu_clear; + int cpu_mask_status; + int cpu_mask_set; + int cpu_mask_clear; +}; + +/* Register offsets in the L2 latched interrupt controller */ +static const struct brcmstb_intc_init_params l2_edge_intc_init = { + .handler = handle_edge_irq, + .cpu_status = 0x00, + .cpu_clear = 0x08, + .cpu_mask_status = 0x0c, + .cpu_mask_set = 0x10, + .cpu_mask_clear = 0x14 +}; + +/* Register offsets in the L2 level interrupt controller */ +static const struct brcmstb_intc_init_params l2_lvl_intc_init = { + .handler = handle_level_irq, + .cpu_status = 0x00, + .cpu_clear = -1, /* Register not present */ + .cpu_mask_status = 0x04, + .cpu_mask_set = 0x08, + .cpu_mask_clear = 0x0C +}; /* L2 intc private data structure */ struct brcmstb_l2_intc_data { @@ -128,7 +149,7 @@ static void brcmstb_l2_intc_resume(struct irq_data *d) struct brcmstb_l2_intc_data *b = gc->private; irq_gc_lock(gc); - if (ct->chip.irq_ack != irq_gc_noop) { + if (ct->chip.irq_ack) { /* Clear unmasked non-wakeup interrupts */ irq_reg_writel(gc, ~b->saved_mask & ~gc->wake_active, ct->regs.ack); @@ -141,7 
+162,9 @@ static void brcmstb_l2_intc_resume(struct irq_data *d) } static int __init brcmstb_l2_intc_of_init(struct device_node *np, - struct device_node *parent) + struct device_node *parent, + const struct brcmstb_intc_init_params + *init_params) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; struct brcmstb_l2_intc_data *data; @@ -163,12 +186,12 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, } /* Disable all interrupts by default */ - writel(0xffffffff, base + CPU_MASK_SET); + writel(0xffffffff, base + init_params->cpu_mask_set); /* Wakeup interrupts may be retained from S5 (cold boot) */ data->can_wake = of_property_read_bool(np, "brcm,irq-can-wake"); - if (!data->can_wake) - writel(0xffffffff, base + CPU_CLEAR); + if (!data->can_wake && (init_params->cpu_clear >= 0)) + writel(0xffffffff, base + init_params->cpu_clear); parent_irq = irq_of_parse_and_map(np, 0); if (!parent_irq) { @@ -193,7 +216,7 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, /* Allocate a single Generic IRQ chip for this node */ ret = irq_alloc_domain_generic_chips(data->domain, 32, 1, - np->full_name, handle_edge_irq, clr, 0, flags); + np->full_name, init_params->handler, clr, 0, flags); if (ret) { pr_err("failed to allocate generic irq chip\n"); goto out_free_domain; @@ -206,21 +229,26 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, data->gc = irq_get_domain_generic_chip(data->domain, 0); data->gc->reg_base = base; data->gc->private = data; - data->status_offset = CPU_STATUS; - data->mask_offset = CPU_MASK_STATUS; + data->status_offset = init_params->cpu_status; + data->mask_offset = init_params->cpu_mask_status; ct = data->gc->chip_types; - ct->chip.irq_ack = irq_gc_ack_set_bit; - ct->regs.ack = CPU_CLEAR; + if (init_params->cpu_clear >= 0) { + ct->regs.ack = init_params->cpu_clear; + ct->chip.irq_ack = irq_gc_ack_set_bit; + ct->chip.irq_mask_ack = brcmstb_l2_mask_and_ack; + } else { + /* No Ack - but still slightly more efficient to define this */ + ct->chip.irq_mask_ack = irq_gc_mask_disable_reg; + } ct->chip.irq_mask = irq_gc_mask_disable_reg; - ct->chip.irq_mask_ack = brcmstb_l2_mask_and_ack; - ct->regs.disable = CPU_MASK_SET; - ct->regs.mask = CPU_MASK_STATUS; + ct->regs.disable = init_params->cpu_mask_set; + ct->regs.mask = init_params->cpu_mask_status; ct->chip.irq_unmask = irq_gc_unmask_enable_reg; - ct->regs.enable = CPU_MASK_CLEAR; + ct->regs.enable = init_params->cpu_mask_clear; ct->chip.irq_suspend = brcmstb_l2_intc_suspend; ct->chip.irq_resume = brcmstb_l2_intc_resume; @@ -247,4 +275,18 @@ out_free: kfree(data); return ret; } -IRQCHIP_DECLARE(brcmstb_l2_intc, "brcm,l2-intc", brcmstb_l2_intc_of_init); + +int __init brcmstb_l2_edge_intc_of_init(struct device_node *np, + struct device_node *parent) +{ + return brcmstb_l2_intc_of_init(np, parent, &l2_edge_intc_init); +} +IRQCHIP_DECLARE(brcmstb_l2_intc, "brcm,l2-intc", brcmstb_l2_edge_intc_of_init); + +int __init brcmstb_l2_lvl_intc_of_init(struct device_node *np, + struct device_node *parent) +{ + return brcmstb_l2_intc_of_init(np, parent, &l2_lvl_intc_init); +} +IRQCHIP_DECLARE(bcm7271_l2_intc, "brcm,bcm7271-l2-intc", + brcmstb_l2_lvl_intc_of_init); From eda0d04acc5e317da675ee93a3f09e7c2e2fa592 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 6 Oct 2017 10:24:00 -0500 Subject: [PATCH 0430/1085] irqchip/gic-v3: Add support for Range Selector (RS) feature A new feature Range Selector (RS) has been added to GIC specification in order to support more than 16 CPUs at 
affinity level 0. New fields are introduced in SGI system registers (ICC_SGI0R_EL1, ICC_SGI1R_EL1 and ICC_ASGI1R_EL1) to relax an artificial limit of 16 at level 0. - A new RSS field in ICC_CTLR_EL3, ICC_CTLR_EL1 and ICV_CTLR_EL1: [18] - Range Selector Support (RSS) 0b0 = Targeted SGIs with affinity level 0 values of 0-15 are supported. 0b1 = Targeted SGIs with affinity level 0 values of 0-255 are supported. - A new RS field in ICC_SGI0R_EL1, ICC_SGI1R_EL1 and ICC_ASGI1R_EL1: [47:44] - RangeSelector (RS) which group of 16 TargetList[n] field TargetList[n] represents aff0 value ((RS*16)+n) When ICC_CTLR_EL3.RSS==0 or ICC_CTLR_EL1.RSS==0, RS is RES0. - A new RSS field in GICD_TYPER: [26] - Range Selector Support (RSS) 0b0 = Targeted SGIs with affinity level 0 values of 0-15 are supported. 0b1 = Targeted SGIs with affinity level 0 values of 0-255 are supported. Signed-off-by: Shanker Donthineni Signed-off-by: Marc Zyngier --- arch/arm/include/asm/arch_gicv3.h | 5 +++ arch/arm64/include/asm/arch_gicv3.h | 5 +++ drivers/irqchip/irq-gic-v3.c | 50 +++++++++++++++++++++++------ include/linux/irqchip/arm-gic-v3.h | 4 +++ 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index eee269321923..1070044f5c3f 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -196,6 +196,11 @@ static inline void gic_write_ctlr(u32 val) isb(); } +static inline u32 gic_read_ctlr(void) +{ + return read_sysreg(ICC_CTLR); +} + static inline void gic_write_grpen1(u32 val) { write_sysreg(val, ICC_IGRPEN1); diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index b7e3f74822da..9becba9ab392 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -87,6 +87,11 @@ static inline void gic_write_ctlr(u32 val) isb(); } +static inline u32 gic_read_ctlr(void) +{ + return read_sysreg_s(SYS_ICC_CTLR_EL1); +} + static inline void gic_write_grpen1(u32 val) { write_sysreg_s(val, SYS_ICC_IGRPEN1_EL1); diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index b5df99c6f680..b54b55597ffb 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -55,6 +55,7 @@ struct gic_chip_data { struct irq_domain *domain; u64 redist_stride; u32 nr_redist_regions; + bool has_rss; unsigned int irq_nr; struct partition_desc *ppi_descs[16]; }; @@ -63,7 +64,9 @@ static struct gic_chip_data gic_data __read_mostly; static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE; static struct gic_kvm_info gic_v3_kvm_info; +static DEFINE_PER_CPU(bool, has_rss); +#define MPIDR_RS(mpidr) (((mpidr) & 0xF0UL) >> 4) #define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K) @@ -526,6 +529,10 @@ static void gic_update_vlpi_properties(void) static void gic_cpu_sys_reg_init(void) { + int i, cpu = smp_processor_id(); + u64 mpidr = cpu_logical_map(cpu); + u64 need_rss = MPIDR_RS(mpidr); + /* * Need to check that the SRE bit has actually been set. If * not, it means that SRE is disabled at EL2. We're going to @@ -557,6 +564,30 @@ static void gic_cpu_sys_reg_init(void) /* ... and let's hit the road... 
*/ gic_write_grpen1(1); + + /* Keep the RSS capability status in per_cpu variable */ + per_cpu(has_rss, cpu) = !!(gic_read_ctlr() & ICC_CTLR_EL1_RSS); + + /* Check all the CPUs have capable of sending SGIs to other CPUs */ + for_each_online_cpu(i) { + bool have_rss = per_cpu(has_rss, i) && per_cpu(has_rss, cpu); + + need_rss |= MPIDR_RS(cpu_logical_map(i)); + if (need_rss && (!have_rss)) + pr_crit("CPU%d (%lx) can't SGI CPU%d (%lx), no RSS\n", + cpu, (unsigned long)mpidr, + i, (unsigned long)cpu_logical_map(i)); + } + + /** + * GIC spec says, when ICC_CTLR_EL1.RSS==1 and GICD_TYPER.RSS==0, + * writing ICC_ASGI1R_EL1 register with RS != 0 is a CONSTRAINED + * UNPREDICTABLE choice of : + * - The write is ignored. + * - The RS field is treated as 0. + */ + if (need_rss && (!gic_data.has_rss)) + pr_crit_once("RSS is required but GICD doesn't support it\n"); } static int gic_dist_supports_lpis(void) @@ -591,6 +622,9 @@ static void gic_cpu_init(void) #ifdef CONFIG_SMP +#define MPIDR_TO_SGI_RS(mpidr) (MPIDR_RS(mpidr) << ICC_SGI1R_RS_SHIFT) +#define MPIDR_TO_SGI_CLUSTER_ID(mpidr) ((mpidr) & ~0xFUL) + static int gic_starting_cpu(unsigned int cpu) { gic_cpu_init(); @@ -605,13 +639,6 @@ static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask, u16 tlist = 0; while (cpu < nr_cpu_ids) { - /* - * If we ever get a cluster of more than 16 CPUs, just - * scream and skip that CPU. - */ - if (WARN_ON((mpidr & 0xff) >= 16)) - goto out; - tlist |= 1 << (mpidr & 0xf); next_cpu = cpumask_next(cpu, mask); @@ -621,7 +648,7 @@ static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask, mpidr = cpu_logical_map(cpu); - if (cluster_id != (mpidr & ~0xffUL)) { + if (cluster_id != MPIDR_TO_SGI_CLUSTER_ID(mpidr)) { cpu--; goto out; } @@ -643,6 +670,7 @@ static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq) MPIDR_TO_SGI_AFFINITY(cluster_id, 2) | irq << ICC_SGI1R_SGI_ID_SHIFT | MPIDR_TO_SGI_AFFINITY(cluster_id, 1) | + MPIDR_TO_SGI_RS(cluster_id) | tlist << ICC_SGI1R_TARGET_LIST_SHIFT); pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val); @@ -663,7 +691,7 @@ static void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) smp_wmb(); for_each_cpu(cpu, mask) { - unsigned long cluster_id = cpu_logical_map(cpu) & ~0xffUL; + u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(cpu_logical_map(cpu)); u16 tlist; tlist = gic_compute_target_list(&cpu, mask, cluster_id); @@ -1007,6 +1035,10 @@ static int __init gic_init_bases(void __iomem *dist_base, goto out_free; } + gic_data.has_rss = !!(typer & GICD_TYPER_RSS); + pr_info("Distributor has %sRange Selector support\n", + gic_data.has_rss ? 
"" : "no "); + set_handle_irq(gic_handle_irq); gic_update_vlpi_properties(); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 1ea576c8126f..b8b59989bd73 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -68,6 +68,7 @@ #define GICD_CTLR_ENABLE_SS_G1 (1U << 1) #define GICD_CTLR_ENABLE_SS_G0 (1U << 0) +#define GICD_TYPER_RSS (1U << 26) #define GICD_TYPER_LPIS (1U << 17) #define GICD_TYPER_MBIS (1U << 16) @@ -459,6 +460,7 @@ #define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) #define ICC_CTLR_EL1_A3V_SHIFT 15 #define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) +#define ICC_CTLR_EL1_RSS (0x1 << 18) #define ICC_PMR_EL1_SHIFT 0 #define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) #define ICC_BPR0_EL1_SHIFT 0 @@ -547,6 +549,8 @@ #define ICC_SGI1R_AFFINITY_2_SHIFT 32 #define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 +#define ICC_SGI1R_RS_SHIFT 44 +#define ICC_SGI1R_RS_MASK (0xfULL << ICC_SGI1R_RS_SHIFT) #define ICC_SGI1R_AFFINITY_3_SHIFT 48 #define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) From fa1500191958660952a3a8466aad54003701a7b6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 17 Oct 2017 17:55:54 +0100 Subject: [PATCH 0431/1085] irqchip/gic-v3: Probe device ID space before quirks handling Before adding another SoC whose device ID space deviates from the value presented in the GIC ID registers, let's slightly refactor the code so that the ID registers are probed before that quirks handling executes. This allows us to move the device ID override into the quirk handler itself. Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 20e2b5fac7b9..fbb8eba2ac84 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1650,23 +1650,14 @@ static void its_free_tables(struct its_node *its) static int its_alloc_tables(struct its_node *its) { - u64 typer = gic_read_typer(its->base + GITS_TYPER); - u32 ids = GITS_TYPER_DEVBITS(typer); u64 shr = GITS_BASER_InnerShareable; u64 cache = GITS_BASER_RaWaWb; u32 psz = SZ_64K; int err, i; - if (its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_22375) { - /* - * erratum 22375: only alloc 8MB table size - * erratum 24313: ignore memory access type - */ - cache = GITS_BASER_nCnB; - ids = 0x14; /* 20 bits, 8MB */ - } - - its->device_ids = ids; + if (its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_22375) + /* erratum 24313: ignore memory access type */ + cache = GITS_BASER_nCnB; for (i = 0; i < GITS_BASER_NR_REGS; i++) { struct its_baser *baser = its->tables + i; @@ -2743,6 +2734,8 @@ static void __maybe_unused its_enable_quirk_cavium_22375(void *data) { struct its_node *its = data; + /* erratum 22375: only alloc 8MB table size */ + its->device_ids = 0x14; /* 20 bits, 8MB */ its->flags |= ITS_FLAGS_WORKAROUND_CAVIUM_22375; } @@ -2944,6 +2937,7 @@ static int __init its_probe_one(struct resource *res, its->base = its_base; its->phys_base = res->start; its->ite_size = GITS_TYPER_ITT_ENTRY_SIZE(typer); + its->device_ids = GITS_TYPER_DEVBITS(typer); its->is_v4 = !!(typer & GITS_TYPER_VLPIS); if (its->is_v4) { if (!(typer & GITS_TYPER_VMOVP)) { From 9d111d49106b61f5a652d5418e85d8741b1a0427 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 17 Oct 2017 
17:55:55 +0100 Subject: [PATCH 0432/1085] irqchip/gic: Make quirks matching conditional on init return value As it turns out, the IIDR is not sufficient to distinguish between GICv3 implementations when it comes to enabling quirks. So update the prototype of the init() hook to return a bool, and interpret a 'false' return value as no match, in which case the 'enabling workaround' log message should not be printed. Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-common.c | 5 +++-- drivers/irqchip/irq-gic-common.h | 2 +- drivers/irqchip/irq-gic-v3-its.c | 12 +++++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-common.c b/drivers/irqchip/irq-gic-common.c index 9ae71804b5dd..30017df5b54c 100644 --- a/drivers/irqchip/irq-gic-common.c +++ b/drivers/irqchip/irq-gic-common.c @@ -40,8 +40,9 @@ void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks, for (; quirks->desc; quirks++) { if (quirks->iidr != (quirks->mask & iidr)) continue; - quirks->init(data); - pr_info("GIC: enabling workaround for %s\n", quirks->desc); + if (quirks->init(data)) + pr_info("GIC: enabling workaround for %s\n", + quirks->desc); } } diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h index 205e5fddf6da..3919cd7c5285 100644 --- a/drivers/irqchip/irq-gic-common.h +++ b/drivers/irqchip/irq-gic-common.h @@ -23,7 +23,7 @@ struct gic_quirk { const char *desc; - void (*init)(void *data); + bool (*init)(void *data); u32 iidr; u32 mask; }; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fbb8eba2ac84..4d432804c2bc 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2730,28 +2730,34 @@ static int its_force_quiescent(void __iomem *base) } } -static void __maybe_unused its_enable_quirk_cavium_22375(void *data) +static bool __maybe_unused its_enable_quirk_cavium_22375(void *data) { struct its_node *its = data; /* erratum 22375: only alloc 8MB table size */ its->device_ids = 0x14; /* 20 bits, 8MB */ its->flags |= ITS_FLAGS_WORKAROUND_CAVIUM_22375; + + return true; } -static void __maybe_unused its_enable_quirk_cavium_23144(void *data) +static bool __maybe_unused its_enable_quirk_cavium_23144(void *data) { struct its_node *its = data; its->flags |= ITS_FLAGS_WORKAROUND_CAVIUM_23144; + + return true; } -static void __maybe_unused its_enable_quirk_qdf2400_e0065(void *data) +static bool __maybe_unused its_enable_quirk_qdf2400_e0065(void *data) { struct its_node *its = data; /* On QDF2400, the size of the ITE is 16Bytes */ its->ite_size = 16; + + return true; } static const struct gic_quirk its_quirks[] = { From 558b01654d92332d3b7b17bf773cdce8ce16ef46 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 17 Oct 2017 17:55:56 +0100 Subject: [PATCH 0433/1085] irqchip/gic-v3: Add workaround for Synquacer pre-ITS The Socionext Synquacer SoC's implementation of GICv3 has a so-called 'pre-ITS', which maps 32-bit writes targeted at a separate window of size '4 << device_id_bits' onto writes to GITS_TRANSLATER with device ID taken from bits [device_id_bits + 1:2] of the window offset. Writes that target GITS_TRANSLATER directly are reported as originating from device ID #0. So add a workaround for this. Given that this breaks isolation, clear the IRQ_DOMAIN_FLAG_MSI_REMAP flag as well. 
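Put differently, the pre-ITS turns the device ID into an address offset inside its window; a sketch of the doorbell address computation, mirroring the helper added below:

    /* the device ID occupies bits [device_id_bits + 1:2] of the window
     * offset, hence the shift by 2; a write to this address reaches the
     * ITS as if it came from 'dev_id'
     */
    u64 doorbell = pre_its_base + ((u64)dev_id << 2);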
Acked-by: Rob Herring Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- .../interrupt-controller/arm,gic-v3.txt | 4 ++ arch/arm64/Kconfig | 8 +++ drivers/irqchip/irq-gic-v3-its.c | 72 ++++++++++++++++++- 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt index 4c29cdab0ea5..c3e6092f3add 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt @@ -75,6 +75,10 @@ These nodes must have the following properties: - reg: Specifies the base physical address and size of the ITS registers. +Optional: +- socionext,synquacer-pre-its: (u32, u32) tuple describing the untranslated + address and size of the pre-ITS window. + The main GIC node must contain the appropriate #address-cells, #size-cells and ranges properties for the reg property of all ITS nodes. diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0df64a6a56d4..c4361dff2b74 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -539,6 +539,14 @@ config QCOM_QDF2400_ERRATUM_0065 If unsure, say Y. +config SOCIONEXT_SYNQUACER_PREITS + bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" + default y + help + Socionext Synquacer SoCs implement a separate h/w block to generate + MSI doorbell writes with non-zero values for the device ID. + + If unsure, say Y. endmenu diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 4d432804c2bc..54ea4e26c7a9 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -83,6 +83,8 @@ struct its_baser { u32 psz; }; +struct its_device; + /* * The ITS structure - contains most of the infrastructure, with the * top-level MSI domain, the command queue, the collections, and the @@ -97,11 +99,15 @@ struct its_node { struct its_cmd_block *cmd_write; struct its_baser tables[GITS_BASER_NR_REGS]; struct its_collection *collections; + struct fwnode_handle *fwnode_handle; + u64 (*get_msi_base)(struct its_device *its_dev); struct list_head its_device_list; u64 flags; u32 ite_size; u32 device_ids; int numa_node; + unsigned int msi_domain_flags; + u32 pre_its_base; /* for Socionext Synquacer */ bool is_v4; }; @@ -1095,6 +1101,13 @@ static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val, return IRQ_SET_MASK_OK_DONE; } +static u64 its_irq_get_msi_base(struct its_device *its_dev) +{ + struct its_node *its = its_dev->its; + + return its->phys_base + GITS_TRANSLATER; +} + static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); @@ -1102,7 +1115,7 @@ static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) u64 addr; its = its_dev->its; - addr = its->phys_base + GITS_TRANSLATER; + addr = its->get_msi_base(its_dev); msg->address_lo = lower_32_bits(addr); msg->address_hi = upper_32_bits(addr); @@ -2760,6 +2773,45 @@ static bool __maybe_unused its_enable_quirk_qdf2400_e0065(void *data) return true; } +static u64 its_irq_get_msi_base_pre_its(struct its_device *its_dev) +{ + struct its_node *its = its_dev->its; + + /* + * The Socionext Synquacer SoC has a so-called 'pre-ITS', + * which maps 32-bit writes targeted at a separate window of + * size '4 << device_id_bits' onto writes to GITS_TRANSLATER + * with device ID taken from bits [device_id_bits + 1:2] of + * the window 
offset. + */ + return its->pre_its_base + (its_dev->device_id << 2); +} + +static bool __maybe_unused its_enable_quirk_socionext_synquacer(void *data) +{ + struct its_node *its = data; + u32 pre_its_window[2]; + u32 ids; + + if (!fwnode_property_read_u32_array(its->fwnode_handle, + "socionext,synquacer-pre-its", + pre_its_window, + ARRAY_SIZE(pre_its_window))) { + + its->pre_its_base = pre_its_window[0]; + its->get_msi_base = its_irq_get_msi_base_pre_its; + + ids = ilog2(pre_its_window[1]) - 2; + if (its->device_ids > ids) + its->device_ids = ids; + + /* the pre-ITS breaks isolation, so disable MSI remapping */ + its->msi_domain_flags &= ~IRQ_DOMAIN_FLAG_MSI_REMAP; + return true; + } + return false; +} + static const struct gic_quirk its_quirks[] = { #ifdef CONFIG_CAVIUM_ERRATUM_22375 { @@ -2784,6 +2836,19 @@ static const struct gic_quirk its_quirks[] = { .mask = 0xffffffff, .init = its_enable_quirk_qdf2400_e0065, }, +#endif +#ifdef CONFIG_SOCIONEXT_SYNQUACER_PREITS + { + /* + * The Socionext Synquacer SoC incorporates ARM's own GIC-500 + * implementation, but with a 'pre-ITS' added that requires + * special handling in software. + */ + .desc = "ITS: Socionext Synquacer pre-ITS", + .iidr = 0x0001143b, + .mask = 0xffffffff, + .init = its_enable_quirk_socionext_synquacer, + }, #endif { } @@ -2813,7 +2878,7 @@ static int its_init_domain(struct fwnode_handle *handle, struct its_node *its) inner_domain->parent = its_parent; irq_domain_update_bus_token(inner_domain, DOMAIN_BUS_NEXUS); - inner_domain->flags |= IRQ_DOMAIN_FLAG_MSI_REMAP; + inner_domain->flags |= its->msi_domain_flags; info->ops = &its_msi_domain_ops; info->data = its; inner_domain->host_data = info; @@ -2967,6 +3032,9 @@ static int __init its_probe_one(struct resource *res, goto out_free_its; } its->cmd_write = its->cmd_base; + its->fwnode_handle = handle; + its->get_msi_base = its_irq_get_msi_base; + its->msi_domain_flags = IRQ_DOMAIN_FLAG_MSI_REMAP; its_enable_quirks(its); From a19b462f044b1bc5327f0bc871b46a03efb84363 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 4 Aug 2017 17:45:50 +0100 Subject: [PATCH 0434/1085] irqchip/gic-v3-its: Add post-mortem info on command timeout If the ITS stops processing commands, we're pretty much toasted as we cannot update the configuration anymore (and we're not even sure that the ITS still translates interrupts). If that happens, let's dump some basic information about the state of affairs before moving on. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 54ea4e26c7a9..0788de9a5407 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -690,9 +690,9 @@ static void its_flush_cmd(struct its_node *its, struct its_cmd_block *cmd) dsb(ishst); } -static void its_wait_for_range_completion(struct its_node *its, - struct its_cmd_block *from, - struct its_cmd_block *to) +static int its_wait_for_range_completion(struct its_node *its, + struct its_cmd_block *from, + struct its_cmd_block *to) { u64 rd_idx, from_idx, to_idx; u32 count = 1000000; /* 1s!
*/ @@ -713,12 +713,15 @@ static void its_wait_for_range_completion(struct its_node *its, count--; if (!count) { - pr_err_ratelimited("ITS queue timeout\n"); - return; + pr_err_ratelimited("ITS queue timeout (%llu %llu %llu)\n", + from_idx, to_idx, rd_idx); + return -1; } cpu_relax(); udelay(1); } + + return 0; } /* Warning, macro hell follows */ @@ -754,7 +757,8 @@ post: \ next_cmd = its_post_commands(its); \ raw_spin_unlock_irqrestore(&its->lock, flags); \ \ - its_wait_for_range_completion(its, cmd, next_cmd); \ + if (its_wait_for_range_completion(its, cmd, next_cmd)) \ + pr_err_ratelimited("ITS cmd %ps failed\n", builder); \ } static void its_build_sync_cmd(struct its_cmd_block *sync_cmd, From 67047f90d7dd886d3f505185a6d75517bdbd907c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 28 Jul 2017 21:16:58 +0100 Subject: [PATCH 0435/1085] irqchip/gic-v3-its: Pass its_node pointer to each command builder In order to be able to issue command variants depending on how broken an ITS is, let's pass the its pointer to all command building primitives. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 58 +++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0788de9a5407..4aedbdc62aa0 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -274,10 +274,12 @@ struct its_cmd_block { #define ITS_CMD_QUEUE_SZ SZ_64K #define ITS_CMD_QUEUE_NR_ENTRIES (ITS_CMD_QUEUE_SZ / sizeof(struct its_cmd_block)) -typedef struct its_collection *(*its_cmd_builder_t)(struct its_cmd_block *, +typedef struct its_collection *(*its_cmd_builder_t)(struct its_node *, + struct its_cmd_block *, struct its_cmd_desc *); -typedef struct its_vpe *(*its_cmd_vbuilder_t)(struct its_cmd_block *, +typedef struct its_vpe *(*its_cmd_vbuilder_t)(struct its_node *, + struct its_cmd_block *, struct its_cmd_desc *); static void its_mask_encode(u64 *raw_cmd, u64 val, int h, int l) @@ -381,7 +383,8 @@ static inline void its_fixup_cmd(struct its_cmd_block *cmd) cmd->raw_cmd[3] = cpu_to_le64(cmd->raw_cmd[3]); } -static struct its_collection *its_build_mapd_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_mapd_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { unsigned long itt_addr; @@ -401,7 +404,8 @@ static struct its_collection *its_build_mapd_cmd(struct its_cmd_block *cmd, return NULL; } -static struct its_collection *its_build_mapc_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_mapc_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { its_encode_cmd(cmd, GITS_CMD_MAPC); @@ -414,7 +418,8 @@ static struct its_collection *its_build_mapc_cmd(struct its_cmd_block *cmd, return desc->its_mapc_cmd.col; } -static struct its_collection *its_build_mapti_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_mapti_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { struct its_collection *col; @@ -433,7 +438,8 @@ static struct its_collection *its_build_mapti_cmd(struct its_cmd_block *cmd, return col; } -static struct its_collection *its_build_movi_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_movi_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { struct its_collection *col; @@ -451,7 +457,8 @@ static struct its_collection *its_build_movi_cmd(struct its_cmd_block *cmd, 
return col; } -static struct its_collection *its_build_discard_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_discard_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { struct its_collection *col; @@ -468,7 +475,8 @@ static struct its_collection *its_build_discard_cmd(struct its_cmd_block *cmd, return col; } -static struct its_collection *its_build_inv_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_inv_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { struct its_collection *col; @@ -485,7 +493,8 @@ static struct its_collection *its_build_inv_cmd(struct its_cmd_block *cmd, return col; } -static struct its_collection *its_build_int_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_int_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { struct its_collection *col; @@ -502,7 +511,8 @@ static struct its_collection *its_build_int_cmd(struct its_cmd_block *cmd, return col; } -static struct its_collection *its_build_clear_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_clear_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { struct its_collection *col; @@ -519,7 +529,8 @@ static struct its_collection *its_build_clear_cmd(struct its_cmd_block *cmd, return col; } -static struct its_collection *its_build_invall_cmd(struct its_cmd_block *cmd, +static struct its_collection *its_build_invall_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { its_encode_cmd(cmd, GITS_CMD_INVALL); @@ -530,7 +541,8 @@ static struct its_collection *its_build_invall_cmd(struct its_cmd_block *cmd, return NULL; } -static struct its_vpe *its_build_vinvall_cmd(struct its_cmd_block *cmd, +static struct its_vpe *its_build_vinvall_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { its_encode_cmd(cmd, GITS_CMD_VINVALL); @@ -541,7 +553,8 @@ static struct its_vpe *its_build_vinvall_cmd(struct its_cmd_block *cmd, return desc->its_vinvall_cmd.vpe; } -static struct its_vpe *its_build_vmapp_cmd(struct its_cmd_block *cmd, +static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { unsigned long vpt_addr; @@ -560,7 +573,8 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_cmd_block *cmd, return desc->its_vmapp_cmd.vpe; } -static struct its_vpe *its_build_vmapti_cmd(struct its_cmd_block *cmd, +static struct its_vpe *its_build_vmapti_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { u32 db; @@ -582,7 +596,8 @@ static struct its_vpe *its_build_vmapti_cmd(struct its_cmd_block *cmd, return desc->its_vmapti_cmd.vpe; } -static struct its_vpe *its_build_vmovi_cmd(struct its_cmd_block *cmd, +static struct its_vpe *its_build_vmovi_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { u32 db; @@ -604,7 +619,8 @@ static struct its_vpe *its_build_vmovi_cmd(struct its_cmd_block *cmd, return desc->its_vmovi_cmd.vpe; } -static struct its_vpe *its_build_vmovp_cmd(struct its_cmd_block *cmd, +static struct its_vpe *its_build_vmovp_cmd(struct its_node *its, + struct its_cmd_block *cmd, struct its_cmd_desc *desc) { its_encode_cmd(cmd, GITS_CMD_VMOVP); @@ -741,7 +757,7 @@ void name(struct its_node *its, \ raw_spin_unlock_irqrestore(&its->lock, flags); \ return; \ } \ - sync_obj = builder(cmd, desc); \ + sync_obj = builder(its, cmd, desc); \ 
its_flush_cmd(its, cmd); \ \ if (sync_obj) { \ @@ -749,7 +765,7 @@ void name(struct its_node *its, \ if (!sync_cmd) \ goto post; \ \ - buildfn(sync_cmd, sync_obj); \ + buildfn(its, sync_cmd, sync_obj); \ its_flush_cmd(its, sync_cmd); \ } \ \ @@ -761,7 +777,8 @@ post: \ pr_err_ratelimited("ITS cmd %ps failed\n", builder); \ } -static void its_build_sync_cmd(struct its_cmd_block *sync_cmd, +static void its_build_sync_cmd(struct its_node *its, + struct its_cmd_block *sync_cmd, struct its_collection *sync_col) { its_encode_cmd(sync_cmd, GITS_CMD_SYNC); @@ -773,7 +790,8 @@ static void its_build_sync_cmd(struct its_cmd_block *sync_cmd, static BUILD_SINGLE_CMD_FUNC(its_send_single_command, its_cmd_builder_t, struct its_collection, its_build_sync_cmd) -static void its_build_vsync_cmd(struct its_cmd_block *sync_cmd, +static void its_build_vsync_cmd(struct its_node *its, + struct its_cmd_block *sync_cmd, struct its_vpe *sync_vpe) { its_encode_cmd(sync_cmd, GITS_CMD_VSYNC); From 5c9a882e940dde2f3e80eb3c7635a3307be511b6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 28 Jul 2017 21:20:37 +0100 Subject: [PATCH 0436/1085] irqchip/gic-v3-its: Workaround HiSilicon Hip07 redistributor addressing The ITSes on the Hip07 (as present in the Huawei D05) are broken when it comes to addressing the redistributors, and need to be explicitly told to address the VLPI page instead of the redistributor base address. So let's add yet another quirk, fixing up the target address in the command stream. Signed-off-by: Marc Zyngier --- Documentation/arm64/silicon-errata.txt | 1 + arch/arm64/Kconfig | 11 ++++++++++ drivers/irqchip/irq-gic-v3-its.c | 30 ++++++++++++++++++++++++-- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 66e8ce14d23d..304bf22bb83c 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -70,6 +70,7 @@ stable kernels. | | | | | | Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 | | Hisilicon | Hip0{6,7} | #161010701 | N/A | +| Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 | | | | | | | Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | | Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c4361dff2b74..22455e4168c1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -539,6 +539,7 @@ config QCOM_QDF2400_ERRATUM_0065 If unsure, say Y. + config SOCIONEXT_SYNQUACER_PREITS bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y @@ -546,6 +547,16 @@ config SOCIONEXT_SYNQUACER_PREITS Socionext Synquacer SoCs implement a separate h/w block to generate MSI doorbell writes with non-zero values for the device ID. + If unsure, say Y. + +config HISILICON_ERRATUM_161600802 + bool "Hip07 161600802: Erroneous redistributor VLPI base" + default y + help + The HiSilicon Hip07 SoC uses the wrong redistributor base + when issuing ITS commands such as VMOVP and VMAPP, and requires + a 128kB offset to be applied to the target address in these commands. + If unsure, say Y.
endmenu diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 4aedbdc62aa0..6cc57dc142df 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -109,6 +109,7 @@ struct its_node { unsigned int msi_domain_flags; u32 pre_its_base; /* for Socionext Synquacer */ bool is_v4; + int vlpi_redist_offset; }; #define ITS_ITT_ALIGN SZ_256 @@ -558,13 +559,15 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, struct its_cmd_desc *desc) { unsigned long vpt_addr; + u64 target; vpt_addr = virt_to_phys(page_address(desc->its_vmapp_cmd.vpe->vpt_page)); + target = desc->its_vmapp_cmd.col->target_address + its->vlpi_redist_offset; its_encode_cmd(cmd, GITS_CMD_VMAPP); its_encode_vpeid(cmd, desc->its_vmapp_cmd.vpe->vpe_id); its_encode_valid(cmd, desc->its_vmapp_cmd.valid); - its_encode_target(cmd, desc->its_vmapp_cmd.col->target_address); + its_encode_target(cmd, target); its_encode_vpt_addr(cmd, vpt_addr); its_encode_vpt_size(cmd, LPI_NRBITS - 1); @@ -623,11 +626,14 @@ static struct its_vpe *its_build_vmovp_cmd(struct its_node *its, struct its_cmd_block *cmd, struct its_cmd_desc *desc) { + u64 target; + + target = desc->its_vmovp_cmd.col->target_address + its->vlpi_redist_offset; its_encode_cmd(cmd, GITS_CMD_VMOVP); its_encode_seq_num(cmd, desc->its_vmovp_cmd.seq_num); its_encode_its_list(cmd, desc->its_vmovp_cmd.its_list); its_encode_vpeid(cmd, desc->its_vmovp_cmd.vpe->vpe_id); - its_encode_target(cmd, desc->its_vmovp_cmd.col->target_address); + its_encode_target(cmd, target); its_fixup_cmd(cmd); @@ -2834,6 +2840,18 @@ static bool __maybe_unused its_enable_quirk_socionext_synquacer(void *data) return false; } +static bool __maybe_unused its_enable_quirk_hip07_161600802(void *data) +{ + struct its_node *its = data; + + /* + * Hip07 insists on using the wrong address for the VLPI + * page. Trick it into doing the right thing... + */ + its->vlpi_redist_offset = SZ_128K; + return true; +} + static const struct gic_quirk its_quirks[] = { #ifdef CONFIG_CAVIUM_ERRATUM_22375 { @@ -2871,6 +2889,14 @@ static const struct gic_quirk its_quirks[] = { .mask = 0xffffffff, .init = its_enable_quirk_socionext_synquacer, }, +#endif +#ifdef CONFIG_HISILICON_ERRATUM_161600802 + { + .desc = "ITS: Hip07 erratum 161600802", + .iidr = 0x00000004, + .mask = 0xffffffff, + .init = its_enable_quirk_hip07_161600802, + }, #endif { } From debf6d02bb58a099202375ca2ba88e9775b153c6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 8 Oct 2017 18:44:42 +0100 Subject: [PATCH 0437/1085] irqchip/gic-v3-its: Track per-ITS list number At boot time, we enumerate all the GICv4-capable ITSs, and build a mask of the available ITSs. Take this opportunity to store the ITS number in the its_node structure so that we can use it at a later time. 
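As a rough illustration of what the stored number enables (this helper is hypothetical, not part of the patch; vlpi_count[] is the per-VM refcount array a later patch in this series adds), the list number becomes a direct index into per-ITS state:

    /* Hypothetical helper: list_nr indexes per-VM, per-ITS state. */
    static bool its_sees_vm(struct its_node *its, struct its_vm *vm)
    {
            return vm->vlpi_count[its->list_nr] != 0;
    }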
Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 6cc57dc142df..994f0879e7df 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -103,6 +103,7 @@ struct its_node { u64 (*get_msi_base)(struct its_device *its_dev); struct list_head its_device_list; u64 flags; + unsigned long list_nr; u32 ite_size; u32 device_ids; int numa_node; @@ -3064,6 +3065,8 @@ static int __init its_probe_one(struct resource *res, if (err < 0) goto out_free_its; + its->list_nr = err; + pr_info("ITS@%pa: Using ITS number %d\n", &res->start, err); } else { From ab60491ee5d346557f152c7e8d3e7238c9b96c5c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 8 Oct 2017 18:48:06 +0100 Subject: [PATCH 0438/1085] irqchip/gic-v3-its: Make GICv4_ITS_LIST_MAX globally available As we're about to make use of the maximum number of ITSs in a GICv4 system, let's make this value global (and rename it to GICv4_ITS_LIST_MAX). Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 10 ++-------- include/linux/irqchip/arm-gic-v4.h | 6 ++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 994f0879e7df..a63b4ee34860 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -156,12 +156,6 @@ static DEFINE_SPINLOCK(its_lock); static struct rdists *gic_rdists; static struct irq_domain *its_parent; -/* - * We have a maximum number of 16 ITSs in the whole system if we're - * using the ITSList mechanism - */ -#define ITS_LIST_MAX 16 - static unsigned long its_list_map; static u16 vmovp_seq_num; static DEFINE_RAW_SPINLOCK(vmovp_lock); @@ -2988,8 +2982,8 @@ static int __init its_compute_its_list_map(struct resource *res, * locking. Should this change, we should address * this. */ - its_number = find_first_zero_bit(&its_list_map, ITS_LIST_MAX); - if (its_number >= ITS_LIST_MAX) { + its_number = find_first_zero_bit(&its_list_map, GICv4_ITS_LIST_MAX); + if (its_number >= GICv4_ITS_LIST_MAX) { pr_err("ITS@%pa: No ITSList entry available!\n", &res->start); return -EINVAL; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 58a4d89aa82c..e26a668826e6 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -20,6 +20,12 @@ struct its_vpe; +/* + * Maximum number of ITSs when GITS_TYPER.VMOVP == 0, using the + * ITSList mechanism to perform inter-ITS synchronization. + */ +#define GICv4_ITS_LIST_MAX 16 + /* Embedded in kvm.arch */ struct its_vm { struct fwnode_handle *fwnode; From 40619a2ef69d5e183717d28e8c98a59319c78a4f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 8 Oct 2017 15:16:09 +0100 Subject: [PATCH 0439/1085] irqchip/gic-v3-its: Make its_send_vinvall operate on a single ITS Currently, its_send_vinvall operates on all ITSs. As we're about to try and limit the amount of commands we send to ITSs that are not involved in dealing with a given VM, let's redefine that primitive so that it takes a target ITS as a parameter.
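Any "all ITSs" semantics then move to the call sites; a condensed sketch of the resulting pattern, as the diff below applies it:

    list_for_each_entry(its, &its_nodes, entry) {
            if (!its->is_v4)
                    continue;
            its_send_vinvall(its, vpe);     /* per-ITS primitive */
    }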
Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 34 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index a63b4ee34860..22ee83043785 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -983,18 +983,12 @@ static void its_send_vmovp(struct its_vpe *vpe) raw_spin_unlock_irqrestore(&vmovp_lock, flags); } -static void its_send_vinvall(struct its_vpe *vpe) +static void its_send_vinvall(struct its_node *its, struct its_vpe *vpe) { struct its_cmd_desc desc; - struct its_node *its; desc.its_vinvall_cmd.vpe = vpe; - - list_for_each_entry(its, &its_nodes, entry) { - if (!its->is_v4) - continue; - its_send_single_vcommand(its, its_build_vinvall_cmd, &desc); - } + its_send_single_vcommand(its, its_build_vinvall_cmd, &desc); } /* @@ -2466,6 +2460,18 @@ static void its_vpe_deschedule(struct its_vpe *vpe) } } +static void its_vpe_invall(struct its_vpe *vpe) +{ + struct its_node *its; + + list_for_each_entry(its, &its_nodes, entry) { + if (!its->is_v4) + continue; + + its_send_vinvall(its, vpe); + } +} + static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); @@ -2481,7 +2487,7 @@ static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) return 0; case INVALL_VPE: - its_send_vinvall(vpe); + its_vpe_invall(vpe); return 0; default: @@ -2710,11 +2716,19 @@ static int its_vpe_irq_domain_activate(struct irq_domain *domain, struct irq_data *d, bool early) { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); + struct its_node *its; /* Map the VPE to the first possible CPU */ vpe->col_idx = cpumask_first(cpu_online_mask); its_send_vmapp(vpe, true); - its_send_vinvall(vpe); + + list_for_each_entry(its, &its_nodes, entry) { + if (!its->is_v4) + continue; + + its_send_vinvall(its, vpe); + } + return 0; } From 75fd951be846a494b0f01f80e859dcb74970f0d5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 8 Oct 2017 18:46:39 +0100 Subject: [PATCH 0440/1085] irqchip/gic-v3-its: Make its_send_vmapp operate on a single ITS Currently, its_send_vmapp operates on all ITSs. As we're about to try and limit the amount of commands we send to ITSs that are not involved in dealing with a given VM, let's redefine that primitive so that it takes a target ITS as a parameter. 
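A later patch in this series builds a refcount on top of this per-ITS primitive; reduced to its core idea (a sketch using the vlpi_count[] field introduced there, with locking omitted), the pattern is first-user-maps, last-user-unmaps:

    if (vm->vlpi_count[its->list_nr]++ == 0)
            its_send_vmapp(its, vpe, true);  /* first VLPI of this VM on this ITS */

    if (--vm->vlpi_count[its->list_nr] == 0)
            its_send_vmapp(its, vpe, false); /* last VLPI gone, unmap the VPE */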
Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 22ee83043785..b14585d6397b 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -924,21 +924,16 @@ static void its_send_vmovi(struct its_device *dev, u32 id) its_send_single_vcommand(dev->its, its_build_vmovi_cmd, &desc); } -static void its_send_vmapp(struct its_vpe *vpe, bool valid) +static void its_send_vmapp(struct its_node *its, + struct its_vpe *vpe, bool valid) { struct its_cmd_desc desc; - struct its_node *its; desc.its_vmapp_cmd.vpe = vpe; desc.its_vmapp_cmd.valid = valid; + desc.its_vmapp_cmd.col = &its->collections[vpe->col_idx]; - list_for_each_entry(its, &its_nodes, entry) { - if (!its->is_v4) - continue; - - desc.its_vmapp_cmd.col = &its->collections[vpe->col_idx]; - its_send_single_vcommand(its, its_build_vmapp_cmd, &desc); - } + its_send_single_vcommand(its, its_build_vmapp_cmd, &desc); } static void its_send_vmovp(struct its_vpe *vpe) @@ -2720,12 +2715,12 @@ static int its_vpe_irq_domain_activate(struct irq_domain *domain, /* Map the VPE to the first possible CPU */ vpe->col_idx = cpumask_first(cpu_online_mask); - its_send_vmapp(vpe, true); list_for_each_entry(its, &its_nodes, entry) { if (!its->is_v4) continue; + its_send_vmapp(its, vpe, true); its_send_vinvall(its, vpe); } @@ -2736,8 +2731,14 @@ static void its_vpe_irq_domain_deactivate(struct irq_domain *domain, struct irq_data *d) { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); + struct its_node *its; - its_send_vmapp(vpe, false); + list_for_each_entry(its, &its_nodes, entry) { + if (!its->is_v4) + continue; + + its_send_vmapp(its, vpe, false); + } } static const struct irq_domain_ops its_vpe_domain_ops = { From 2247e1bf70639642b1c1375aa9176ccd95736400 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 8 Oct 2017 18:50:36 +0100 Subject: [PATCH 0441/1085] irqchip/gic-v3-its: Limit scope of VPE mapping to be per ITS So far, we map all VPEs on all ITSs. While this is not wrong, this is quite a big hammer, as moving a VPE around requires all ITSs to be synchronized. Needless to say, this is an expensive proposition. Instead, let's switch to a mode where we issue VMAPP commands only on ITSs that are actually involved in reporting interrupts to the given VM. For that purpose, we refcount the number of interrupts that are mapped for this VM on each ITS, performing the map/unmap operations as required. It then allows us to use this refcount to only issue VMOVP to the ITSs that need to know about this VM. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 75 ++++++++++++++++++++++++++++++ include/linux/irqchip/arm-gic-v4.h | 1 + 2 files changed, 76 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index b14585d6397b..dc0ece20f964 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -971,6 +971,9 @@ static void its_send_vmovp(struct its_vpe *vpe) if (!its->is_v4) continue; + if (!vpe->its_vm->vlpi_count[its->list_nr]) + continue; + desc.its_vmovp_cmd.col = &its->collections[col_id]; its_send_single_vcommand(its, its_build_vmovp_cmd, &desc); } @@ -1154,6 +1157,58 @@ static int its_irq_set_irqchip_state(struct irq_data *d, return 0; } +static void its_map_vm(struct its_node *its, struct its_vm *vm) +{ + unsigned long flags; + + /* Not using the ITS list?
Everything is always mapped. */ + if (!its_list_map) + return; + + raw_spin_lock_irqsave(&vmovp_lock, flags); + + /* + * If the VM wasn't mapped yet, iterate over the vpes and get + * them mapped now. + */ + vm->vlpi_count[its->list_nr]++; + + if (vm->vlpi_count[its->list_nr] == 1) { + int i; + + for (i = 0; i < vm->nr_vpes; i++) { + struct its_vpe *vpe = vm->vpes[i]; + + /* Map the VPE to the first possible CPU */ + vpe->col_idx = cpumask_first(cpu_online_mask); + its_send_vmapp(its, vpe, true); + its_send_vinvall(its, vpe); + } + } + + raw_spin_unlock_irqrestore(&vmovp_lock, flags); +} + +static void its_unmap_vm(struct its_node *its, struct its_vm *vm) +{ + unsigned long flags; + + /* Not using the ITS list? Everything is always mapped. */ + if (!its_list_map) + return; + + raw_spin_lock_irqsave(&vmovp_lock, flags); + + if (!--vm->vlpi_count[its->list_nr]) { + int i; + + for (i = 0; i < vm->nr_vpes; i++) + its_send_vmapp(its, vm->vpes[i], false); + } + + raw_spin_unlock_irqrestore(&vmovp_lock, flags); +} + static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); @@ -1189,6 +1244,9 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) /* Already mapped, move it around */ its_send_vmovi(its_dev, event); } else { + /* Ensure all the VPEs are mapped on this ITS */ + its_map_vm(its_dev->its, info->map->vm); + /* Drop the physical mapping */ its_send_discard(its_dev, event); @@ -1250,6 +1308,9 @@ static int its_vlpi_unmap(struct irq_data *d) LPI_PROP_ENABLED | LPI_PROP_GROUP1)); + /* Potentially unmap the VM from this ITS */ + its_unmap_vm(its_dev->its, its_dev->event_map.vm); + /* * Drop the refcount and make the device available again if * this was the last VLPI. @@ -2463,6 +2524,9 @@ static void its_vpe_invall(struct its_vpe *vpe) if (!its->is_v4) continue; + if (its_list_map && !vpe->its_vm->vlpi_count[its->list_nr]) + continue; + its_send_vinvall(its, vpe); } } @@ -2713,6 +2777,10 @@ static int its_vpe_irq_domain_activate(struct irq_domain *domain, struct its_vpe *vpe = irq_data_get_irq_chip_data(d); struct its_node *its; + /* If we use the list map, we issue VMAPP on demand... */ + if (its_list_map) + return 0; + /* Map the VPE to the first possible CPU */ vpe->col_idx = cpumask_first(cpu_online_mask); @@ -2733,6 +2801,13 @@ static void its_vpe_irq_domain_deactivate(struct irq_domain *domain, struct its_vpe *vpe = irq_data_get_irq_chip_data(d); struct its_node *its; + /* + * If we use the list map, we unmap the VPE once no VLPIs are + * associated with the VM. + */ + if (its_list_map) + return; + list_for_each_entry(its, &its_nodes, entry) { if (!its->is_v4) continue; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index e26a668826e6..43cde15f221b 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -36,6 +36,7 @@ struct its_vm { irq_hw_number_t db_lpi_base; unsigned long *db_bitmap; int nr_db_lpis; + u32 vlpi_count[GICv4_ITS_LIST_MAX]; }; /* Embedded in kvm_vcpu.arch */ From 3c1cceeb3d2879d98eda7afd62838fde7b0f920d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 9 Oct 2017 13:17:43 +0100 Subject: [PATCH 0442/1085] irqchip/gic-v3-its: Only send VINVALL to a single ITS Sending VINVALL to all ITSs is completely pointless, as all we're trying to achieve is to tell the redistributor that the property table for this VPE should be invalidated. Let's issue the command on the first valid ITS and be done with it.
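For context, such an invalidation is requested through the irqchip's vcpu-affinity hook; a hedged caller sketch, using the INVALL_VPE command type this file already handles:

    struct its_cmd_info info = {
            .cmd_type = INVALL_VPE,
    };

    /* Ends up in its_vpe_set_vcpu_affinity() and, below, its_vpe_invall() */
    ret = irq_set_vcpu_affinity(vpe->irq, &info);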
Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index dc0ece20f964..bc7099352cc9 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2527,7 +2527,12 @@ static void its_vpe_invall(struct its_vpe *vpe) if (its_list_map && !vpe->its_vm->vlpi_count[its->list_nr]) continue; + /* + * Sending a VINVALL to a single ITS is enough, as all + * we need is to reach the redistributors. + */ its_send_vinvall(its, vpe); + return; } } From 44c4c25e3103d26bee4cc041cbf526e41975055b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 19 Oct 2017 10:11:34 +0100 Subject: [PATCH 0443/1085] irqchip/gic-v3-its: Update effective affinity on VPE mapping When setting the affinity of a VPE (either because we map or move it), make sure the effective affinity is correctly reported back to the core kernel. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index bc7099352cc9..6a74f0497f82 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1178,11 +1178,13 @@ static void its_map_vm(struct its_node *its, struct its_vm *vm) for (i = 0; i < vm->nr_vpes; i++) { struct its_vpe *vpe = vm->vpes[i]; + struct irq_data *d = irq_get_irq_data(vpe->irq); /* Map the VPE to the first possible CPU */ vpe->col_idx = cpumask_first(cpu_online_mask); its_send_vmapp(its, vpe, true); its_send_vinvall(its, vpe); + irq_data_update_effective_affinity(d, cpumask_of(vpe->col_idx)); } } @@ -2449,6 +2451,8 @@ static int its_vpe_set_affinity(struct irq_data *d, its_vpe_db_proxy_move(vpe, from, cpu); } + irq_data_update_effective_affinity(d, cpumask_of(cpu)); + return IRQ_SET_MASK_OK_DONE; } @@ -2797,6 +2801,8 @@ static int its_vpe_irq_domain_activate(struct irq_domain *domain, its_send_vinvall(its, vpe); } + irq_data_update_effective_affinity(d, cpumask_of(vpe->col_idx)); + return 0; } From df48d3b5ef276998e399846db12c241e6f9d318c Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 18 Sep 2017 15:46:09 +0200 Subject: [PATCH 0444/1085] dt-bindings: interrupt-controller: Add DT binding for meson GPIO interrupt controller This commit adds the device tree bindings description for Amlogic's GPIO interrupt controller available on the meson8b, gxbb and gxl SoC families. Cc: Heiner Kallweit Acked-by: Rob Herring Signed-off-by: Jerome Brunet Signed-off-by: Marc Zyngier --- .../amlogic,meson-gpio-intc.txt | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt new file mode 100644 index 000000000000..633e21ce4b17 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt @@ -0,0 +1,35 @@ +Amlogic meson GPIO interrupt controller + +Meson SoCs contain an interrupt controller which is able to watch the SoC +pads and generate an interrupt on edge or level. The controller is essentially +a 256-pad to 8 GIC interrupt multiplexer, with a filter block to select edge +or level and polarity.
It does not expose all 256 mux inputs because the +documentation shows that the upper part is not mapped to any pad. The actual +number of interrupts exposed depends on the SoC. + +Required properties: + +- compatible : must have "amlogic,meson-gpio-intc" and either + "amlogic,meson8b-gpio-intc" for meson8b SoCs (S805) or + "amlogic,meson-gxbb-gpio-intc" for GXBB SoCs (S905) or + "amlogic,meson-gxl-gpio-intc" for GXL SoCs (S905X, S912) +- interrupt-parent : a phandle to the GIC the interrupts are routed to. + Usually this is provided at the root level of the device tree as it is + common to most of the SoC. +- reg : Specifies base physical address and size of the registers. +- interrupt-controller : Identifies the node as an interrupt controller. +- #interrupt-cells : Specifies the number of cells needed to encode an + interrupt source. The value must be 2. +- amlogic,channel-interrupts: Array with the 8 upstream hwirq numbers. These + are the hwirqs used on the parent interrupt controller. + +Example: + +gpio_interrupt: interrupt-controller@9880 { + compatible = "amlogic,meson-gxbb-gpio-intc", + "amlogic,meson-gpio-intc"; + reg = <0x0 0x9880 0x0 0x10>; + interrupt-controller; + #interrupt-cells = <2>; + amlogic,channel-interrupts = <64 65 66 67 68 69 70 71>; +}; From 215f4cc0fb208665dd15a524ec57edf4d7e215e6 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 18 Sep 2017 15:46:10 +0200 Subject: [PATCH 0445/1085] irqchip/meson: Add support for gpio interrupt controller Add support for the interrupt gpio controller found on Amlogic's meson SoC family. This controller is a separate controller from the gpio controller. It is able to spy on the SoC pads. It is essentially a 256 to 8 router with a filtering block to select level or edge and polarity. The number of actual mappable inputs depends on the SoC. Cc: Heiner Kallweit Signed-off-by: Jerome Brunet Signed-off-by: Marc Zyngier --- drivers/irqchip/Kconfig | 8 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-meson-gpio.c | 414 +++++++++++++++++++++++++++++++ 3 files changed, 423 insertions(+) create mode 100644 drivers/irqchip/irq-meson-gpio.c diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 77df38ed0050..1cb39405d7f8 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -324,4 +324,12 @@ config IRQ_UNIPHIER_AIDET help Support for the UniPhier AIDET (ARM Interrupt Detector). +config MESON_IRQ_GPIO + bool "Meson GPIO Interrupt Multiplexer" + depends on ARCH_MESON || COMPILE_TEST + select IRQ_DOMAIN + select IRQ_DOMAIN_HIERARCHY + help + Support Meson SoC Family GPIO Interrupt Multiplexer + endmenu diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 845abc107ad5..065adf4102c9 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -79,3 +79,4 @@ obj-$(CONFIG_ARCH_ASPEED) += irq-aspeed-vic.o irq-aspeed-i2c-ic.o obj-$(CONFIG_STM32_EXTI) += irq-stm32-exti.o obj-$(CONFIG_QCOM_IRQ_COMBINER) += qcom-irq-combiner.o obj-$(CONFIG_IRQ_UNIPHIER_AIDET) += irq-uniphier-aidet.o +obj-$(CONFIG_MESON_IRQ_GPIO) += irq-meson-gpio.o diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c new file mode 100644 index 000000000000..c7cc7e37a23c --- /dev/null +++ b/drivers/irqchip/irq-meson-gpio.c @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2015 Endless Mobile, Inc. + * Author: Carlo Caione + * Copyright (c) 2016 BayLibre, SAS.
+ * Author: Jerome Brunet + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * The full GNU General Public License is included in this distribution + * in the file called COPYING. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#define NUM_CHANNEL 8 +#define MAX_INPUT_MUX 256 + +#define REG_EDGE_POL 0x00 +#define REG_PIN_03_SEL 0x04 +#define REG_PIN_47_SEL 0x08 +#define REG_FILTER_SEL 0x0c + +#define REG_EDGE_POL_MASK(x) (BIT(x) | BIT(16 + (x))) +#define REG_EDGE_POL_EDGE(x) BIT(x) +#define REG_EDGE_POL_LOW(x) BIT(16 + (x)) +#define REG_PIN_SEL_SHIFT(x) (((x) % 4) * 8) +#define REG_FILTER_SEL_SHIFT(x) ((x) * 4) + +struct meson_gpio_irq_params { + unsigned int nr_hwirq; +}; + +static const struct meson_gpio_irq_params meson8b_params = { + .nr_hwirq = 119, +}; + +static const struct meson_gpio_irq_params gxbb_params = { + .nr_hwirq = 133, +}; + +static const struct meson_gpio_irq_params gxl_params = { + .nr_hwirq = 110, +}; + +static const struct of_device_id meson_irq_gpio_matches[] = { + { .compatible = "amlogic,meson8b-gpio-intc", .data = &meson8b_params }, + { .compatible = "amlogic,meson-gxbb-gpio-intc", .data = &gxbb_params }, + { .compatible = "amlogic,meson-gxl-gpio-intc", .data = &gxl_params }, + { } +}; + +struct meson_gpio_irq_controller { + unsigned int nr_hwirq; + void __iomem *base; + u32 channel_irqs[NUM_CHANNEL]; + DECLARE_BITMAP(channel_map, NUM_CHANNEL); + spinlock_t lock; +}; + +static void meson_gpio_irq_update_bits(struct meson_gpio_irq_controller *ctl, + unsigned int reg, u32 mask, u32 val) +{ + u32 tmp; + + tmp = readl_relaxed(ctl->base + reg); + tmp &= ~mask; + tmp |= val; + writel_relaxed(tmp, ctl->base + reg); +} + +static unsigned int meson_gpio_irq_channel_to_reg(unsigned int channel) +{ + return (channel < 4) ? REG_PIN_03_SEL : REG_PIN_47_SEL; +} + +static int +meson_gpio_irq_request_channel(struct meson_gpio_irq_controller *ctl, + unsigned long hwirq, + u32 **channel_hwirq) +{ + unsigned int reg, idx; + + spin_lock(&ctl->lock); + + /* Find a free channel */ + idx = find_first_zero_bit(ctl->channel_map, NUM_CHANNEL); + if (idx >= NUM_CHANNEL) { + spin_unlock(&ctl->lock); + pr_err("No channel available\n"); + return -ENOSPC; + } + + /* Mark the channel as used */ + set_bit(idx, ctl->channel_map); + + /* + * Setup the mux of the channel to route the signal of the pad + * to the appropriate input of the GIC + */ + reg = meson_gpio_irq_channel_to_reg(idx); + meson_gpio_irq_update_bits(ctl, reg, + 0xff << REG_PIN_SEL_SHIFT(idx), + hwirq << REG_PIN_SEL_SHIFT(idx)); + + /* + * Get the hwirq number assigned to this channel through + * a pointer to the channel_irq table. The added benefit of this + * method is that we can also retrieve the channel index with + * it, using the table base.
+ */ + *channel_hwirq = &(ctl->channel_irqs[idx]); + + spin_unlock(&ctl->lock); + + pr_debug("hwirq %lu assigned to channel %d - irq %u\n", + hwirq, idx, **channel_hwirq); + + return 0; +} + +static unsigned int +meson_gpio_irq_get_channel_idx(struct meson_gpio_irq_controller *ctl, + u32 *channel_hwirq) +{ + return channel_hwirq - ctl->channel_irqs; +} + +static void +meson_gpio_irq_release_channel(struct meson_gpio_irq_controller *ctl, + u32 *channel_hwirq) +{ + unsigned int idx; + + idx = meson_gpio_irq_get_channel_idx(ctl, channel_hwirq); + clear_bit(idx, ctl->channel_map); +} + +static int meson_gpio_irq_type_setup(struct meson_gpio_irq_controller *ctl, + unsigned int type, + u32 *channel_hwirq) +{ + u32 val = 0; + unsigned int idx; + + idx = meson_gpio_irq_get_channel_idx(ctl, channel_hwirq); + + /* + * The controller has a filter block to operate in either LEVEL or + * EDGE mode, then the signal is sent to the GIC. To enable LEVEL_LOW and + * EDGE_FALLING support (which the GIC does not support), the filter + * block is also able to invert the input signal it gets before + * providing it to the GIC. + */ + type &= IRQ_TYPE_SENSE_MASK; + + if (type == IRQ_TYPE_EDGE_BOTH) + return -EINVAL; + + if (type & (IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING)) + val |= REG_EDGE_POL_EDGE(idx); + + if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_EDGE_FALLING)) + val |= REG_EDGE_POL_LOW(idx); + + spin_lock(&ctl->lock); + + meson_gpio_irq_update_bits(ctl, REG_EDGE_POL, + REG_EDGE_POL_MASK(idx), val); + + spin_unlock(&ctl->lock); + + return 0; +} + +static unsigned int meson_gpio_irq_type_output(unsigned int type) +{ + unsigned int sense = type & IRQ_TYPE_SENSE_MASK; + + type &= ~IRQ_TYPE_SENSE_MASK; + + /* + * The polarity of the signal provided to the GIC should always + * be high.
+ */ + if (sense & (IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW)) + type |= IRQ_TYPE_LEVEL_HIGH; + else if (sense & (IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING)) + type |= IRQ_TYPE_EDGE_RISING; + + return type; +} + +static int meson_gpio_irq_set_type(struct irq_data *data, unsigned int type) +{ + struct meson_gpio_irq_controller *ctl = data->domain->host_data; + u32 *channel_hwirq = irq_data_get_irq_chip_data(data); + int ret; + + ret = meson_gpio_irq_type_setup(ctl, type, channel_hwirq); + if (ret) + return ret; + + return irq_chip_set_type_parent(data, + meson_gpio_irq_type_output(type)); +} + +static struct irq_chip meson_gpio_irq_chip = { + .name = "meson-gpio-irqchip", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_type = meson_gpio_irq_set_type, + .irq_retrigger = irq_chip_retrigger_hierarchy, +#ifdef CONFIG_SMP + .irq_set_affinity = irq_chip_set_affinity_parent, +#endif + .flags = IRQCHIP_SET_TYPE_MASKED, +}; + +static int meson_gpio_irq_domain_translate(struct irq_domain *domain, + struct irq_fwspec *fwspec, + unsigned long *hwirq, + unsigned int *type) +{ + if (is_of_node(fwspec->fwnode) && fwspec->param_count == 2) { + *hwirq = fwspec->param[0]; + *type = fwspec->param[1]; + return 0; + } + + return -EINVAL; +} + +static int meson_gpio_irq_allocate_gic_irq(struct irq_domain *domain, + unsigned int virq, + u32 hwirq, + unsigned int type) +{ + struct irq_fwspec fwspec; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 3; + fwspec.param[0] = 0; /* SPI */ + fwspec.param[1] = hwirq; + fwspec.param[2] = meson_gpio_irq_type_output(type); + + return irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); +} + +static int meson_gpio_irq_domain_alloc(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs, + void *data) +{ + struct irq_fwspec *fwspec = data; + struct meson_gpio_irq_controller *ctl = domain->host_data; + unsigned long hwirq; + u32 *channel_hwirq; + unsigned int type; + int ret; + + if (WARN_ON(nr_irqs != 1)) + return -EINVAL; + + ret = meson_gpio_irq_domain_translate(domain, fwspec, &hwirq, &type); + if (ret) + return ret; + + ret = meson_gpio_irq_request_channel(ctl, hwirq, &channel_hwirq); + if (ret) + return ret; + + ret = meson_gpio_irq_allocate_gic_irq(domain, virq, + *channel_hwirq, type); + if (ret < 0) { + pr_err("failed to allocate gic irq %u\n", *channel_hwirq); + meson_gpio_irq_release_channel(ctl, channel_hwirq); + return ret; + } + + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, + &meson_gpio_irq_chip, channel_hwirq); + + return 0; +} + +static void meson_gpio_irq_domain_free(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs) +{ + struct meson_gpio_irq_controller *ctl = domain->host_data; + struct irq_data *irq_data; + u32 *channel_hwirq; + + if (WARN_ON(nr_irqs != 1)) + return; + + irq_domain_free_irqs_parent(domain, virq, 1); + + irq_data = irq_domain_get_irq_data(domain, virq); + channel_hwirq = irq_data_get_irq_chip_data(irq_data); + + meson_gpio_irq_release_channel(ctl, channel_hwirq); +} + +static const struct irq_domain_ops meson_gpio_irq_domain_ops = { + .alloc = meson_gpio_irq_domain_alloc, + .free = meson_gpio_irq_domain_free, + .translate = meson_gpio_irq_domain_translate, +}; + +static int __init meson_gpio_irq_parse_dt(struct device_node *node, + struct meson_gpio_irq_controller *ctl) +{ + const struct of_device_id *match; + const struct meson_gpio_irq_params *params; + int ret; + + match = 
of_match_node(meson_irq_gpio_matches, node); + if (!match) + return -ENODEV; + + params = match->data; + ctl->nr_hwirq = params->nr_hwirq; + + ret = of_property_read_variable_u32_array(node, + "amlogic,channel-interrupts", + ctl->channel_irqs, + NUM_CHANNEL, + NUM_CHANNEL); + if (ret < 0) { + pr_err("can't get %d channel interrupts\n", NUM_CHANNEL); + return ret; + } + + return 0; +} + +static int __init meson_gpio_irq_of_init(struct device_node *node, + struct device_node *parent) +{ + struct irq_domain *domain, *parent_domain; + struct meson_gpio_irq_controller *ctl; + int ret; + + if (!parent) { + pr_err("missing parent interrupt node\n"); + return -ENODEV; + } + + parent_domain = irq_find_host(parent); + if (!parent_domain) { + pr_err("unable to obtain parent domain\n"); + return -ENXIO; + } + + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + if (!ctl) + return -ENOMEM; + + spin_lock_init(&ctl->lock); + + ctl->base = of_iomap(node, 0); + if (!ctl->base) { + ret = -ENOMEM; + goto free_ctl; + } + + ret = meson_gpio_irq_parse_dt(node, ctl); + if (ret) + goto free_channel_irqs; + + domain = irq_domain_create_hierarchy(parent_domain, 0, ctl->nr_hwirq, + of_node_to_fwnode(node), + &meson_gpio_irq_domain_ops, + ctl); + if (!domain) { + pr_err("failed to add domain\n"); + ret = -ENODEV; + goto free_channel_irqs; + } + + pr_info("%d to %d gpio interrupt mux initialized\n", + ctl->nr_hwirq, NUM_CHANNEL); + + return 0; + +free_channel_irqs: + iounmap(ctl->base); +free_ctl: + kfree(ctl); + + return ret; +} + +IRQCHIP_DECLARE(meson_gpio_intc, "amlogic,meson-gpio-intc", + meson_gpio_irq_of_init); From 7bdeb7f52b1b193bb50fc6d01c6110ba50bafb5b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 11 Oct 2017 10:50:23 +0000 Subject: [PATCH 0446/1085] irqchip/aspeed-i2c-ic: Fix return value check in aspeed_i2c_ic_of_init() In case of error, the function of_iomap() returns a NULL pointer, not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with a NULL test. Reviewed-by: Brendan Higgins Signed-off-by: Wei Yongjun Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-aspeed-i2c-ic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-aspeed-i2c-ic.c b/drivers/irqchip/irq-aspeed-i2c-ic.c index 815b88dd18f2..f20200af0992 100644 --- a/drivers/irqchip/irq-aspeed-i2c-ic.c +++ b/drivers/irqchip/irq-aspeed-i2c-ic.c @@ -76,8 +76,8 @@ static int __init aspeed_i2c_ic_of_init(struct device_node *node, return -ENOMEM; i2c_ic->base = of_iomap(node, 0); - if (IS_ERR(i2c_ic->base)) { - ret = PTR_ERR(i2c_ic->base); + if (!i2c_ic->base) { + ret = -ENOMEM; goto err_free_ic; } From 0cfe5b5fc0277463fa795dea312a3a2fd5e8bac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Lefaure?= Date: Sun, 1 Oct 2017 15:30:50 -0400 Subject: [PATCH 0447/1085] x86: Use ARRAY_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the ARRAY_SIZE macro improves the readability of the code.
Found with Coccinelle with the following semantic patch: @r depends on (org || report)@ type T; T[] E; position p; @@ ( (sizeof(E)@p /sizeof(*E)) | (sizeof(E)@p /sizeof(E[...])) | (sizeof(E)@p /sizeof(T)) ) Signed-off-by: Jérémy Lefaure Signed-off-by: Thomas Gleixner Cc: linux-video@atrey.karlin.mff.cuni.cz Cc: Martin Mares Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20171001193101.8898-13-jeremy.lefaure@lse.epita.fr --- arch/x86/boot/video-vga.c | 6 +++--- arch/x86/entry/vdso/vdso2c.c | 3 ++- .../x86/platform/intel-mid/device_libs/platform_gpio_keys.c | 5 ++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 45bc9402aa49..a14c5178d4ba 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -241,9 +241,9 @@ static int vga_probe(void) vga_modes, }; static int mode_count[] = { - sizeof(cga_modes)/sizeof(struct mode_info), - sizeof(ega_modes)/sizeof(struct mode_info), - sizeof(vga_modes)/sizeof(struct mode_info), + ARRAY_SIZE(cga_modes), + ARRAY_SIZE(ega_modes), + ARRAY_SIZE(vga_modes), }; struct biosregs ireg, oreg; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 0780a443a53b..4674f58581a1 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -65,6 +65,7 @@ #include #include +#include const char *outfilename; @@ -151,7 +152,7 @@ extern void bad_put_le(void); PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val)))) -#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) +#define NSYMS ARRAY_SIZE(required_syms) #define BITSFUNC3(name, bits, suffix) name##bits##suffix #define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c index 74283875c7e8..e639e3116acf 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -62,10 +62,9 @@ static struct platform_device pb_device = { static int __init pb_keys_init(void) { struct gpio_keys_button *gb = gpio_button; - int i, num, good = 0; + int i, good = 0; - num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); - for (i = 0; i < num; i++) { + for (i = 0; i < ARRAY_SIZE(gpio_button); i++) { gb[i].gpio = get_gpio_by_name(gb[i].desc); pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio); From 642e641cbea57e559720b9df09889ffcf525cf04 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Tue, 19 Sep 2017 16:40:43 +0530 Subject: [PATCH 0448/1085] x86/events/amd/iommu: Make iommu_pmu const and __initconst iommu_pmu is only used as source for a copy operation in the init code path. Mark it const and __initconst. 
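Marking the template const and __initconst is safe because it is only copied into per-instance pmu structures during init and is never registered directly; a minimal analogue of the pattern (the function and the perf_amd_iommu layout here are illustrative, not the driver's actual code):

    static const struct pmu iommu_pmu_template __initconst = {
            .event_init = perf_iommu_event_init,
    };

    static __init int perf_iommu_init_one(struct perf_amd_iommu *pi)
    {
            pi->pmu = iommu_pmu_template;   /* struct copy into runtime data */
            return perf_pmu_register(&pi->pmu, "amd_iommu", -1);
    }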
Signed-off-by: Bhumika Goyal Signed-off-by: Thomas Gleixner Cc: julia.lawall@lip6.fr Link: https://lkml.kernel.org/r/1505819443-670-1-git-send-email-bhumirks@gmail.com --- arch/x86/events/amd/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 3641e24fdac5..38b5d41b0c37 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -405,7 +405,7 @@ const struct attribute_group *amd_iommu_attr_groups[] = { NULL, }; -static struct pmu iommu_pmu = { +static const struct pmu iommu_pmu __initconst = { .event_init = perf_iommu_event_init, .add = perf_iommu_add, .del = perf_iommu_del, From 3a29ddb1c5986a6d3f941bfb1f434105203ce7f6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 19 Oct 2017 15:17:23 +0100 Subject: [PATCH 0449/1085] clockevents: Retry programming min delta up to 10 times When CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=n, the call path hrtimer_reprogram -> clockevents_program_event -> clockevents_program_min_delta will not retry if the clock event driver returns -ETIME. If the driver could not satisfy the program_min_delta for any reason, the lack of a retry means the CPU may not receive a tick interrupt, potentially until the counter does a full period. This leads to rcu_sched timeout messages as the stalled CPU is detected by other CPUs, and other issues if the CPU is holding locks or other resources at the point at which it stalls. There have been a couple of observed mechanisms through which a clock event driver could not satisfy the requested min_delta and return -ETIME. With the MIPS GIC driver, the shared execution resource within MT cores means that inconvenient latency due to execution of instructions from other hardware threads in the core, within gic_next_event, can result in an event being set in the past. Additionally, under virtualisation it is possible to get unexpected latency during a clockevent device's set_next_event() callback which can make it return -ETIME even for a delta based on min_delta_ns. It isn't appropriate to use MIN_ADJUST in the virtualisation case as occasional hypervisor-induced high latency will cause min_delta_ns to quickly increase to the maximum. Instead, borrow the retry pattern from the MIN_ADJUST case, but without making adjustments. Retry up to 10 times, each time increasing the attempted delta by min_delta, before giving up. [ Matt: Reworked the loop and made retry increase the delta.
] Signed-off-by: James Hogan Signed-off-by: Matt Redfearn Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Daniel Lezcano Cc: "Martin Schwidefsky" Cc: James Hogan Link: https://lkml.kernel.org/r/1508422643-6075-1-git-send-email-matt.redfearn@mips.com --- kernel/time/clockevents.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 4237e0744e26..16c027e9cc73 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -280,17 +280,22 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; - int64_t delta; + int64_t delta = 0; + int i; - delta = dev->min_delta_ns; - dev->next_event = ktime_add_ns(ktime_get(), delta); + for (i = 0; i < 10; i++) { + delta += dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); - if (clockevent_state_shutdown(dev)) - return 0; + if (clockevent_state_shutdown(dev)) + return 0; - dev->retries++; - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - return dev->set_next_event((unsigned long) clc, dev); + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + } + return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ From 00a8f886dbdaeea1d93543d5311ddf3a2680bf2b Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 15 Sep 2017 16:24:31 +0200 Subject: [PATCH 0450/1085] s390/nmi: use smp_emergency_stop instead of smp_send_stop The smp_send_stop() function can be called from s390_handle_damage while DAT is off. This happens if a machine check indicates that kernel gprs or control registers can not be restored. The function smp_send_stop reenables DAT via __load_psw_mask. That should work for the case of lost kernel gprs and the system will do the expected stop of all CPUs. But if control registers are lost, in particular CR13 with the home space ASCE, interesting secondary crashes may occur. Make smp_emergency_stop callable from nmi.c and remove the cpumask argument. Replace the smp_send_stop call with smp_emergency_stop in the s390_handle_damage function. In addition add notrace and NOKPROBE_SYMBOL annotations for all functions required for the emergency shutdown. 
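The call-site change in s390_handle_damage() is then minimal; condensed from the diff below into a before/after sketch:

    /* before: smp_send_stop() re-enables DAT via __load_psw_mask() */
    smp_send_stop();

    /* after: stays safe with DAT off and partially lost control registers */
    smp_emergency_stop();
    disabled_wait((unsigned long) __builtin_return_address(0));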
Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/smp.h | 5 +++++ arch/s390/kernel/nmi.c | 9 +++++++-- arch/s390/kernel/smp.c | 30 +++++++++++++++++------------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 3deb134587b7..3470274a985c 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -27,6 +27,7 @@ extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); extern void smp_call_online_cpu(void (*func)(void *), void *); extern void smp_call_ipl_cpu(void (*func)(void *), void *); +extern void smp_emergency_stop(void); extern int smp_find_processor_id(u16 address); extern int smp_store_status(int cpu); @@ -52,6 +53,10 @@ static inline void smp_call_online_cpu(void (*func)(void *), void *data) func(data); } +static inline void smp_emergency_stop(void) +{ +} + static inline int smp_find_processor_id(u16 address) { return 0; } static inline int smp_store_status(int cpu) { return 0; } static inline int smp_vcpu_scheduled(int cpu) { return 1; } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 31d03a84126c..15e28eefe7e9 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -38,12 +39,13 @@ struct mcck_struct { static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static void s390_handle_damage(void) +static notrace void s390_handle_damage(void) { - smp_send_stop(); + smp_emergency_stop(); disabled_wait((unsigned long) __builtin_return_address(0)); while (1); } +NOKPROBE_SYMBOL(s390_handle_damage); /* * Main machine check handler function. Will be called with interrupts enabled @@ -275,6 +277,7 @@ static int notrace s390_validate_registers(union mci mci, int umode) return kill_task; } +NOKPROBE_SYMBOL(s390_validate_registers); /* * Backup the guest's machine check info to its description block @@ -300,6 +303,7 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) mcck_backup->failing_storage_address = S390_lowcore.failing_storage_address; } +NOKPROBE_SYMBOL(s390_backup_mcck_info); #define MAX_IPD_COUNT 29 #define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */ @@ -443,6 +447,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) clear_cpu_flag(CIF_MCCK_GUEST); nmi_exit(); } +NOKPROBE_SYMBOL(s390_do_machine_check); static int __init machine_check_init(void) { diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index cc04b74fbb84..2dba3e88a972 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -422,13 +423,17 @@ void smp_yield_cpu(int cpu) * Send cpus emergency shutdown signal. This gives the cpus the * opportunity to complete outstanding interrupts. 
*/ -static void smp_emergency_stop(cpumask_t *cpumask) +void notrace smp_emergency_stop(void) { + cpumask_t cpumask; u64 end; int cpu; + cpumask_copy(&cpumask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &cpumask); + end = get_tod_clock() + (1000000UL << 12); - for_each_cpu(cpu, cpumask) { + for_each_cpu(cpu, &cpumask) { struct pcpu *pcpu = pcpu_devices + cpu; set_bit(ec_stop_cpu, &pcpu->ec_mask); while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL, @@ -437,21 +442,21 @@ static void smp_emergency_stop(cpumask_t *cpumask) cpu_relax(); } while (get_tod_clock() < end) { - for_each_cpu(cpu, cpumask) + for_each_cpu(cpu, &cpumask) if (pcpu_stopped(pcpu_devices + cpu)) - cpumask_clear_cpu(cpu, cpumask); - if (cpumask_empty(cpumask)) + cpumask_clear_cpu(cpu, &cpumask); + if (cpumask_empty(&cpumask)) break; cpu_relax(); } } +NOKPROBE_SYMBOL(smp_emergency_stop); /* * Stop all cpus but the current one. */ void smp_send_stop(void) { - cpumask_t cpumask; int cpu; /* Disable all interrupts/machine checks */ @@ -459,17 +464,16 @@ void smp_send_stop(void) trace_hardirqs_off(); debug_set_critical(); - cpumask_copy(&cpumask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &cpumask); if (oops_in_progress) - smp_emergency_stop(&cpumask); + smp_emergency_stop(); /* stop all processors */ - for_each_cpu(cpu, &cpumask) { - struct pcpu *pcpu = pcpu_devices + cpu; - pcpu_sigp_retry(pcpu, SIGP_STOP, 0); - while (!pcpu_stopped(pcpu)) + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0); + while (!pcpu_stopped(pcpu_devices + cpu)) cpu_relax(); } } From ad3bc0ac1d2ed311ef3a9d6f2849948433a9f338 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 12 Oct 2017 13:24:45 +0200 Subject: [PATCH 0451/1085] s390/ctl_reg: use decoding unions in update_cr_regs Add a decoding union for the bits in control register 2 and use 'union ctlreg0' and 'union ctlreg2' in update_cr_regs to improve readability.
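The gain is easiest to see side by side; a sketch of what the union notation replaces, with bit positions taken from the definitions in this patch:

    union ctlreg2 cr2;

    __ctl_store(cr2.val, 2, 2);
    cr2.gse = 1;    /* was: cr2 |= 1UL << 4 (guarded storage enable) */
    cr2.tdc = 2;    /* was: cr2 = (cr2 & ~3UL) | 2UL (TX abort control) */
    __ctl_load(cr2.val, 2, 2);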
Reviewed-by: Heiko Carstens Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ctl_reg.h | 19 ++++++++++++++++++- arch/s390/kernel/machine_kexec.c | 11 ++++++----- arch/s390/kernel/nmi.c | 4 +++- arch/s390/kernel/ptrace.c | 30 +++++++++++++++--------------- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index e508dff92535..5d23ecf4297a 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -54,7 +54,11 @@ void smp_ctl_clear_bit(int cr, int bit); union ctlreg0 { unsigned long val; struct { - unsigned long : 32; + unsigned long : 8; + unsigned long tcx : 1; /* Transactional-Execution control */ + unsigned long pifo : 1; /* Transactional-Execution Program- + Interruption-Filtering Override */ + unsigned long : 22; unsigned long : 3; unsigned long lap : 1; /* Low-address-protection control */ unsigned long : 4; @@ -70,6 +74,19 @@ union ctlreg0 { }; }; +union ctlreg2 { + unsigned long val; + struct { + unsigned long : 33; + unsigned long ducto : 25; + unsigned long : 1; + unsigned long gse : 1; + unsigned long : 1; + unsigned long tds : 1; + unsigned long tdc : 2; + }; +}; + #ifdef CONFIG_SMP # define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit) # define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit) diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 51e8c63705a9..e94421678b13 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -105,7 +105,7 @@ static void __do_machine_kdump(void *image) static noinline void __machine_kdump(void *image) { struct mcesa *mcesa; - unsigned long cr2_old, cr2_new; + union ctlreg2 cr2_old, cr2_new; int this_cpu, cpu; lgr_info_log(); @@ -122,11 +122,12 @@ static noinline void __machine_kdump(void *image) if (MACHINE_HAS_VX) save_vx_regs((__vector128 *) mcesa->vector_save_area); if (MACHINE_HAS_GS) { - __ctl_store(cr2_old, 2, 2); - cr2_new = cr2_old | (1UL << 4); - __ctl_load(cr2_new, 2, 2); + __ctl_store(cr2_old.val, 2, 2); + cr2_new = cr2_old; + cr2_new.gse = 1; + __ctl_load(cr2_new.val, 2, 2); save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area); - __ctl_load(cr2_old, 2, 2); + __ctl_load(cr2_old.val, 2, 2); } /* * To create a good backchain for this CPU in the dump store_status diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 15e28eefe7e9..eb3e702cee30 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -107,6 +107,7 @@ EXPORT_SYMBOL_GPL(s390_handle_mcck); */ static int notrace s390_validate_registers(union mci mci, int umode) { + union ctlreg2 cr2; int kill_task; u64 zero; void *fpt_save_area; @@ -231,7 +232,8 @@ static int notrace s390_validate_registers(union mci mci, int umode) kill_task = 1; } /* Validate guarded storage registers */ - if (MACHINE_HAS_GS && (S390_lowcore.cregs_save_area[2] & (1UL << 4))) { + cr2.val = S390_lowcore.cregs_save_area[2]; + if (cr2.gse) { if (!mci.gs) /* * Guarded storage register can't be restored and diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index ea711f141bb8..ea637365d9ef 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -47,42 +47,42 @@ void update_cr_regs(struct task_struct *task) struct pt_regs *regs = task_pt_regs(task); struct thread_struct *thread = &task->thread; struct per_regs old, new; - unsigned long cr0_old, cr0_new; - unsigned long cr2_old, cr2_new; + union ctlreg0 cr0_old, cr0_new; + union ctlreg2 
cr2_old, cr2_new; int cr0_changed, cr2_changed; - __ctl_store(cr0_old, 0, 0); - __ctl_store(cr2_old, 2, 2); + __ctl_store(cr0_old.val, 0, 0); + __ctl_store(cr2_old.val, 2, 2); cr0_new = cr0_old; cr2_new = cr2_old; /* Take care of the enable/disable of transactional execution. */ if (MACHINE_HAS_TE) { /* Set or clear transaction execution TXC bit 8. */ - cr0_new |= (1UL << 55); + cr0_new.tcx = 1; if (task->thread.per_flags & PER_FLAG_NO_TE) - cr0_new &= ~(1UL << 55); + cr0_new.tcx = 0; /* Set or clear transaction execution TDC bits 62 and 63. */ - cr2_new &= ~3UL; + cr2_new.tdc = 0; if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) { if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND) - cr2_new |= 1UL; + cr2_new.tdc = 1; else - cr2_new |= 2UL; + cr2_new.tdc = 2; } } /* Take care of enable/disable of guarded storage. */ if (MACHINE_HAS_GS) { - cr2_new &= ~(1UL << 4); + cr2_new.gse = 0; if (task->thread.gs_cb) - cr2_new |= (1UL << 4); + cr2_new.gse = 1; } /* Load control register 0/2 iff changed */ - cr0_changed = cr0_new != cr0_old; - cr2_changed = cr2_new != cr2_old; + cr0_changed = cr0_new.val != cr0_old.val; + cr2_changed = cr2_new.val != cr2_old.val; if (cr0_changed) - __ctl_load(cr0_new, 0, 0); + __ctl_load(cr0_new.val, 0, 0); if (cr2_changed) - __ctl_load(cr2_new, 2, 2); + __ctl_load(cr2_new.val, 2, 2); /* Copy user specified PER registers */ new.control = thread->per_user.control; new.start = thread->per_user.start; From cc65450c8337848f97e893c5c3de973ece73aabf Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 12 Oct 2017 13:24:46 +0200 Subject: [PATCH 0452/1085] s390/ctl_reg: move control register definitions to ctl_reg.h The nmi.h header has some constant defines for control register bits. These definitions should really be located in ctl_reg.h. Move and rename the defines. 
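A note on the numbering used in the new names, sketched here rather than taken from the commit: IBM documentation counts bits from the most significant end of the 64-bit register, while _BITUL() counts from the least significant end, hence the "63 - n" form:

	/* IBM bit 36 of a 64-bit register is Linux bit 63 - 36 = 27 */
	#define CR14_RECOVERY_SUBMASK	_BITUL(63 - 36)	/* == 1UL << 27 */

This matches the (1 << 27) value of the MCCK_CR14_RECOVERY_SUB_MASK define being removed below.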
Reviewed-by: Heiko Carstens Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ctl_reg.h | 11 +++++++++++ arch/s390/include/asm/nmi.h | 6 ------ arch/s390/kvm/interrupt.c | 6 +++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index 5d23ecf4297a..0095a40886e1 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -7,6 +7,16 @@ #ifndef __ASM_CTL_REG_H #define __ASM_CTL_REG_H +#include + +#define CR14_CHANNEL_REPORT_SUBMASK _BITUL(63 - 35) +#define CR14_RECOVERY_SUBMASK _BITUL(63 - 36) +#define CR14_DEGRADATION_SUBMASK _BITUL(63 - 37) +#define CR14_EXTERNAL_DAMAGE_SUBMASK _BITUL(63 - 38) +#define CR14_WARNING_SUBMASK _BITUL(63 - 39) + +#ifndef __ASSEMBLY__ + #include #define __ctl_load(array, low, high) do { \ @@ -95,4 +105,5 @@ union ctlreg2 { # define ctl_clear_bit(cr, bit) __ctl_clear_bit(cr, bit) #endif +#endif /* __ASSEMBLY__ */ #endif /* __ASM_CTL_REG_H */ diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index c8e211b9a002..77a7d9445e33 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -26,12 +26,6 @@ #define MCCK_CODE_PSW_MWP_VALID _BITUL(63 - 20) #define MCCK_CODE_PSW_IA_VALID _BITUL(63 - 23) -#define MCCK_CR14_CR_PENDING_SUB_MASK (1 << 28) -#define MCCK_CR14_RECOVERY_SUB_MASK (1 << 27) -#define MCCK_CR14_DEGRAD_SUB_MASK (1 << 26) -#define MCCK_CR14_EXT_DAMAGE_SUB_MASK (1 << 25) -#define MCCK_CR14_WARN_SUB_MASK (1 << 24) - #ifndef __ASSEMBLY__ union mci { diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index a832ad031cee..329b2843fee2 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2483,11 +2483,11 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, mci.val = mcck_info->mcic; if (mci.sr) - cr14 |= MCCK_CR14_RECOVERY_SUB_MASK; + cr14 |= CR14_RECOVERY_SUBMASK; if (mci.dg) - cr14 |= MCCK_CR14_DEGRAD_SUB_MASK; + cr14 |= CR14_DEGRADATION_SUBMASK; if (mci.w) - cr14 |= MCCK_CR14_WARN_SUB_MASK; + cr14 |= CR14_WARNING_SUBMASK; mchk = mci.ck ? &inti.mchk : &irq.u.mchk; mchk->cr14 = cr14; From 6c81511ca1f52a0bbe921b2b98e34319a4ca59ed Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 12 Oct 2017 13:24:47 +0200 Subject: [PATCH 0453/1085] s390/nmi: allocation of the extended save area The machine check extended save area is needed to store the vector registers and the guarded storage control block when a CPU is interrupted by a machine check. Move the slab cache allocation of the full save area to nmi.c, for early boot use a static __initdata block. 
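For orientation, a minimal sketch of the mcesad encoding that this patch centralizes, using only names from the patch itself: the save area is aligned to its size, so the low bits of the address are free to carry a log2 size code:

	/* origin is 1K/2K aligned; the low bits (MCESA_LC_MASK) hold the size */
	lc->mcesad = origin;
	if (MACHINE_HAS_GS)
		lc->mcesad |= ilog2(MCESA_MAX_SIZE);	/* ilog2(2048) == 11 */

	/* consumers split the field back apart with the masks */
	area = (struct mcesa *)(lc->mcesad & MCESA_ORIGIN_MASK);

The variable "area" is hypothetical; the masking itself appears verbatim in nmi_free_per_cpu() below.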
Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/nmi.h | 10 ++++- arch/s390/kernel/nmi.c | 82 +++++++++++++++++++++++++++++++++++++ arch/s390/kernel/setup.c | 11 +---- arch/s390/kernel/smp.c | 43 ++++--------------- 4 files changed, 99 insertions(+), 47 deletions(-) diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 77a7d9445e33..ed41c424448d 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -80,6 +80,8 @@ union mci { #define MCESA_ORIGIN_MASK (~0x3ffUL) #define MCESA_LC_MASK (0xfUL) +#define MCESA_MIN_SIZE (1024) +#define MCESA_MAX_SIZE (2048) struct mcesa { u8 vector_save_area[1024]; @@ -88,8 +90,12 @@ struct mcesa { struct pt_regs; -extern void s390_handle_mcck(void); -extern void s390_do_machine_check(struct pt_regs *regs); +void nmi_alloc_boot_cpu(struct lowcore *lc); +int nmi_alloc_per_cpu(struct lowcore *lc); +void nmi_free_per_cpu(struct lowcore *lc); + +void s390_handle_mcck(void); +void s390_do_machine_check(struct pt_regs *regs); #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_NMI_H */ diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index eb3e702cee30..7f6779695a43 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -38,6 +40,86 @@ struct mcck_struct { }; static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); +static struct kmem_cache *mcesa_cache; +static unsigned long mcesa_origin_lc; + +static inline int nmi_needs_mcesa(void) +{ + return MACHINE_HAS_VX || MACHINE_HAS_GS; +} + +static inline unsigned long nmi_get_mcesa_size(void) +{ + if (MACHINE_HAS_GS) + return MCESA_MAX_SIZE; + return MCESA_MIN_SIZE; +} + +/* + * The initial machine check extended save area for the boot CPU. + * It will be replaced by nmi_init() with an allocated structure. + * The structure is required for machine check happening early in + * the boot process. 
+ */ +static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE); + +void __init nmi_alloc_boot_cpu(struct lowcore *lc) +{ + if (!nmi_needs_mcesa()) + return; + lc->mcesad = (unsigned long) &boot_mcesa; + if (MACHINE_HAS_GS) + lc->mcesad |= ilog2(MCESA_MAX_SIZE); +} + +static int __init nmi_init(void) +{ + unsigned long origin, cr0, size; + + if (!nmi_needs_mcesa()) + return 0; + size = nmi_get_mcesa_size(); + if (size > MCESA_MIN_SIZE) + mcesa_origin_lc = ilog2(size); + /* create slab cache for the machine-check-extended-save-areas */ + mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL); + if (!mcesa_cache) + panic("Couldn't create nmi save area cache"); + origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); + if (!origin) + panic("Couldn't allocate nmi save area"); + /* The pointer is stored with mcesa_bits ORed in */ + kmemleak_not_leak((void *) origin); + __ctl_store(cr0, 0, 0); + __ctl_clear_bit(0, 28); /* disable lowcore protection */ + /* Replace boot_mcesa on the boot CPU */ + S390_lowcore.mcesad = origin | mcesa_origin_lc; + __ctl_load(cr0, 0, 0); + return 0; +} +early_initcall(nmi_init); + +int nmi_alloc_per_cpu(struct lowcore *lc) +{ + unsigned long origin; + + if (!nmi_needs_mcesa()) + return 0; + origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); + if (!origin) + return -ENOMEM; + /* The pointer is stored with mcesa_bits ORed in */ + kmemleak_not_leak((void *) origin); + lc->mcesad = origin | mcesa_origin_lc; + return 0; +} + +void nmi_free_per_cpu(struct lowcore *lc) +{ + if (!nmi_needs_mcesa()) + return; + kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK)); +} static notrace void s390_handle_damage(void) { diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index bf139f9e120e..b0943ef8cc31 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -340,15 +341,7 @@ static void __init setup_lowcore(void) lc->stfl_fac_list = S390_lowcore.stfl_fac_list; memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, MAX_FACILITY_BIT/8); - if (MACHINE_HAS_VX || MACHINE_HAS_GS) { - unsigned long bits, size; - - bits = MACHINE_HAS_GS ? 
11 : 10; - size = 1UL << bits; - lc->mcesad = (__u64) memblock_virt_alloc(size, size); - if (MACHINE_HAS_GS) - lc->mcesad |= bits; - } + nmi_alloc_boot_cpu(lc); vdso_alloc_boot_cpu(lc); lc->sync_enter_timer = S390_lowcore.sync_enter_timer; lc->async_enter_timer = S390_lowcore.async_enter_timer; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2dba3e88a972..6d17ff46b749 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -81,8 +81,6 @@ struct pcpu { static u8 boot_core_type; static struct pcpu pcpu_devices[NR_CPUS]; -static struct kmem_cache *pcpu_mcesa_cache; - unsigned int smp_cpu_mt_shift; EXPORT_SYMBOL(smp_cpu_mt_shift); @@ -193,10 +191,8 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit) static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) { unsigned long async_stack, panic_stack; - unsigned long mcesa_origin, mcesa_bits; struct lowcore *lc; - mcesa_origin = mcesa_bits = 0; if (pcpu != &pcpu_devices[0]) { pcpu->lowcore = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); @@ -204,40 +200,30 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) panic_stack = __get_free_page(GFP_KERNEL); if (!pcpu->lowcore || !panic_stack || !async_stack) goto out; - if (MACHINE_HAS_VX || MACHINE_HAS_GS) { - mcesa_origin = (unsigned long) - kmem_cache_alloc(pcpu_mcesa_cache, GFP_KERNEL); - if (!mcesa_origin) - goto out; - /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) mcesa_origin); - mcesa_bits = MACHINE_HAS_GS ? 11 : 0; - } } else { async_stack = pcpu->lowcore->async_stack - ASYNC_FRAME_OFFSET; panic_stack = pcpu->lowcore->panic_stack - PANIC_FRAME_OFFSET; - mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK; - mcesa_bits = pcpu->lowcore->mcesad & MCESA_LC_MASK; } lc = pcpu->lowcore; memcpy(lc, &S390_lowcore, 512); memset((char *) lc + 512, 0, sizeof(*lc) - 512); lc->async_stack = async_stack + ASYNC_FRAME_OFFSET; lc->panic_stack = panic_stack + PANIC_FRAME_OFFSET; - lc->mcesad = mcesa_origin | mcesa_bits; lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; - if (vdso_alloc_per_cpu(lc)) + if (nmi_alloc_per_cpu(lc)) goto out; + if (vdso_alloc_per_cpu(lc)) + goto out_mcesa; lowcore_ptr[cpu] = lc; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc); return 0; + +out_mcesa: + nmi_free_per_cpu(lc); out: if (pcpu != &pcpu_devices[0]) { - if (mcesa_origin) - kmem_cache_free(pcpu_mcesa_cache, - (void *) mcesa_origin); free_page(panic_stack); free_pages(async_stack, ASYNC_ORDER); free_pages((unsigned long) pcpu->lowcore, LC_ORDER); @@ -249,17 +235,12 @@ out: static void pcpu_free_lowcore(struct pcpu *pcpu) { - unsigned long mcesa_origin; - pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); lowcore_ptr[pcpu - pcpu_devices] = NULL; vdso_free_per_cpu(pcpu->lowcore); + nmi_free_per_cpu(pcpu->lowcore); if (pcpu == &pcpu_devices[0]) return; - if (MACHINE_HAS_VX || MACHINE_HAS_GS) { - mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK; - kmem_cache_free(pcpu_mcesa_cache, (void *) mcesa_origin); - } free_page(pcpu->lowcore->panic_stack-PANIC_FRAME_OFFSET); free_pages(pcpu->lowcore->async_stack-ASYNC_FRAME_OFFSET, ASYNC_ORDER); free_pages((unsigned long) pcpu->lowcore, LC_ORDER); @@ -936,22 +917,12 @@ void __init smp_fill_possible_mask(void) void __init smp_prepare_cpus(unsigned int max_cpus) { - unsigned long size; - /* request the 0x1201 emergency signal external interrupt */ if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) panic("Couldn't 
request external interrupt 0x1201"); /* request the 0x1202 external call external interrupt */ if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); - /* create slab cache for the machine-check-extended-save-areas */ - if (MACHINE_HAS_VX || MACHINE_HAS_GS) { - size = 1UL << (MACHINE_HAS_GS ? 11 : 10); - pcpu_mcesa_cache = kmem_cache_create("nmi_save_areas", - size, size, 0, NULL); - if (!pcpu_mcesa_cache) - panic("Couldn't create nmi save area cache"); - } } void __init smp_prepare_boot_cpu(void) From 3037a52f9846b9d6e233274453f2d4117a14f31b Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 12 Oct 2017 13:24:48 +0200 Subject: [PATCH 0454/1085] s390/nmi: do register validation as early as possible The validation of the CPU registers in the machine check handler is currently split into two parts. The first part is done at the start of the low level mcck_int_handler function, this includes the CPU timer register and the general purpose registers. The second part is done a bit later in s390_do_machine_check for all the other registers, including the control registers, floating pointer control, vector or floating pointer registers, the access registers, the guarded storage registers, the TOD programmable registers and the clock comparator. This is working fine to far but in theory a future extensions could cause the C code to use registers that are not validated yet. A better approach is to validate all CPU registers in "safe" assembler code before any C function is called. Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ctl_reg.h | 2 + arch/s390/include/asm/nmi.h | 3 + arch/s390/kernel/asm-offsets.c | 5 ++ arch/s390/kernel/entry.S | 60 +++++++++++++++-- arch/s390/kernel/nmi.c | 110 +++++++------------------------- 5 files changed, 85 insertions(+), 95 deletions(-) diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index 0095a40886e1..4d4f35f705c7 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -9,6 +9,8 @@ #include +#define CR2_GUARDED_STORAGE _BITUL(63 - 59) + #define CR14_CHANNEL_REPORT_SUBMASK _BITUL(63 - 35) #define CR14_RECOVERY_SUBMASK _BITUL(63 - 36) #define CR14_DEGRADATION_SUBMASK _BITUL(63 - 37) diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index ed41c424448d..7472bf316a2f 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -25,6 +25,9 @@ #define MCCK_CODE_CPU_TIMER_VALID _BITUL(63 - 46) #define MCCK_CODE_PSW_MWP_VALID _BITUL(63 - 20) #define MCCK_CODE_PSW_IA_VALID _BITUL(63 - 23) +#define MCCK_CODE_CR_VALID _BITUL(63 - 29) +#define MCCK_CODE_GS_VALID _BITUL(63 - 36) +#define MCCK_CODE_FC_VALID _BITUL(63 - 43) #ifndef __ASSEMBLY__ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 3d42f91c95fd..1f33c0193a89 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * Make sure that the compiler is new enough. 
We want a compiler that @@ -158,6 +159,7 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); + OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator); OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); @@ -193,6 +195,9 @@ int main(void) OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); BLANK(); + /* extended machine check save area */ + OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area); + BLANK(); /* gmap/sie offsets */ OFFSET(__GMAP_ASCE, gmap, asce); OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 21900e1cee9c..9887d3ed6eb1 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -948,15 +949,56 @@ load_fpu_regs: */ ENTRY(mcck_int_handler) STCK __LC_MCCK_CLOCK - la %r1,4095 # revalidate r1 - spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer - lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs + la %r1,4095 # validate r1 + spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer + sckc __LC_CLOCK_COMPARATOR # validate comparator + lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_MCK_OLD_PSW TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE jo .Lmcck_panic # yes -> rest of mcck code invalid - lghi %r14,__LC_CPU_TIMER_SAVE_AREA + TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID + jno .Lmcck_panic # control registers invalid -> panic + la %r14,4095 + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs + ptlb + lg %r11,__LC_MCESAD # extended machine check save area + nill %r11,0xfc00 # MCESA_ORIGIN_MASK + TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE + jno 0f + TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID + jno 0f + .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC +0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14) + TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID + jo 0f + sr %r14,%r14 +0: sfpc %r14 + TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX + jo 0f + lghi %r14,__LC_FPREGS_SAVE_AREA + ld %f0,0(%r14) + ld %f1,8(%r14) + ld %f2,16(%r14) + ld %f3,24(%r14) + ld %f4,32(%r14) + ld %f5,40(%r14) + ld %f6,48(%r14) + ld %f7,56(%r14) + ld %f8,64(%r14) + ld %f9,72(%r14) + ld %f10,80(%r14) + ld %f11,88(%r14) + ld %f12,96(%r14) + ld %f13,104(%r14) + ld %f14,112(%r14) + ld %f15,120(%r14) + j 1f +0: VLM %v0,%v15,0,%r11 + VLM %v16,%v31,256,%r11 +1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID jo 3f @@ -972,9 +1014,13 @@ ENTRY(mcck_int_handler) la %r14,__LC_LAST_UPDATE_TIMER 2: spt 0(%r14) mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) -3: TSTMSK __LC_MCCK_CODE,(MCCK_CODE_PSW_MWP_VALID|MCCK_CODE_PSW_IA_VALID) - jno .Lmcck_panic # no -> skip cleanup critical - SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER +3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID + jno .Lmcck_panic + tmhh %r8,0x0001 # interrupting from user ? 
+ jnz 4f + TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID + jno .Lmcck_panic +4: SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER .Lmcck_skip: lghi %r14,__LC_GPREGS_SAVE_AREA+64 stmg %r0,%r7,__PT_R0(%r11) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 7f6779695a43..3f3cda41f32a 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -184,19 +184,16 @@ void s390_handle_mcck(void) EXPORT_SYMBOL_GPL(s390_handle_mcck); /* - * returns 0 if all registers could be validated + * returns 0 if all required registers are available * returns 1 otherwise */ -static int notrace s390_validate_registers(union mci mci, int umode) +static int notrace s390_check_registers(union mci mci, int umode) { union ctlreg2 cr2; int kill_task; - u64 zero; void *fpt_save_area; - struct mcesa *mcesa; kill_task = 0; - zero = 0; if (!mci.gr) { /* @@ -207,18 +204,13 @@ static int notrace s390_validate_registers(union mci mci, int umode) s390_handle_damage(); kill_task = 1; } - /* Validate control registers */ + /* Check control registers */ if (!mci.cr) { /* * Control registers have unknown contents. * Can't recover and therefore stopping machine. */ s390_handle_damage(); - } else { - asm volatile( - " lctlg 0,15,0(%0)\n" - " ptlb\n" - : : "a" (&S390_lowcore.cregs_save_area) : "memory"); } if (!mci.fp) { /* @@ -226,7 +218,6 @@ static int notrace s390_validate_registers(union mci mci, int umode) * kernel currently uses floating point registers the * system is stopped. If the process has its floating * pointer registers loaded it is terminated. - * Otherwise just revalidate the registers. */ if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7) s390_handle_damage(); @@ -240,72 +231,29 @@ static int notrace s390_validate_registers(union mci mci, int umode) * If the kernel currently uses the floating pointer * registers and needs the FPC register the system is * stopped. If the process has its floating pointer - * registers loaded it is terminated. Otherwiese the - * FPC is just revalidated. + * registers loaded it is terminated. */ if (S390_lowcore.fpu_flags & KERNEL_FPC) s390_handle_damage(); - asm volatile("lfpc %0" : : "Q" (zero)); if (!test_cpu_flag(CIF_FPU)) kill_task = 1; - } else { - asm volatile("lfpc %0" - : : "Q" (S390_lowcore.fpt_creg_save_area)); } - mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); - if (!MACHINE_HAS_VX) { - /* Validate floating point registers */ - asm volatile( - " ld 0,0(%0)\n" - " ld 1,8(%0)\n" - " ld 2,16(%0)\n" - " ld 3,24(%0)\n" - " ld 4,32(%0)\n" - " ld 5,40(%0)\n" - " ld 6,48(%0)\n" - " ld 7,56(%0)\n" - " ld 8,64(%0)\n" - " ld 9,72(%0)\n" - " ld 10,80(%0)\n" - " ld 11,88(%0)\n" - " ld 12,96(%0)\n" - " ld 13,104(%0)\n" - " ld 14,112(%0)\n" - " ld 15,120(%0)\n" - : : "a" (fpt_save_area) : "memory"); - } else { - /* Validate vector registers */ - union ctlreg0 cr0; - + if (MACHINE_HAS_VX) { if (!mci.vr) { /* * Vector registers can't be restored. If the kernel * currently uses vector registers the system is * stopped. If the process has its vector registers - * loaded it is terminated. Otherwise just revalidate - * the registers. + * loaded it is terminated. 
*/ if (S390_lowcore.fpu_flags & KERNEL_VXR) s390_handle_damage(); if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } - cr0.val = S390_lowcore.cregs_save_area[0]; - cr0.afp = cr0.vx = 1; - __ctl_load(cr0.val, 0, 0); - asm volatile( - " la 1,%0\n" - " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ - " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ - : : "Q" (*(struct vx_array *) mcesa->vector_save_area) - : "1"); - __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); } - /* Validate access registers */ - asm volatile( - " lam 0,15,0(%0)" - : : "a" (&S390_lowcore.access_regs_save_area)); + /* Check if access registers are valid */ if (!mci.ar) { /* * Access registers have unknown contents. @@ -313,55 +261,41 @@ static int notrace s390_validate_registers(union mci mci, int umode) */ kill_task = 1; } - /* Validate guarded storage registers */ + /* Check guarded storage registers */ cr2.val = S390_lowcore.cregs_save_area[2]; if (cr2.gse) { - if (!mci.gs) + if (!mci.gs) { /* * Guarded storage register can't be restored and * the current processes uses guarded storage. * It has to be terminated. */ kill_task = 1; - else - load_gs_cb((struct gs_cb *) - mcesa->guarded_storage_save_area); + } } - /* - * We don't even try to validate the TOD register, since we simply - * can't write something sensible into that register. - */ - /* - * See if we can validate the TOD programmable register with its - * old contents (should be zero) otherwise set it to zero. - */ - if (!mci.pr) - asm volatile( - " sr 0,0\n" - " sckpf" - : : : "0", "cc"); - else - asm volatile( - " l 0,%0\n" - " sckpf" - : : "Q" (S390_lowcore.tod_progreg_save_area) - : "0", "cc"); - /* Validate clock comparator register */ - set_clock_comparator(S390_lowcore.clock_comparator); /* Check if old PSW is valid */ - if (!mci.wp) + if (!mci.wp) { /* * Can't tell if we come from user or kernel mode * -> stopping machine. */ s390_handle_damage(); + } + /* Check for invalid kernel instruction address */ + if (!mci.ia && !umode) { + /* + * The instruction address got lost while running + * in the kernel -> stopping machine. + */ + s390_handle_damage(); + } if (!mci.ms || !mci.pm || !mci.ia) kill_task = 1; return kill_task; } -NOKPROBE_SYMBOL(s390_validate_registers); +NOKPROBE_SYMBOL(s390_check_registers); /* * Backup the guest's machine check info to its description block @@ -460,7 +394,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) s390_handle_damage(); } } - if (s390_validate_registers(mci, user_mode(regs))) { + if (s390_check_registers(mci, user_mode(regs))) { /* * Couldn't restore all register contents for the * user space process -> mark task for termination. From 4845688d6a86d411a7622148e4f39d29b51e92cd Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 15 Oct 2017 11:24:08 +0200 Subject: [PATCH 0455/1085] docs: dev-tools: correct Coccinelle version number There is no Coccinelle version 1.2. 1.0.2 must be what was intended. Signed-off-by: Julia Lawall Signed-off-by: Jonathan Corbet --- Documentation/dev-tools/coccinelle.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/dev-tools/coccinelle.rst b/Documentation/dev-tools/coccinelle.rst index 4a64b4c69d3f..37e474ff6911 100644 --- a/Documentation/dev-tools/coccinelle.rst +++ b/Documentation/dev-tools/coccinelle.rst @@ -209,7 +209,7 @@ err.log will now have the profiling information, while stdout will provide some progress information as Coccinelle moves forward with work. -DEBUG_FILE support is only supported when using coccinelle >= 1.2. 
+DEBUG_FILE support is only supported when using coccinelle >= 1.0.2. .cocciconfig support -------------------- From 796cacdda786cb5466277ae51465a798f3c665b4 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Thu, 12 Oct 2017 15:23:36 -0500 Subject: [PATCH 0456/1085] Documentation: fix locking rt-mutex doc refs Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/locking/rt-mutex-design.txt | 2 +- Documentation/pi-futex.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt index 6c6e8c2410de..3d7b865539cc 100644 --- a/Documentation/locking/rt-mutex-design.txt +++ b/Documentation/locking/rt-mutex-design.txt @@ -8,7 +8,7 @@ RT-mutex implementation design This document tries to describe the design of the rtmutex.c implementation. It doesn't describe the reasons why rtmutex.c exists. For that please see -Documentation/rt-mutex.txt. Although this document does explain problems +Documentation/locking/rt-mutex.txt. Although this document does explain problems that happen without this code, but that is in the concept to understand what the code actually is doing. diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt index aafddbee7377..b154f6c0c36e 100644 --- a/Documentation/pi-futex.txt +++ b/Documentation/pi-futex.txt @@ -119,4 +119,4 @@ properties of futexes, and all four combinations are possible: futex, robust-futex, PI-futex, robust+PI-futex. More details about priority inheritance can be found in -Documentation/rt-mutex.txt. +Documentation/locking/rt-mutex.txt. From f66d9066c09fd65ed6fb418a422b17dcf9bdcd94 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Thu, 12 Oct 2017 15:23:42 -0500 Subject: [PATCH 0457/1085] Documentation: fix ref to sphinx/kerneldoc.py Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/doc-guide/kernel-doc.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst index b24854b5d6be..0268335414ce 100644 --- a/Documentation/doc-guide/kernel-doc.rst +++ b/Documentation/doc-guide/kernel-doc.rst @@ -65,7 +65,7 @@ Without options, the kernel-doc directive includes all documentation comments from the source file. The kernel-doc extension is included in the kernel source tree, at -``Documentation/sphinx/kernel-doc.py``. Internally, it uses the +``Documentation/sphinx/kerneldoc.py``. Internally, it uses the ``scripts/kernel-doc`` script to extract the documentation comments from the source. From d354a6f150d873440975263530a2e42121d50a51 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Thu, 12 Oct 2017 15:23:48 -0500 Subject: [PATCH 0458/1085] Documentation: fix ref to workqueue content Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- .../Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html index e5d0bbd0230b..7394f034be65 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html @@ -527,7 +527,7 @@ grace period also drove it to completion. 
This straightforward approach had the disadvantage of needing to account for POSIX signals sent to user tasks, so more recent implemementations use the Linux kernel's -workqueues. +workqueues.

The requesting task still does counter snapshotting and funnel-lock From 2c0c5c208bb8233d2d76327e3379375c01785c4d Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Thu, 12 Oct 2017 15:23:53 -0500 Subject: [PATCH 0459/1085] Documentation: fix ref to coccinelle content Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/process/4.Coding.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index 6df19943dd4d..26b106071364 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -307,7 +307,7 @@ variety of potential coding problems; it can also propose fixes for those problems. Quite a few "semantic patches" for the kernel have been packaged under the scripts/coccinelle directory; running "make coccicheck" will run through those semantic patches and report on any problems found. See -Documentation/coccinelle.txt for more information. +Documentation/dev-tools/coccinelle.rst for more information. Other kinds of portability errors are best found by compiling your code for other architectures. If you do not happen to have an S/390 system or a From bb93e01b13821852a41f01b124cc71e94f5d3811 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Thu, 12 Oct 2017 15:23:59 -0500 Subject: [PATCH 0460/1085] Documentation: fix ref to trace stm content Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/trace/intel_th.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/intel_th.txt b/Documentation/trace/intel_th.txt index f92070e7dde0..7a57165c2492 100644 --- a/Documentation/trace/intel_th.txt +++ b/Documentation/trace/intel_th.txt @@ -37,7 +37,7 @@ description is at Documentation/ABI/testing/sysfs-bus-intel_th-devices-gth. STH registers an stm class device, through which it provides interface to userspace and kernelspace software trace sources. See -Documentation/tracing/stm.txt for more information on that. +Documentation/trace/stm.txt for more information on that. MSU can be configured to collect trace data into a system memory buffer, which can later on be read from its device nodes via read() or From 4493c1f01b139192e2490457577dccff227987b7 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Thu, 12 Oct 2017 15:24:05 -0500 Subject: [PATCH 0461/1085] Documentation: fix ref to power basic-pm-debugging Signed-off-by: Tom Saeger Acked-by: Rafael J. Wysocki Signed-off-by: Jonathan Corbet --- Documentation/power/interface.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt index 7dc75f48e8bd..27df7f98668a 100644 --- a/Documentation/power/interface.txt +++ b/Documentation/power/interface.txt @@ -43,7 +43,7 @@ The currently selected option is printed in square brackets. The 'platform' option is only available if the platform provides a special mechanism to put the system to sleep after creating a hibernation image (ACPI does that, for example). The 'suspend' option is available if Suspend-to-RAM -is supported. Refer to Documentation/power/basic_pm_debugging.txt for the +is supported. Refer to Documentation/power/basic-pm-debugging.txt for the description of the 'test_resume' option. To select an option, write the string representing it to /sys/power/disk. 
From 718d50ec782caad13e0cc78fa6a76ff2226f3dd3 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Thu, 12 Oct 2017 15:24:10 -0500 Subject: [PATCH 0462/1085] Documentation: fix selftests related file refs Make refs to selftests files valid including: - watchdog-test.c - dnotify_test.c Signed-off-by: Tom Saeger Signed-off-by: Jonathan Corbet --- Documentation/filesystems/dnotify.txt | 2 +- Documentation/watchdog/hpwdt.txt | 2 +- Documentation/watchdog/pcwd-watchdog.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt index 6baf88f46859..15156883d321 100644 --- a/Documentation/filesystems/dnotify.txt +++ b/Documentation/filesystems/dnotify.txt @@ -62,7 +62,7 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL. Example ------- -See Documentation/filesystems/dnotify_test.c for an example. +See tools/testing/selftests/filesystems/dnotify_test.c for an example. NOTE ---- diff --git a/Documentation/watchdog/hpwdt.txt b/Documentation/watchdog/hpwdt.txt index 7a9f635d0258..6d866c537127 100644 --- a/Documentation/watchdog/hpwdt.txt +++ b/Documentation/watchdog/hpwdt.txt @@ -15,7 +15,7 @@ Last reviewed: 05/20/2016 Watchdog functionality is enabled like any other common watchdog driver. That is, an application needs to be started that kicks off the watchdog timer. A - basic application exists in the Documentation/watchdog/src directory called + basic application exists in tools/testing/selftests/watchdog/ named watchdog-test.c. Simply compile the C file and kick it off. If the system gets into a bad state and hangs, the HPE ProLiant iLO timer register will not be updated in a timely fashion and a hardware system reset (also known as diff --git a/Documentation/watchdog/pcwd-watchdog.txt b/Documentation/watchdog/pcwd-watchdog.txt index 4f68052395c0..b8e60a441a43 100644 --- a/Documentation/watchdog/pcwd-watchdog.txt +++ b/Documentation/watchdog/pcwd-watchdog.txt @@ -25,7 +25,7 @@ Last reviewed: 10/05/2007 If you want to write a program to be compatible with the PC Watchdog driver, simply use of modify the watchdog test program: - Documentation/watchdog/src/watchdog-test.c + tools/testing/selftests/watchdog/watchdog-test.c Other IOCTL functions include: From 7d7363e403ce959941f80684cc5f33e747afff17 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2017 16:32:51 -0700 Subject: [PATCH 0463/1085] documentation: kernel-api: add more info on bitmap functions There are some good comments about bitmap operations in lib/bitmap.c and include/linux/bitmap.h, so format them for document generation and pull them into core-api/kernel-api.rst. I converted the "tables" of functions from using tabs to using spaces so that they are more readable in the source file and in the generated output. Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 12 +++ include/linux/bitmap.h | 109 ++++++++++++++------------ lib/bitmap.c | 4 +- 3 files changed, 74 insertions(+), 51 deletions(-) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 9c6902ba6e67..81754add8a49 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -53,6 +53,18 @@ The Linux kernel provides more basic utility functions. Bitmap Operations ----------------- +.. kernel-doc:: lib/bitmap.c + :doc: bitmap introduction + +.. kernel-doc:: include/linux/bitmap.h + :doc: declare bitmap + +.. 
kernel-doc:: include/linux/bitmap.h + :doc: bitmap overview + +.. kernel-doc:: include/linux/bitmap.h + :doc: bitmap bitops + .. kernel-doc:: lib/bitmap.c :export: diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5c4178016b1e..d9974c7a0a61 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -21,65 +21,74 @@ * See lib/bitmap.c for more details. */ -/* +/** + * DOC: bitmap overview + * * The available bitmap operations and their rough meaning in the * case that the bitmap is a single unsigned long are thus: * * Note that nbits should be always a compile time evaluable constant. * Otherwise many inlines will generate horrible code. * - * bitmap_zero(dst, nbits) *dst = 0UL - * bitmap_fill(dst, nbits) *dst = ~0UL - * bitmap_copy(dst, src, nbits) *dst = *src - * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 - * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 - * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 - * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) - * bitmap_complement(dst, src, nbits) *dst = ~(*src) - * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? - * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? - * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? - * bitmap_empty(src, nbits) Are all bits zero in *src? - * bitmap_full(src, nbits) Are all bits set in *src? - * bitmap_weight(src, nbits) Hamming Weight: number set bits - * bitmap_set(dst, pos, nbits) Set specified bit area - * bitmap_clear(dst, pos, nbits) Clear specified bit area - * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area - * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above - * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n - * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n - * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) - * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) - * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap - * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz - * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf - * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf - * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf - * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf - * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region - * bitmap_release_region(bitmap, pos, order) Free specified bit region - * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region - * bitmap_from_u32array(dst, nbits, buf, nwords) *dst = *buf (nwords 32b words) - * bitmap_to_u32array(buf, nwords, src, nbits) *buf = *dst (nwords 32b words) - */ - -/* - * Also the following operations in asm/bitops.h apply to bitmaps. + * :: + * + * bitmap_zero(dst, nbits) *dst = 0UL + * bitmap_fill(dst, nbits) *dst = ~0UL + * bitmap_copy(dst, src, nbits) *dst = *src + * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 + * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 + * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 + * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) + * bitmap_complement(dst, src, nbits) *dst = ~(*src) + * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? + * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? + * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? 
+ * bitmap_empty(src, nbits) Are all bits zero in *src? + * bitmap_full(src, nbits) Are all bits set in *src? + * bitmap_weight(src, nbits) Hamming Weight: number set bits + * bitmap_set(dst, pos, nbits) Set specified bit area + * bitmap_clear(dst, pos, nbits) Clear specified bit area + * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area + * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above + * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n + * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n + * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) + * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) + * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap + * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz + * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf + * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf + * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf + * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf + * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region + * bitmap_release_region(bitmap, pos, order) Free specified bit region + * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region + * bitmap_from_u32array(dst, nbits, buf, nwords) *dst = *buf (nwords 32b words) + * bitmap_to_u32array(buf, nwords, src, nbits) *buf = *dst (nwords 32b words) * - * set_bit(bit, addr) *addr |= bit - * clear_bit(bit, addr) *addr &= ~bit - * change_bit(bit, addr) *addr ^= bit - * test_bit(bit, addr) Is bit set in *addr? - * test_and_set_bit(bit, addr) Set bit and return old value - * test_and_clear_bit(bit, addr) Clear bit and return old value - * test_and_change_bit(bit, addr) Change bit and return old value - * find_first_zero_bit(addr, nbits) Position first zero bit in *addr - * find_first_bit(addr, nbits) Position first set bit in *addr - * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit - * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit */ -/* +/** + * DOC: bitmap bitops + * + * Also the following operations in asm/bitops.h apply to bitmaps.:: + * + * set_bit(bit, addr) *addr |= bit + * clear_bit(bit, addr) *addr &= ~bit + * change_bit(bit, addr) *addr ^= bit + * test_bit(bit, addr) Is bit set in *addr? + * test_and_set_bit(bit, addr) Set bit and return old value + * test_and_clear_bit(bit, addr) Clear bit and return old value + * test_and_change_bit(bit, addr) Change bit and return old value + * find_first_zero_bit(addr, nbits) Position first zero bit in *addr + * find_first_bit(addr, nbits) Position first set bit in *addr + * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit + * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit + * + */ + +/** + * DOC: declare bitmap * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used * to declare an array named 'name' of just enough unsigned longs to * contain all bit positions from 0 to 'bits' - 1. diff --git a/lib/bitmap.c b/lib/bitmap.c index c82c61b66e16..d8f0c094b18e 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -18,7 +18,9 @@ #include -/* +/** + * DOC: bitmap introduction + * * bitmaps provide an array of bits, implemented using an an * array of unsigned longs. 
The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of From 0d6942348c71cc3246d166245e811841b7388587 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 16:02:25 -0700 Subject: [PATCH 0464/1085] samples: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Signed-off-by: Kees Cook Signed-off-by: Jonathan Corbet --- samples/connector/cn_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c index d12cc944b696..95cd06f4ec1e 100644 --- a/samples/connector/cn_test.c +++ b/samples/connector/cn_test.c @@ -125,12 +125,12 @@ static int cn_test_want_notify(void) #endif static u32 cn_test_timer_counter; -static void cn_test_timer_func(unsigned long __data) +static void cn_test_timer_func(struct timer_list *unused) { struct cn_msg *m; char data[32]; - pr_debug("%s: timer fired with data %lu\n", __func__, __data); + pr_debug("%s: timer fired\n", __func__); m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC); if (m) { @@ -168,7 +168,7 @@ static int cn_test_init(void) goto err_out; } - setup_timer(&cn_test_timer, cn_test_timer_func, 0); + timer_setup(&cn_test_timer, cn_test_timer_func, 0); mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); pr_info("initialized with id={%u.%u}\n", From b88697810d7c1d102a529990f9071b0f14cfe6df Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 18 Oct 2017 08:33:44 -0700 Subject: [PATCH 0465/1085] rcu: Do not include rtmutex_common.h unconditionally This commit adjusts include files and provides definitions in preparation for suppressing lockdep false-positive ->boost_mtx complaints. Without this preparation, architectures not supporting rt_mutex will get build failures. Reported-by: kbuild test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fed95fa941e6..969eae45f05d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -54,6 +54,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); * This probably needs to be excluded from -rt builds. */ #define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) +#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) #endif /* #else #ifdef CONFIG_RCU_BOOST */ @@ -911,8 +912,6 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "../locking/rtmutex_common.h" - static void rcu_wake_cond(struct task_struct *t, int status) { /* From 02a7c234e54052101164368ff981bd72f7acdd65 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Sep 2017 15:36:42 -0700 Subject: [PATCH 0466/1085] rcu: Suppress lockdep false-positive ->boost_mtx complaints RCU priority boosting uses rt_mutex_init_proxy_locked() to initialize an rt_mutex structure in locked state held by some other task. When that other task releases it, lockdep complains (quite accurately, but a bit uselessly) that the other task never acquired it. This complaint can suppress other, more helpful, lockdep complaints, and in any case it is a false positive. This commit therefore switches from rt_mutex_unlock() to rt_mutex_futex_unlock(), thereby avoiding the lockdep annotations. 
Of course, if lockdep ever learns about rt_mutex_init_proxy_locked(), additional adjustments will be required. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 969eae45f05d..1eaab96d1a3c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -531,7 +531,7 @@ void rcu_read_unlock_special(struct task_struct *t) /* Unboost if we were boosted. */ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_unlock(&rnp->boost_mtx); + rt_mutex_futex_unlock(&rnp->boost_mtx); /* * If this was the last task on the expedited lists, From c0da313e090d6e7130d0c3005245176296c24e4a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Sep 2017 09:58:47 -0700 Subject: [PATCH 0467/1085] rcu: Add extended-quiescent-state testing advice If you add or remove calls to rcu_idle_enter(), rcu_user_enter(), rcu_irq_exit(), rcu_irq_exit_irqson(), rcu_idle_exit(), rcu_user_exit(), rcu_irq_enter(), rcu_irq_enter_irqson(), rcu_nmi_enter(), or rcu_nmi_exit(), you should run a full set of tests on a kernel built with CONFIG_RCU_EQS_DEBUG=y. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0ad62b0e7b8..d234356d7afc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -837,6 +837,9 @@ static void rcu_eqs_enter(bool user) * We crowbar the ->dynticks_nesting field to zero to allow for * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. + * + * If you add or remove a call to rcu_idle_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_idle_enter(void) { @@ -852,6 +855,9 @@ void rcu_idle_enter(void) * is permitted between this call and rcu_user_exit(). This way the * CPU doesn't need to maintain the tick for RCU maintenance purposes * when the CPU runs in userspace. + * + * If you add or remove a call to rcu_user_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_user_enter(void) { @@ -875,6 +881,9 @@ void rcu_user_enter(void) * Use things like work queues to work around this limitation. * * You have been warned. + * + * If you add or remove a call to rcu_irq_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_exit(void) { @@ -899,6 +908,9 @@ void rcu_irq_exit(void) /* * Wrapper for rcu_irq_exit() where interrupts are enabled. + * + * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_exit_irqson(void) { @@ -971,6 +983,9 @@ static void rcu_eqs_exit(bool user) * allow for the possibility of usermode upcalls messing up our count * of interrupt nesting level during the busy period that is just * now starting. + * + * If you add or remove a call to rcu_idle_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_idle_exit(void) { @@ -987,6 +1002,9 @@ void rcu_idle_exit(void) * * Exit RCU idle mode while entering the kernel because it can * run a RCU read side critical section anytime. + * + * If you add or remove a call to rcu_user_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_user_exit(void) { @@ -1012,6 +1030,9 @@ void rcu_user_exit(void) * Use things like work queues to work around this limitation. * * You have been warned.
+ * + * If you add or remove a call to rcu_irq_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_enter(void) { @@ -1037,6 +1058,9 @@ void rcu_irq_enter(void) /* * Wrapper for rcu_irq_enter() where interrupts are enabled. + * + * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_enter_irqson(void) { @@ -1055,6 +1079,9 @@ void rcu_irq_enter_irqson(void) * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) + * + * If you add or remove a call to rcu_nmi_enter(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_nmi_enter(void) { @@ -1087,6 +1114,9 @@ void rcu_nmi_enter(void) * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_nmi_exit(void) { From 56628a7fc84a6e3c4c84886d70ec26da3d7f2ce4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 22 Sep 2017 17:28:06 +0200 Subject: [PATCH 0468/1085] rcu/segcblist: Include rcupdate.h The RT build on ARM complains about non-existing ULONG_CMP_LT. This commit therefore includes rcupdate.h into rcu_segcblist.c. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7649fcd2c4c7..88cba7c2956c 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rcu_segcblist.h" From 1843594c56bd79598a7ca4e6f7b6173e7d71b941 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 14:56:58 -0700 Subject: [PATCH 0469/1085] ahci: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Adds a pointer back to link structure. 
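The conversion pattern, sketched for readers who have not met timer_setup() yet (the identifiers are the ones this patch touches): the callback now receives the struct timer_list pointer and recovers its container with from_timer(), so context that used to travel in the unsigned long argument is stored in the containing structure instead:

	/* setup side: stash the context, register the callback */
	emp->link = link;
	timer_setup(&emp->timer, ahci_sw_activity_blink, 0);

	/* callback side: recover the container from the timer pointer */
	static void ahci_sw_activity_blink(struct timer_list *t)
	{
		struct ahci_em_priv *emp = from_timer(emp, t, timer);
		struct ata_link *link = emp->link;
		/* blink logic unchanged */
	}

from_timer(var, timer_ptr, field) is container_of() specialized for timers, which is why the new ->link back-pointer is needed.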
Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: Tejun Heo Cc: linux-ide@vger.kernel.org Link: https://lkml.kernel.org/r/20171016215658.GA101965@beast --- drivers/ata/ahci.h | 1 + drivers/ata/libahci.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 8b61123d2c3c..749fd94441b0 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -303,6 +303,7 @@ struct ahci_em_priv { unsigned long saved_activity; unsigned long activity; unsigned long led_state; + struct ata_link *link; }; struct ahci_port_priv { diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 3e286d86ab42..a0de7a38430c 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -968,12 +968,12 @@ static void ahci_sw_activity(struct ata_link *link) mod_timer(&emp->timer, jiffies + msecs_to_jiffies(10)); } -static void ahci_sw_activity_blink(unsigned long arg) +static void ahci_sw_activity_blink(struct timer_list *t) { - struct ata_link *link = (struct ata_link *)arg; + struct ahci_em_priv *emp = from_timer(emp, t, timer); + struct ata_link *link = emp->link; struct ata_port *ap = link->ap; - struct ahci_port_priv *pp = ap->private_data; - struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + unsigned long led_message = emp->led_state; u32 activity_led_state; unsigned long flags; @@ -1020,7 +1020,8 @@ static void ahci_init_sw_activity(struct ata_link *link) /* init activity stats, setup timer */ emp->saved_activity = emp->activity = 0; - setup_timer(&emp->timer, ahci_sw_activity_blink, (unsigned long)link); + emp->link = link; + timer_setup(&emp->timer, ahci_sw_activity_blink, 0); /* check our blink policy and set flag for link if it's enabled */ if (emp->blink_policy) From e20824e944c3bf4352fcd8d9f446c41b53901e7b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Sep 2017 15:46:41 +0200 Subject: [PATCH 0470/1085] dt-bindings: timer: renesas, cmt: Fix SoC-specific compatible values While the new family-specific compatible values introduced by commit 6f54cc1adcc8957f ("devicetree: bindings: R-Car Gen2 CMT0 and CMT1 bindings") use the recommended order ",-", the new SoC-specific compatible values still use the old and deprecated order ",-". Switch the SoC-specific compatible values to the recommended order while there are no upstream users of these compatible values yet. Fixes: 7f03a0ecfdc786c1 ("devicetree: bindings: r8a73a4 and R-Car Gen2 CMT bindings") Fixes: 63d9e8ca0dd4bfa4 ("devicetree: bindings: Deprecate property, update example") Signed-off-by: Geert Uytterhoeven Acked-by: Rob Herring Reviewed-by: Laurent Pinchart Signed-off-by: Daniel Lezcano --- .../devicetree/bindings/timer/renesas,cmt.txt | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt index 6ca6b9e582a0..d740989eb569 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt @@ -20,16 +20,16 @@ Required Properties: (CMT1 on sh73a0 and r8a7740) This is a fallback for the above renesas,cmt-48-* entries. - - "renesas,cmt0-r8a73a4" for the 32-bit CMT0 device included in r8a73a4. - - "renesas,cmt1-r8a73a4" for the 48-bit CMT1 device included in r8a73a4. - - "renesas,cmt0-r8a7790" for the 32-bit CMT0 device included in r8a7790. - - "renesas,cmt1-r8a7790" for the 48-bit CMT1 device included in r8a7790. 
- - "renesas,cmt0-r8a7791" for the 32-bit CMT0 device included in r8a7791. - - "renesas,cmt1-r8a7791" for the 48-bit CMT1 device included in r8a7791. - - "renesas,cmt0-r8a7793" for the 32-bit CMT0 device included in r8a7793. - - "renesas,cmt1-r8a7793" for the 48-bit CMT1 device included in r8a7793. - - "renesas,cmt0-r8a7794" for the 32-bit CMT0 device included in r8a7794. - - "renesas,cmt1-r8a7794" for the 48-bit CMT1 device included in r8a7794. + - "renesas,r8a73a4-cmt0" for the 32-bit CMT0 device included in r8a73a4. + - "renesas,r8a73a4-cmt1" for the 48-bit CMT1 device included in r8a73a4. + - "renesas,r8a7790-cmt0" for the 32-bit CMT0 device included in r8a7790. + - "renesas,r8a7790-cmt1" for the 48-bit CMT1 device included in r8a7790. + - "renesas,r8a7791-cmt0" for the 32-bit CMT0 device included in r8a7791. + - "renesas,r8a7791-cmt1" for the 48-bit CMT1 device included in r8a7791. + - "renesas,r8a7793-cmt0" for the 32-bit CMT0 device included in r8a7793. + - "renesas,r8a7793-cmt1" for the 48-bit CMT1 device included in r8a7793. + - "renesas,r8a7794-cmt0" for the 32-bit CMT0 device included in r8a7794. + - "renesas,r8a7794-cmt1" for the 48-bit CMT1 device included in r8a7794. - "renesas,rcar-gen2-cmt0" for 32-bit CMT0 devices included in R-Car Gen2. - "renesas,rcar-gen2-cmt1" for 48-bit CMT1 devices included in R-Car Gen2. @@ -46,7 +46,7 @@ Required Properties: Example: R8A7790 (R-Car H2) CMT0 and CMT1 nodes cmt0: timer@ffca0000 { - compatible = "renesas,cmt0-r8a7790", "renesas,rcar-gen2-cmt0"; + compatible = "renesas,r8a7790-cmt0", "renesas,rcar-gen2-cmt0"; reg = <0 0xffca0000 0 0x1004>; interrupts = <0 142 IRQ_TYPE_LEVEL_HIGH>, <0 142 IRQ_TYPE_LEVEL_HIGH>; @@ -55,7 +55,7 @@ Example: R8A7790 (R-Car H2) CMT0 and CMT1 nodes }; cmt1: timer@e6130000 { - compatible = "renesas,cmt1-r8a7790", "renesas,rcar-gen2-cmt1"; + compatible = "renesas,r8a7790-cmt1", "renesas,rcar-gen2-cmt1"; reg = <0 0xe6130000 0 0x1004>; interrupts = <0 120 IRQ_TYPE_LEVEL_HIGH>, <0 121 IRQ_TYPE_LEVEL_HIGH>, From 464eed841f54b56df35132434497235f06b154f6 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 18 Sep 2017 15:46:42 +0200 Subject: [PATCH 0471/1085] clocksource/drivers/sh_cmt: Use 0x3f mask for SH_CMT_48BIT case Always use 0x3f as channel mask for the SH_CMT_48BIT type of devices. Once this patch is applied the "renesas,channels-mask" property will be ignored by the driver for older devices matching SH_CMT_48BIT. In the future when all CMT types store channel mask in the driver then we will be able to deprecate and remove "renesas,channels-mask" from DTS. 
Signed-off-by: Magnus Damm Acked-by: Laurent Pinchart Reviewed-by: Laurent Pinchart Signed-off-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index e09e8bf0bb9b..c104c80424c8 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -74,6 +74,8 @@ enum sh_cmt_model { struct sh_cmt_info { enum sh_cmt_model model; + unsigned int channels_mask; + unsigned long width; /* 16 or 32 bit version of hardware block */ unsigned long overflow_bit; unsigned long clear_bits; @@ -212,6 +214,7 @@ static const struct sh_cmt_info sh_cmt_info[] = { }, [SH_CMT_48BIT] = { .model = SH_CMT_48BIT, + .channels_mask = 0x3f, .width = 32, .overflow_bit = SH_CMT32_CMCSR_CMF, .clear_bits = ~(SH_CMT32_CMCSR_CMF | SH_CMT32_CMCSR_OVF), @@ -966,9 +969,14 @@ static int sh_cmt_setup(struct sh_cmt_device *cmt, struct platform_device *pdev) id = of_match_node(sh_cmt_of_table, pdev->dev.of_node); cmt->info = id->data; - ret = sh_cmt_parse_dt(cmt); - if (ret < 0) - return ret; + /* prefer in-driver channel configuration over DT */ + if (cmt->info->channels_mask) { + cmt->hw_channels = cmt->info->channels_mask; + } else { + ret = sh_cmt_parse_dt(cmt); + if (ret < 0) + return ret; + } } else if (pdev->dev.platform_data) { struct sh_timer_config *cfg = pdev->dev.platform_data; const struct platform_device_id *id = pdev->id_entry; From 83c79a6d8d7f4821ba0712da57f2f51326f0c447 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 18 Sep 2017 15:46:43 +0200 Subject: [PATCH 0472/1085] clocksource/drivers/sh_cmt: Support separate R-Car Gen2 CMT0/1 Add support for the new R-Car Gen2 CMT0 and CMT1 bindings. Support for the old DT binding is still kept around; however, devices using such a binding will be treated as a low-feature CMT0 device. If users want to make use of CMT1-specific features then they need to update their DTBs. No special CMT1-specific features are however implemented by this patch, only DT bindings are redone as groundwork for future feature patches. Signed-off-by: Magnus Damm Acked-by: Laurent Pinchart Reviewed-by: Laurent Pinchart Signed-off-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 38 +++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index c104c80424c8..45af436483f3 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -39,16 +39,16 @@ struct sh_cmt_device; * SoC but also on the particular instance. The following table lists the main * characteristics of those flavours. * - * 16B 32B 32B-F 48B 48B-2 + * 16B 32B 32B-F 48B R-Car Gen2 * ----------------------------------------------------------------------------- * Channels 2 1/4 1 6 2/8 * Control Width 16 16 16 16 32 * Counter Width 16 32 32 32/48 32/48 * Shared Start/Stop Y Y Y Y N * - * The 48-bit gen2 version has a per-channel start/stop register located in the - * channel registers block. All other versions have a shared start/stop register - * located in the global space. + * The r8a73a4 / R-Car Gen2 version has a per-channel start/stop register + * located in the channel registers block. All other versions have a shared + * start/stop register located in the global space. * * Channels are indexed from 0 to N-1 in the documentation.
The channel index * infers the start/stop bit position in the control register and the channel @@ -68,7 +68,8 @@ enum sh_cmt_model { SH_CMT_32BIT, SH_CMT_32BIT_FAST, SH_CMT_48BIT, - SH_CMT_48BIT_GEN2, + SH_CMT0_RCAR_GEN2, + SH_CMT1_RCAR_GEN2, }; struct sh_cmt_info { @@ -223,8 +224,20 @@ static const struct sh_cmt_info sh_cmt_info[] = { .read_count = sh_cmt_read32, .write_count = sh_cmt_write32, }, - [SH_CMT_48BIT_GEN2] = { - .model = SH_CMT_48BIT_GEN2, + [SH_CMT0_RCAR_GEN2] = { + .model = SH_CMT0_RCAR_GEN2, + .channels_mask = 0x60, + .width = 32, + .overflow_bit = SH_CMT32_CMCSR_CMF, + .clear_bits = ~(SH_CMT32_CMCSR_CMF | SH_CMT32_CMCSR_OVF), + .read_control = sh_cmt_read32, + .write_control = sh_cmt_write32, + .read_count = sh_cmt_read32, + .write_count = sh_cmt_write32, + }, + [SH_CMT1_RCAR_GEN2] = { + .model = SH_CMT1_RCAR_GEN2, + .channels_mask = 0xff, .width = 32, .overflow_bit = SH_CMT32_CMCSR_CMF, .clear_bits = ~(SH_CMT32_CMCSR_CMF | SH_CMT32_CMCSR_OVF), @@ -862,6 +875,7 @@ static int sh_cmt_setup_channel(struct sh_cmt_channel *ch, unsigned int index, ch->cmt = cmt; ch->index = index; ch->hwidx = hwidx; + ch->timer_bit = hwidx; /* * Compute the address of the channel control register block. For the @@ -883,9 +897,11 @@ static int sh_cmt_setup_channel(struct sh_cmt_channel *ch, unsigned int index, */ ch->ioctrl = cmt->mapbase + 0x40; break; - case SH_CMT_48BIT_GEN2: + case SH_CMT0_RCAR_GEN2: + case SH_CMT1_RCAR_GEN2: ch->iostart = cmt->mapbase + ch->hwidx * 0x100; ch->ioctrl = ch->iostart + 0x10; + ch->timer_bit = 0; break; } @@ -897,8 +913,6 @@ static int sh_cmt_setup_channel(struct sh_cmt_channel *ch, unsigned int index, ch->match_value = ch->max_match_value; raw_spin_lock_init(&ch->lock); - ch->timer_bit = cmt->info->model == SH_CMT_48BIT_GEN2 ? 0 : ch->hwidx; - ret = sh_cmt_register(ch, dev_name(&cmt->pdev->dev), clockevent, clocksource); if (ret) { @@ -941,7 +955,9 @@ static const struct of_device_id sh_cmt_of_table[] __maybe_unused = { { .compatible = "renesas,cmt-32", .data = &sh_cmt_info[SH_CMT_32BIT] }, { .compatible = "renesas,cmt-32-fast", .data = &sh_cmt_info[SH_CMT_32BIT_FAST] }, { .compatible = "renesas,cmt-48", .data = &sh_cmt_info[SH_CMT_48BIT] }, - { .compatible = "renesas,cmt-48-gen2", .data = &sh_cmt_info[SH_CMT_48BIT_GEN2] }, + { .compatible = "renesas,cmt-48-gen2", .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] }, + { .compatible = "renesas,rcar-gen2-cmt0", .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] }, + { .compatible = "renesas,rcar-gen2-cmt1", .data = &sh_cmt_info[SH_CMT1_RCAR_GEN2] }, { } }; MODULE_DEVICE_TABLE(of, sh_cmt_of_table); From f11fb6df3c1924e3623d1afd1db23ea16c68fbb5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Sep 2017 15:46:44 +0200 Subject: [PATCH 0473/1085] clocksource/drivers/sh_cmt: Remove support for "renesas,cmt-32*" Remove driver matching support for the unused "renesas,cmt-32" and "renesas,cmt-32-fast" compatible values, cfr. commit 203bb3479958c48a ("devicetree: bindings: Remove unused 32-bit CMT bindings"). As this removes the last user of SH_CMT_32BIT_FAST, all support for this variant is removed from the driver. 
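All of these compatible-string changes lean on the same mechanism: each of_device_id entry carries a .data pointer to a per-model description, so adding or retiring a compatible value is a one-line table edit. A hedged, generic sketch of the pattern (timer_model_info and the "vendor,example-timer-48" string are illustrative, not taken from the driver):

#include <linux/mod_devicetable.h>
#include <linux/module.h>

struct timer_model_info {
	unsigned int width;		/* counter width in bits */
	unsigned int channels_mask;	/* channels implemented on this model */
};

static const struct timer_model_info example_48bit = {
	.width = 32,
	.channels_mask = 0x3f,
};

/* One entry per supported compatible; .data points at the model info. */
static const struct of_device_id example_of_table[] = {
	{ .compatible = "vendor,example-timer-48", .data = &example_48bit },
	{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, example_of_table);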
Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 45af436483f3..8546736e3bc8 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -66,7 +66,6 @@ struct sh_cmt_device; enum sh_cmt_model { SH_CMT_16BIT, SH_CMT_32BIT, - SH_CMT_32BIT_FAST, SH_CMT_48BIT, SH_CMT0_RCAR_GEN2, SH_CMT1_RCAR_GEN2, @@ -203,16 +202,6 @@ static const struct sh_cmt_info sh_cmt_info[] = { .read_count = sh_cmt_read32, .write_count = sh_cmt_write32, }, - [SH_CMT_32BIT_FAST] = { - .model = SH_CMT_32BIT_FAST, - .width = 32, - .overflow_bit = SH_CMT32_CMCSR_CMF, - .clear_bits = ~(SH_CMT32_CMCSR_CMF | SH_CMT32_CMCSR_OVF), - .read_control = sh_cmt_read16, - .write_control = sh_cmt_write16, - .read_count = sh_cmt_read32, - .write_count = sh_cmt_write32, - }, [SH_CMT_48BIT] = { .model = SH_CMT_48BIT, .channels_mask = 0x3f, @@ -890,13 +879,6 @@ static int sh_cmt_setup_channel(struct sh_cmt_channel *ch, unsigned int index, case SH_CMT_48BIT: ch->ioctrl = cmt->mapbase + 0x10 + ch->hwidx * 0x10; break; - case SH_CMT_32BIT_FAST: - /* - * The 32-bit "fast" timer has a single channel at hwidx 5 but - * is located at offset 0x40 instead of 0x60 for some reason. - */ - ch->ioctrl = cmt->mapbase + 0x40; - break; case SH_CMT0_RCAR_GEN2: case SH_CMT1_RCAR_GEN2: ch->iostart = cmt->mapbase + ch->hwidx * 0x100; @@ -952,8 +934,6 @@ static const struct platform_device_id sh_cmt_id_table[] = { MODULE_DEVICE_TABLE(platform, sh_cmt_id_table); static const struct of_device_id sh_cmt_of_table[] __maybe_unused = { - { .compatible = "renesas,cmt-32", .data = &sh_cmt_info[SH_CMT_32BIT] }, - { .compatible = "renesas,cmt-32-fast", .data = &sh_cmt_info[SH_CMT_32BIT_FAST] }, { .compatible = "renesas,cmt-48", .data = &sh_cmt_info[SH_CMT_48BIT] }, { .compatible = "renesas,cmt-48-gen2", .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] }, { .compatible = "renesas,rcar-gen2-cmt0", .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] }, From 8d50e9476bb4aea53fca12637e71d950deafdf37 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Sep 2017 15:46:45 +0200 Subject: [PATCH 0474/1085] clocksource/drivers/sh_cmt: Mark "renesas,cmt-48-gen2" deprecated Document in the driver that "renesas,cmt-48-gen2" is deprecated, but still supported for backward compatibility with old DTBs, cfr. commit 4e18111ff38f0664 ("devicetree: bindings: Remove deprecated properties"). 
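Keeping the deprecated compatible in the match table means old DTBs keep booting. A driver could additionally nudge users toward the new strings; the sh_cmt patches deliberately do not, so the following is a purely hypothetical illustration of how such a warning might look:

#include <linux/device.h>
#include <linux/of.h>

static void warn_if_deprecated_compatible(struct device *dev)
{
	/* of_device_is_compatible() returns a positive value on a match. */
	if (of_device_is_compatible(dev->of_node, "renesas,cmt-48-gen2"))
		dev_warn(dev,
			 "\"renesas,cmt-48-gen2\" is deprecated, please update the DTB\n");
}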
Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 8546736e3bc8..61a922509706 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -935,7 +935,11 @@ MODULE_DEVICE_TABLE(platform, sh_cmt_id_table); static const struct of_device_id sh_cmt_of_table[] __maybe_unused = { { .compatible = "renesas,cmt-48", .data = &sh_cmt_info[SH_CMT_48BIT] }, - { .compatible = "renesas,cmt-48-gen2", .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] }, + { + /* deprecated, preserved for backward compatibility */ + .compatible = "renesas,cmt-48-gen2", + .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] + }, { .compatible = "renesas,rcar-gen2-cmt0", .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] }, { .compatible = "renesas,rcar-gen2-cmt1", .data = &sh_cmt_info[SH_CMT1_RCAR_GEN2] }, { } From d1d285972e24b63eeee8118359dcd4c451b295c5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Sep 2017 15:46:46 +0200 Subject: [PATCH 0475/1085] clocksource/drivers/sh_cmt: Remove unused "renesas, channels-mask" handling The in-driver channel configuration in sh_cmt_info.channels_mask is now always set for all CMT devices instantiated from DT. Hence the "renesas,channels-mask" property is no longer checked, and its handling can be removed, cfr. commit 4e18111ff38f0664 ("devicetree: bindings: Remove deprecated properties"). Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 61a922509706..89c514cf59a4 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -946,14 +946,6 @@ static const struct of_device_id sh_cmt_of_table[] __maybe_unused = { }; MODULE_DEVICE_TABLE(of, sh_cmt_of_table); -static int sh_cmt_parse_dt(struct sh_cmt_device *cmt) -{ - struct device_node *np = cmt->pdev->dev.of_node; - - return of_property_read_u32(np, "renesas,channels-mask", - &cmt->hw_channels); -} - static int sh_cmt_setup(struct sh_cmt_device *cmt, struct platform_device *pdev) { unsigned int mask; @@ -968,15 +960,7 @@ static int sh_cmt_setup(struct sh_cmt_device *cmt, struct platform_device *pdev) id = of_match_node(sh_cmt_of_table, pdev->dev.of_node); cmt->info = id->data; - - /* prefer in-driver channel configuration over DT */ - if (cmt->info->channels_mask) { - cmt->hw_channels = cmt->info->channels_mask; - } else { - ret = sh_cmt_parse_dt(cmt); - if (ret < 0) - return ret; - } + cmt->hw_channels = cmt->info->channels_mask; } else if (pdev->dev.platform_data) { struct sh_timer_config *cfg = pdev->dev.platform_data; const struct platform_device_id *id = pdev->id_entry; From 2d1d5172bf843fb44fcc7d3ff61501e9a6601e74 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Sep 2017 15:46:47 +0200 Subject: [PATCH 0476/1085] clocksource/drivers/sh_cmt: Use of_device_get_match_data() helper Use the existing of_device_get_match_data() helper instead of open-coding its functionality. 
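The helper collapses the two-step of_match_node()/id->data lookup into a single call. A minimal sketch of the before and after shapes inside a probe routine follows; example_probe, example_of_table and the info type are illustrative names, not the driver's:

#include <linux/of_device.h>
#include <linux/platform_device.h>

struct timer_model_info;	/* opaque per-model data, as sketched above */

static int example_probe(struct platform_device *pdev)
{
	const struct timer_model_info *info;

	/*
	 * Open-coded form this patch removes:
	 *   id = of_match_node(example_of_table, pdev->dev.of_node);
	 *   info = id->data;
	 *
	 * Helper form; returns NULL when nothing matched.
	 */
	info = of_device_get_match_data(&pdev->dev);
	if (!info)
		return -ENODEV;

	return 0;
}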
Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 89c514cf59a4..70b3cf8e23d0 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -25,6 +25,7 @@ #include <linux/irq.h> #include <linux/module.h> #include <linux/of.h> +#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> @@ -956,10 +957,7 @@ static int sh_cmt_setup(struct sh_cmt_device *cmt, struct platform_device *pdev) raw_spin_lock_init(&cmt->lock); if (IS_ENABLED(CONFIG_OF) && pdev->dev.of_node) { - const struct of_device_id *id; - - id = of_match_node(sh_cmt_of_table, pdev->dev.of_node); - cmt->info = id->data; + cmt->info = of_device_get_match_data(&pdev->dev); cmt->hw_channels = cmt->info->channels_mask; } else if (pdev->dev.platform_data) { struct sh_timer_config *cfg = pdev->dev.platform_data; From 1893428bd8d14e88facdf745479be0130753f6e9 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 25 Sep 2017 13:46:39 +0530 Subject: [PATCH 0477/1085] clocksource/drivers/fttmr010: pr_err() strings should end with newlines pr_err() messages should end with a new-line to avoid other messages being concatenated.
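The rationale reduces to printk semantics: a message without a terminating newline is treated as potentially continued, so unrelated output printed next may end up glued onto the same log line. A trivial sketch:

#include <linux/printk.h>

static void logging_example(void)
{
	pr_err("Can't remap registers");	/* unterminated: risks concatenation */
	pr_err("Can't remap registers\n");	/* terminated: always its own line */
}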
Signed-off-by: Arvind Yadav Reviewed-by: Andreas Färber Signed-off-by: Daniel Lezcano --- drivers/clocksource/owl-timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/owl-timer.c b/drivers/clocksource/owl-timer.c index d19c53c11094..c68630565079 100644 --- a/drivers/clocksource/owl-timer.c +++ b/drivers/clocksource/owl-timer.c @@ -125,7 +125,7 @@ static int __init owl_timer_init(struct device_node *node) owl_timer_base = of_io_request_and_map(node, 0, "owl-timer"); if (IS_ERR(owl_timer_base)) { - pr_err("Can't map timer registers"); + pr_err("Can't map timer registers\n"); return PTR_ERR(owl_timer_base); } @@ -134,7 +134,7 @@ static int __init owl_timer_init(struct device_node *node) timer1_irq = of_irq_get_byname(node, "timer1"); if (timer1_irq <= 0) { - pr_err("Can't parse timer1 IRQ"); + pr_err("Can't parse timer1 IRQ\n"); return -EINVAL; } From 2554828b17ad16892cdd1becb43230782f0c0cfd Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 25 Sep 2017 13:46:41 +0530 Subject: [PATCH 0479/1085] clocksource/drivers/rockchip: pr_err() strings should end with newlines pr_err() messages should end with a new-line to avoid other messages being concatenated. Signed-off-by: Arvind Yadav Signed-off-by: Daniel Lezcano --- drivers/clocksource/rockchip_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/rockchip_timer.c b/drivers/clocksource/rockchip_timer.c index c27f4c850d83..33f370dbd0d6 100644 --- a/drivers/clocksource/rockchip_timer.c +++ b/drivers/clocksource/rockchip_timer.c @@ -274,7 +274,7 @@ static int __init rk_clksrc_init(struct device_node *np) TIMER_NAME, rk_clksrc->freq, 250, 32, clocksource_mmio_readl_down); if (ret) { - pr_err("Failed to register clocksource"); + pr_err("Failed to register clocksource\n"); goto out_clocksource; } From 2f8a26c166eba01382765bc2fa575387e41079ac Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 17:04:56 +0200 Subject: [PATCH 0480/1085] clocksource: Improve GENERIC_CLOCKEVENTS dependency We regularly run into build errors when a clocksource driver selects CONFIG_TIMER_OF while CONFIG_GENERIC_CLOCKEVENTS is disabled: In file included from drivers/clocksource/timer-of.c:25:0: drivers/clocksource/timer-of.h:35:28: error: field 'clkevt' has incomplete type At the moment, three drivers can show this behavior: ARMV7M_SYSTICK, CLKSRC_ST_LPC and CLKSRC_NPS. We could add further dependencies as we did many times, but I have looked a little bit more at what architectures are left that don't use GENERIC_CLOCKEVENTS, and this shows that there is a better solution. On arch/frv and arch/ia64, we never select CONFIG_GENERIC_CLOCKEVENTS and we also don't use ARCH_USES_GETTIMEOFFSET, which would block the clocksource Kconfig menu. On m68k, some platforms use CONFIG_GENERIC_CLOCKEVENTS, some use ARCH_USES_GETTIMEOFFSET, and some use neither of them. The good news is that there is no configuration that does not set CONFIG_GENERIC_CLOCKEVENTS but that wants to enable any of the Kconfig symbols in the menu, so we can simply replace the dependency with the stricter one. While in theory one could have a clocksource driver without the clockevent infrastructure, this seems unlikely to be relevant in the future any more. We can probably drop some of the other dependencies as well now, e.g. there should generally be no reason to depend on CONFIG_ARM unless the driver uses architecture specific assembly. 
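The quoted build error is plain C at work: with CONFIG_GENERIC_CLOCKEVENTS disabled, struct clock_event_device is only ever forward-declared, and a by-value member of an incomplete type cannot be compiled, while a pointer to one can. A reduced sketch (timer_of_like is an illustrative stand-in for struct timer_of):

struct clock_event_device;			/* forward declaration only */

struct timer_of_like {
	struct clock_event_device *ok;		/* fine: size not required */
	/* struct clock_event_device clkevt;	   error: field has incomplete type */
};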
Reported-by: Randy Dunlap Signed-off-by: Arnd Bergmann Reviewed-by: Linus Walleij Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 50 +++++++------------------------------ 1 file changed, 9 insertions(+), 41 deletions(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index cc6062049170..c729a88007d0 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -1,9 +1,8 @@ menu "Clock Source drivers" - depends on !ARCH_USES_GETTIMEOFFSET + depends on GENERIC_CLOCKEVENTS config TIMER_OF bool - depends on GENERIC_CLOCKEVENTS select TIMER_PROBE config TIMER_ACPI @@ -30,21 +29,18 @@ config CLKSRC_MMIO config BCM2835_TIMER bool "BCM2835 timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables the support for the BCM2835 timer driver. config BCM_KONA_TIMER bool "BCM mobile timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables the support for the BCM Kona mobile timer driver. config DIGICOLOR_TIMER bool "Digicolor timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO depends on HAS_IOMEM help @@ -52,7 +48,6 @@ config DIGICOLOR_TIMER config DW_APB_TIMER bool "DW APB timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS help Enables the support for the dw_apb timer. @@ -63,7 +58,6 @@ config DW_APB_TIMER_OF config FTTMR010_TIMER bool "Faraday Technology timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM select CLKSRC_MMIO select TIMER_OF @@ -90,7 +84,6 @@ config ARMADA_370_XP_TIMER config MESON6_TIMER bool "Meson6 timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables the support for the Meson6 timer driver. @@ -105,14 +98,12 @@ config ORION_TIMER config OWL_TIMER bool "Owl timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables the support for the Actions Semi Owl timer driver. config SUN4I_TIMER bool "Sun4i timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM select CLKSRC_MMIO select TIMER_OF @@ -135,7 +126,6 @@ config TEGRA_TIMER config VT8500_TIMER bool "VT8500 timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM help Enables support for the VT8500 driver. @@ -148,7 +138,6 @@ config CADENCE_TTC_TIMER config ASM9260_TIMER bool "ASM9260 timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO select TIMER_OF help @@ -171,28 +160,24 @@ config CLKSRC_NOMADIK_MTU_SCHED_CLOCK config CLKSRC_DBX500_PRCMU bool "Clocksource PRCMU Timer" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM help Use the always on PRCMU Timer as clocksource config CLPS711X_TIMER bool "Cirrus logic timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables support for the Cirrus Logic PS711 timer. config ATLAS7_TIMER bool "Atlas7 timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables support for the Atlas7 timer. config MXS_TIMER bool "Mxs timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO select STMP_DEVICE help @@ -200,14 +185,12 @@ config MXS_TIMER config PRIMA2_TIMER bool "Prima2 timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables support for the Prima2 timer. 
config U300_TIMER bool "U300 timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on ARM select CLKSRC_MMIO help @@ -215,14 +198,12 @@ config U300_TIMER config NSPIRE_TIMER bool "NSpire timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables support for the Nspire timer. config KEYSTONE_TIMER bool "Keystone timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on ARM || ARM64 select CLKSRC_MMIO help @@ -230,7 +211,6 @@ config KEYSTONE_TIMER config INTEGRATOR_AP_TIMER bool "Integrator-ap timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables support for the Integrator-ap timer. @@ -253,7 +233,7 @@ config CLKSRC_EFM32 config CLKSRC_LPC32XX bool "Clocksource for LPC32XX" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on HAS_IOMEM depends on ARM select CLKSRC_MMIO select TIMER_OF @@ -262,7 +242,7 @@ config CLKSRC_LPC32XX config CLKSRC_PISTACHIO bool "Clocksource for Pistachio SoC" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on HAS_IOMEM select TIMER_OF help Enables the clocksource for the Pistachio SoC. @@ -298,7 +278,6 @@ config CLKSRC_MPS2 config ARC_TIMERS bool "Support for 32-bit TIMERn counters in ARC Cores" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select TIMER_OF help These are legacy 32-bit TIMER0 and TIMER1 counters found on all ARC cores @@ -307,7 +286,6 @@ config ARC_TIMERS config ARC_TIMERS_64BIT bool "Support for 64-bit counters in ARC HS38 cores" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on ARC_TIMERS select TIMER_OF help @@ -407,7 +385,6 @@ config ATMEL_PIT config ATMEL_ST bool "Atmel ST timer support" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select TIMER_OF select MFD_SYSCON help @@ -426,7 +403,6 @@ config CLKSRC_EXYNOS_MCT config CLKSRC_SAMSUNG_PWM bool "PWM timer driver for Samsung S3C, S5P" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM help This is a new clocksource driver for the PWM timer found in @@ -436,7 +412,6 @@ config CLKSRC_SAMSUNG_PWM config FSL_FTM_TIMER bool "Freescale FlexTimer Module driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM select CLKSRC_MMIO help @@ -450,7 +425,6 @@ config VF_PIT_TIMER config OXNAS_RPS_TIMER bool "Oxford Semiconductor OXNAS RPS Timers driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select TIMER_OF select CLKSRC_MMIO help @@ -461,7 +435,7 @@ config SYS_SUPPORTS_SH_CMT config MTK_TIMER bool "Mediatek timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on HAS_IOMEM select TIMER_OF select CLKSRC_MMIO help @@ -479,7 +453,6 @@ config SYS_SUPPORTS_EM_STI config CLKSRC_JCORE_PIT bool "J-Core PIT timer driver" if COMPILE_TEST depends on OF - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM select CLKSRC_MMIO help @@ -488,7 +461,6 @@ config CLKSRC_JCORE_PIT config SH_TIMER_CMT bool "Renesas CMT timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM default SYS_SUPPORTS_SH_CMT help @@ -498,7 +470,6 @@ config SH_TIMER_CMT config SH_TIMER_MTU2 bool "Renesas MTU2 timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM default SYS_SUPPORTS_SH_MTU2 help @@ -508,14 +479,12 @@ config SH_TIMER_MTU2 config RENESAS_OSTM bool "Renesas OSTM timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO help Enables the support for the Renesas OSTM. 
config SH_TIMER_TMU bool "Renesas TMU timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM default SYS_SUPPORTS_SH_TMU help @@ -525,7 +494,7 @@ config SH_TIMER_TMU config EM_TIMER_STI bool "Renesas STI timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on HAS_IOMEM default SYS_SUPPORTS_EM_STI help This enables build of a clocksource and clockevent driver for @@ -566,7 +535,6 @@ config CLKSRC_TANGO_XTAL config CLKSRC_PXA bool "Clocksource for PXA or SA-11x0 platform" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM select CLKSRC_MMIO help @@ -575,20 +543,20 @@ config CLKSRC_PXA config H8300_TMR8 bool "Clockevent timer for the H8300 platform" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on HAS_IOMEM help This enables the 8 bits timer for the H8300 platform. config H8300_TMR16 bool "Clockevent timer for the H83069 platform" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on HAS_IOMEM help This enables the 16 bits timer for the H8300 platform with the H83069 cpu. config H8300_TPU bool "Clocksource for the H8300 platform" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on HAS_IOMEM help This enables the clocksource for the H8300 platform with the H8S2678 cpu. @@ -600,7 +568,7 @@ config CLKSRC_IMX_GPT config CLKSRC_IMX_TPM bool "Clocksource using i.MX TPM" if COMPILE_TEST - depends on ARM && CLKDEV_LOOKUP && GENERIC_CLOCKEVENTS + depends on ARM && CLKDEV_LOOKUP select CLKSRC_MMIO help Enable this option to use IMX Timer/PWM Module (TPM) timer as From 2d764649e4b18409a65befb2bdf1483dd56b992b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 19 Sep 2017 07:54:40 +0200 Subject: [PATCH 0481/1085] MAINTAINERS: Fix path and add bindings to timers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As spotted by Andreas Färber, the clocksource directory path does not follow the rule where a maintained directory must end with a '/' character. Also, the timers devicetree bindings documentation is not mentioned in the entry, so every submission touching the devicetree documentation misses to Cc the maintainers of the timers. Reported-by: Andreas Färber Reviewed-by: Andreas Färber Signed-off-by: Daniel Lezcano --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 65b0c88d5ee0..67be8f8e5fa3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3444,7 +3444,8 @@ M: Thomas Gleixner L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Supported -F: drivers/clocksource +F: drivers/clocksource/ +F: Documentation/devicetree/bindings/timer/ CMPC ACPI DRIVER M: Thadeu Lima de Souza Cascardo From 21492e1333a0d07af6968667f128e19088cf5ead Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 16 Oct 2017 16:28:38 +0100 Subject: [PATCH 0482/1085] clocksource/drivers/arm_arch_timer: Validate CNTFRQ after enabling frame The ACPI GTDT code validates the CNTFRQ field of each MMIO timer frame against the CNTFRQ system register of the current CPU, to ensure that they are equal, which is mandated by the architecture. 
However, reading the CNTFRQ field of a frame is not possible until the RFRQ bit in the frame's CNTACRn register is set, and doing so before that will produce the following error: arch_timer: [Firmware Bug]: CNTFRQ mismatch: frame @ 0x00000000e0be0000: (0x00000000), CPU: (0x0ee6b280) arch_timer: Disabling MMIO timers due to CNTFRQ mismatch arch_timer: Failed to initialize memory-mapped timer. The reason is that the CNTFRQ field is RES0 if access is not enabled. So move the validation of CNTFRQ into the loop that iterates over the timers to find the best frame, but defer it until after we have selected the best frame, which should also have enabled the RFRQ bit. Signed-off-by: Ard Biesheuvel Signed-off-by: Mark Rutland Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_arch_timer.c | 38 +++++++++++++++------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index fd4b7f684bd0..14e2419063e9 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1268,10 +1268,6 @@ arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem) iounmap(cntctlbase); - if (!best_frame) - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer_mem->cntctlbase); - return best_frame; } @@ -1372,6 +1368,8 @@ static int __init arch_timer_mem_of_init(struct device_node *np) frame = arch_timer_mem_find_best_frame(timer_mem); if (!frame) { + pr_err("Unable to find a suitable frame in timer @ %pa\n", + &timer_mem->cntctlbase); ret = -EINVAL; goto out; } @@ -1420,7 +1418,7 @@ arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem) static int __init arch_timer_mem_acpi_init(int platform_timer_count) { struct arch_timer_mem *timers, *timer; - struct arch_timer_mem_frame *frame; + struct arch_timer_mem_frame *frame, *best_frame = NULL; int timer_count, i, ret = 0; timers = kcalloc(platform_timer_count, sizeof(*timers), @@ -1432,14 +1430,6 @@ static int __init arch_timer_mem_acpi_init(int platform_timer_count) if (ret || !timer_count) goto out; - for (i = 0; i < timer_count; i++) { - ret = arch_timer_mem_verify_cntfrq(&timers[i]); - if (ret) { - pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n"); - goto out; - } - } - /* * While unlikely, it's theoretically possible that none of the frames * in a timer expose the combination of feature we want. */ @@ -1448,12 +1438,26 @@ timer = &timers[i]; frame = arch_timer_mem_find_best_frame(timer); - if (frame) - break; + if (!best_frame) + best_frame = frame; + + ret = arch_timer_mem_verify_cntfrq(timer); + if (ret) { + pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n"); + goto out; + } + + if (!best_frame) /* implies !frame */ + /* + * Only complain about missing suitable frames if we + * haven't already found one in a previous iteration.
+ */ + pr_err("Unable to find a suitable frame in timer @ %pa\n", + &timer->cntctlbase); } - if (frame) - ret = arch_timer_mem_frame_register(frame); + if (best_frame) + ret = arch_timer_mem_frame_register(best_frame); out: kfree(timers); return ret; From a7fb4577bbe307dd3dd971c7ea8f35a68fc031ca Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 16 Oct 2017 16:28:39 +0100 Subject: [PATCH 0483/1085] clocksource/drivers/arm_arch_timer: Fix DEFINE_PER_CPU expansion Our ctags mangling script can't handle newlines inside of a DEFINE_PER_CPU(), leading to an annoying message whenever tags are built: ctags: Warning: drivers/clocksource/arm_arch_timer.c:302: null expansion of name pattern "\1" This was dealt with elsewhere in commit: 25528213fe9f75f4 ("tags: Fix DEFINE_PER_CPU expansions") ... by ensuring each DEFINE_PER_CPU() was contained on a single line, even where this would violate the usual code style (checkpatch warnings and all). Let's do the same for the arch timer driver, and get rid of the distraction. Signed-off-by: Mark Rutland Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_arch_timer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 14e2419063e9..0ecf5beb56ec 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -299,8 +299,7 @@ static u64 notrace arm64_858921_read_cntvct_el0(void) #endif #ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND -DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *, - timer_unstable_counter_workaround); +DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround); EXPORT_SYMBOL_GPL(timer_unstable_counter_workaround); DEFINE_STATIC_KEY_FALSE(arch_timer_read_ool_enabled); From 7957b07b559175500b2a03e8a39738c1b4a832fe Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Thu, 19 Oct 2017 12:55:34 +0100 Subject: [PATCH 0484/1085] clocksource/drivers/mips-gic-timer: Remove pointless irq_save,restore The function gic_next_event is always called with interrupts disabled, so the local_irq_save / local_irq_restore are pointless - remove them. [Daniel Lezcano: Fixed warning by removing unused variable 'flags'] Signed-off-by: Matt Redfearn Suggested-by: Daniel Lezcano Reported-by: Thomas Gleixner Signed-off-by: Daniel Lezcano --- drivers/clocksource/mips-gic-timer.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index ae3167c28b12..775dea04460d 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -39,16 +39,13 @@ static u64 notrace gic_read_count(void) static int gic_next_event(unsigned long delta, struct clock_event_device *evt) { - unsigned long flags; u64 cnt; int res; cnt = gic_read_count(); cnt += (u64)delta; - local_irq_save(flags); write_gic_vl_other(mips_cm_vp_id(cpumask_first(evt->cpumask))); write_gic_vo_compare(cnt); - local_irq_restore(flags); res = ((int)(gic_read_count() - cnt) >= 0) ? -ETIME : 0; return res; } From f16ff2bdb135e2eb35488264006b575c476ea597 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Thu, 19 Oct 2017 12:55:35 +0100 Subject: [PATCH 0485/1085] clocksource/drivers/mips-gic-timer: Add fastpath for local timer updates Always accessing the compare register via the CM redirect region is (relatively) slow. 
If the timer being updated belongs to the current CPU, the access can be shortcut by writing to the CM VP local region. Signed-off-by: Matt Redfearn Signed-off-by: Daniel Lezcano --- drivers/clocksource/mips-gic-timer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index 775dea04460d..a04808a21d4e 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -39,13 +39,18 @@ static u64 notrace gic_read_count(void) static int gic_next_event(unsigned long delta, struct clock_event_device *evt) { + int cpu = cpumask_first(evt->cpumask); u64 cnt; int res; cnt = gic_read_count(); cnt += (u64)delta; - write_gic_vl_other(mips_cm_vp_id(cpumask_first(evt->cpumask))); - write_gic_vo_compare(cnt); + if (cpu == raw_smp_processor_id()) { + write_gic_vl_compare(cnt); + } else { + write_gic_vl_other(mips_cm_vp_id(cpu)); + write_gic_vo_compare(cnt); + } res = ((int)(gic_read_count() - cnt) >= 0) ? -ETIME : 0; return res; } From db1a8922cf3f0b936595ba41774fe4b66adf091a Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:05 -0400 Subject: [PATCH 0486/1085] capabilities: factor out cap_bprm_set_creds privileged root Factor out the case of privileged root from the function cap_bprm_set_creds() to make the latter easier to read and analyse. Suggested-by: Serge Hallyn Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Okay-ished-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 76 ++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index c25e0d27537f..be9bca50c312 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -695,6 +695,52 @@ out: return rc; } +/* + * handle_privileged_root - Handle case of privileged root + * @bprm: The execution parameters, including the proposed creds + * @has_fcap: Are any file capabilities set? + * @effective: Do we have effective root privilege? + * @root_uid: This namespace' root UID WRT initial USER namespace + * + * Handle the case where root is privileged and hasn't been neutered by + * SECURE_NOROOT. If file capabilities are set, they won't be combined with + * set UID root and nothing is changed. If we are root, cap_permitted is + * updated. If we have become set UID root, the effective bit is set. + */ +static void handle_privileged_root(struct linux_binprm *bprm, bool has_cap, + bool *effective, kuid_t root_uid) +{ + const struct cred *old = current_cred(); + struct cred *new = bprm->cred; + + if (issecure(SECURE_NOROOT)) + return; + /* + * If the legacy file capability is set, then don't set privs + * for a setuid root binary run by a non-root user. Do set it + * for a root user just to cause least surprise to an admin. + */ + if (has_cap && !uid_eq(new->uid, root_uid) && uid_eq(new->euid, root_uid)) { + warn_setuid_and_fcaps_mixed(bprm->filename); + return; + } + /* + * To support inheritance of root-permissions and suid-root + * executables under compatibility mode, we override the + * capability sets for the file. + */ + if (uid_eq(new->euid, root_uid) || uid_eq(new->uid, root_uid)) { + /* pP' = (cap_bset & ~0) | (pI & ~0) */ + new->cap_permitted = cap_combine(old->cap_bset, + old->cap_inheritable); + } + /* + * If only the real uid is 0, we do not set the effective bit.
+ */ + if (uid_eq(new->euid, root_uid)) + *effective = true; +} + /** * cap_bprm_set_creds - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds @@ -707,46 +753,20 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; - bool effective, has_cap = false, is_setid; + bool effective = false, has_cap = false, is_setid; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; - effective = false; ret = get_file_caps(bprm, &effective, &has_cap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); - if (!issecure(SECURE_NOROOT)) { - /* - * If the legacy file capability is set, then don't set privs - * for a setuid root binary run by a non-root user. Do set it - * for a root user just to cause least surprise to an admin. - */ - if (has_cap && !uid_eq(new->uid, root_uid) && uid_eq(new->euid, root_uid)) { - warn_setuid_and_fcaps_mixed(bprm->filename); - goto skip; - } - /* - * To support inheritance of root-permissions and suid-root - * executables under compatibility mode, we override the - * capability sets for the file. - * - * If only the real uid is 0, we do not set the effective bit. - */ - if (uid_eq(new->euid, root_uid) || uid_eq(new->uid, root_uid)) { - /* pP' = (cap_bset & ~0) | (pI & ~0) */ - new->cap_permitted = cap_combine(old->cap_bset, - old->cap_inheritable); - } - if (uid_eq(new->euid, root_uid)) - effective = true; - } -skip: + handle_privileged_root(bprm, has_cap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (!cap_issubset(new->cap_permitted, old->cap_permitted)) From 4c7e715fc87b6f8b652363b3515b48b3822c5b5f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:06 -0400 Subject: [PATCH 0487/1085] capabilities: intuitive names for cap gain status Introduce macros cap_gained, cap_grew, cap_full to make the use of the negation of is_subset() easier to read and analyse. Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Okay-ished-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index be9bca50c312..4c9af6ef24b6 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -741,6 +741,12 @@ static void handle_privileged_root(struct linux_binprm *bprm, bool has_cap, *effective = true; } +#define __cap_gained(field, target, source) \ + !cap_issubset(target->cap_##field, source->cap_##field) +#define __cap_grew(target, source, cred) \ + !cap_issubset(cred->cap_##target, cred->cap_##source) +#define __cap_full(field, cred) \ + cap_issubset(CAP_FULL_SET, cred->cap_##field) /** * cap_bprm_set_creds - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds @@ -769,10 +775,9 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) handle_privileged_root(bprm, has_cap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ - if (!cap_issubset(new->cap_permitted, old->cap_permitted)) + if (__cap_gained(permitted, new, old)) bprm->per_clear |= PER_CLEAR_ON_SETID; - /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. 
* @@ -780,8 +785,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) */ is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid); - if ((is_setid || - !cap_issubset(new->cap_permitted, old->cap_permitted)) && + if ((is_setid || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ @@ -831,8 +835,8 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. */ - if (!cap_issubset(new->cap_effective, new->cap_ambient)) { - if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || + if (__cap_grew(effective, ambient, new)) { + if (!__cap_full(effective, new) || !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || issecure(SECURE_NOROOT)) { ret = audit_log_bprm_fcaps(bprm, new, old); @@ -852,7 +856,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) bprm->cap_elevated = 1; } else if (!uid_eq(new->uid, root_uid)) { if (effective || - !cap_issubset(new->cap_permitted, new->cap_ambient)) + __cap_grew(permitted, ambient, new)) bprm->cap_elevated = 1; } From fc7eadf768a3e2c062e52eea89b52a0076d53b0c Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:07 -0400 Subject: [PATCH 0488/1085] capabilities: rename has_cap to has_fcap Rename has_cap to has_fcap to clarify it applies to file capabilities since the entire source file is about capabilities. Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Okay-ished-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 4c9af6ef24b6..13661d34f842 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -536,7 +536,7 @@ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, struct linux_binprm *bprm, bool *effective, - bool *has_cap) + bool *has_fcap) { struct cred *new = bprm->cred; unsigned i; @@ -546,7 +546,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, *effective = true; if (caps->magic_etc & VFS_CAP_REVISION_MASK) - *has_cap = true; + *has_fcap = true; CAP_FOR_EACH_U32(i) { __u32 permitted = caps->permitted.cap[i]; @@ -652,7 +652,7 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ -static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_cap) +static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; @@ -683,7 +683,7 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c goto out; } - rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_cap); + rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); if (rc == -EINVAL) printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n", __func__, rc, bprm->filename); @@ -707,7 +707,7 @@ out: * set UID root and nothing is changed. If we are root, cap_permitted is * updated. If we have become set UID root, the effective bit is set. 
*/ -static void handle_privileged_root(struct linux_binprm *bprm, bool has_cap, +static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, bool *effective, kuid_t root_uid) { const struct cred *old = current_cred(); @@ -720,7 +720,7 @@ static void handle_privileged_root(struct linux_binprm *bprm, bool has_cap, * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ - if (has_cap && !uid_eq(new->uid, root_uid) && uid_eq(new->euid, root_uid)) { + if (has_fcap && !uid_eq(new->uid, root_uid) && uid_eq(new->euid, root_uid)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } @@ -759,20 +759,20 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; - bool effective = false, has_cap = false, is_setid; + bool effective = false, has_fcap = false, is_setid; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; - ret = get_file_caps(bprm, &effective, &has_cap); + ret = get_file_caps(bprm, &effective, &has_fcap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); - handle_privileged_root(bprm, has_cap, &effective, root_uid); + handle_privileged_root(bprm, has_fcap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) @@ -802,7 +802,7 @@ new->sgid = new->fsgid = new->egid; /* File caps or setid cancels ambient. */ - if (has_cap || is_setid) + if (has_fcap || is_setid) cap_clear(new->cap_ambient); /* From 9304b46c912d65a103a68f093b456ba3c02dca3b Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:08 -0400 Subject: [PATCH 0489/1085] capabilities: use root_priveleged inline to clarify logic Introduce inline root_privileged() to make use of SECURE_NOROOT easier to read. Suggested-by: Serge Hallyn Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Okay-ished-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 13661d34f842..9b8a6e79d858 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -695,6 +695,8 @@ out: return rc; } +static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } + /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds @@ -713,7 +715,7 @@ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, const struct cred *old = current_cred(); struct cred *new = bprm->cred; - if (issecure(SECURE_NOROOT)) + if (!root_privileged()) return; /* * If the legacy file capability is set, then don't set privs @@ -838,7 +840,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) if (__cap_grew(effective, ambient, new)) { if (!__cap_full(effective, new) || !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || - issecure(SECURE_NOROOT)) { + !root_privileged()) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; From 81a6a012996b3fd47608d87b16e79412dd73578e Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:09 -0400 Subject: [PATCH 0490/1085] capabilities: use intuitive names for id changes Introduce a number of inlines to make the use of the negation of uid_eq() easier to read and analyse.
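The payoff shows up at the call sites, as the hunks below illustrate; schematically, with both versions lifted from the diff that follows:

/* before: the uid algebra must be decoded at every call site */
if (has_fcap && !uid_eq(new->uid, root_uid) && uid_eq(new->euid, root_uid))
	warn_setuid_and_fcaps_mixed(bprm->filename);

/* after: the same check reads as a statement of intent */
if (has_fcap && __is_suid(root_uid, new))
	warn_setuid_and_fcaps_mixed(bprm->filename);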
Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Okay-ished-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 9b8a6e79d858..421f7438d3c8 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -697,6 +697,15 @@ out: static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } +static inline bool __is_real(kuid_t uid, struct cred *cred) +{ return uid_eq(cred->uid, uid); } + +static inline bool __is_eff(kuid_t uid, struct cred *cred) +{ return uid_eq(cred->euid, uid); } + +static inline bool __is_suid(kuid_t uid, struct cred *cred) +{ return !__is_real(uid, cred) && __is_eff(uid, cred); } + /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds @@ -722,7 +731,7 @@ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ - if (has_fcap && !uid_eq(new->uid, root_uid) && uid_eq(new->euid, root_uid)) { + if (has_fcap && __is_suid(root_uid, new)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } @@ -731,7 +740,7 @@ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, * executables under compatibility mode, we override the * capability sets for the file. */ - if (uid_eq(new->euid, root_uid) || uid_eq(new->uid, root_uid)) { + if (__is_eff(root_uid, new) || __is_real(root_uid, new)) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ new->cap_permitted = cap_combine(old->cap_bset, old->cap_inheritable); @@ -739,7 +748,7 @@ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, /* * If only the real uid is 0, we do not set the effective bit. */ - if (uid_eq(new->euid, root_uid)) + if (__is_eff(root_uid, new)) *effective = true; } @@ -749,6 +758,13 @@ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, !cap_issubset(cred->cap_##target, cred->cap_##source) #define __cap_full(field, cred) \ cap_issubset(CAP_FULL_SET, cred->cap_##field) + +static inline bool __is_setuid(struct cred *new, const struct cred *old) +{ return !uid_eq(new->euid, old->uid); } + +static inline bool __is_setgid(struct cred *new, const struct cred *old) +{ return !gid_eq(new->egid, old->gid); } + /** * cap_bprm_set_creds - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds @@ -785,7 +801,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. 
*/ - is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid); + is_setid = __is_setuid(new, old) || __is_setgid(new, old); if ((is_setid || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || @@ -839,7 +855,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) */ if (__cap_grew(effective, ambient, new)) { if (!__cap_full(effective, new) || - !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || + !__is_eff(root_uid, new) || !__is_real(root_uid, new) || !root_privileged()) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) @@ -856,7 +872,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) bprm->cap_elevated = 0; if (is_setid) { bprm->cap_elevated = 1; - } else if (!uid_eq(new->uid, root_uid)) { + } else if (!__is_real(root_uid, new)) { if (effective || __cap_grew(permitted, ambient, new)) bprm->cap_elevated = 1; From 9fbc2c79644a88a1cc40a2628ccff1bbbbc9ecc5 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:10 -0400 Subject: [PATCH 0491/1085] capabilities: move audit log decision to function Move the audit log decision logic to its own function to isolate the complexity in one place. Suggested-by: Serge Hallyn Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Okay-ished-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 50 ++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 421f7438d3c8..d7f0cbdf04c4 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -765,6 +765,32 @@ static inline bool __is_setuid(struct cred *new, const struct cred *old) static inline bool __is_setgid(struct cred *new, const struct cred *old) { return !gid_eq(new->egid, old->gid); } +/* + * Audit candidate if current->cap_effective is set + * + * We do not bother to audit if 3 things are true: + * 1) cap_effective has all caps + * 2) we are root + * 3) root is supposed to have all caps (SECURE_NOROOT) + * Since this is just a normal root execing a process. + * + * Number 1 above might fail if you don't have a full bset, but I think + * that is interesting information to audit. + */ +static inline bool nonroot_raised_pE(struct cred *cred, kuid_t root) +{ + bool ret = false; + + if (__cap_grew(effective, ambient, cred)) { + if (!__cap_full(effective, cred) || + !__is_eff(root, cred) || !__is_real(root, cred) || + !root_privileged()) { + ret = true; + } + } + return ret; +} + /** * cap_bprm_set_creds - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds @@ -841,26 +867,10 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; - /* - * Audit candidate if current->cap_effective is set - * - * We do not bother to audit if 3 things are true: - * 1) cap_effective has all caps - * 2) we are root - * 3) root is supposed to have all caps (SECURE_NOROOT) - * Since this is just a normal root execing a process. - * - * Number 1 above might fail if you don't have a full bset, but I think - * that is interesting information to audit. 
- */ - if (__cap_grew(effective, ambient, new)) { - if (!__cap_full(effective, new) || - !__is_eff(root_uid, new) || !__is_real(root_uid, new) || - !root_privileged()) { - ret = audit_log_bprm_fcaps(bprm, new, old); - if (ret < 0) - return ret; - } + if (nonroot_raised_pE(new, root_uid)) { + ret = audit_log_bprm_fcaps(bprm, new, old); + if (ret < 0) + return ret; } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); From 02ebbaf48cf211498a9bd2c6b65e7d1b0a901807 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:11 -0400 Subject: [PATCH 0492/1085] capabilities: remove a layer of conditional logic Remove a layer of conditional logic to make the use of conditions easier to read and analyse. Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Okay-ished-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index d7f0cbdf04c4..eac70e2b400b 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -781,13 +781,12 @@ static inline bool nonroot_raised_pE(struct cred *cred, kuid_t root) { bool ret = false; - if (__cap_grew(effective, ambient, cred)) { - if (!__cap_full(effective, cred) || - !__is_eff(root, cred) || !__is_real(root, cred) || - !root_privileged()) { - ret = true; - } - } + if (__cap_grew(effective, ambient, cred) && + (!__cap_full(effective, cred) || + !__is_eff(root, cred) || + !__is_real(root, cred) || + !root_privileged())) + ret = true; return ret; } @@ -880,13 +879,11 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) /* Check for privilege-elevated exec. */ bprm->cap_elevated = 0; - if (is_setid) { + if (is_setid || + (!__is_real(root_uid, new) && + (effective || + __cap_grew(permitted, ambient, new)))) bprm->cap_elevated = 1; - } else if (!__is_real(root_uid, new)) { - if (effective || - __cap_grew(permitted, ambient, new)) - bprm->cap_elevated = 1; - } return 0; } From c0d1adefe0a3775cc16374dc9ebdfd8504afa14b Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:12 -0400 Subject: [PATCH 0493/1085] capabilities: invert logic for clarity The way the logic was presented, it was awkward to read and verify. Invert the logic using DeMorgan's Law to be more easily able to read and understand. 
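The transformation is De Morgan's law, !(A && B && C) == !A || !B || !C, applied in reverse so that the benign case reads as one positive conjunction. A schematic sketch with illustrative variable names (grew_pE, full_pE and friends stand in for the real predicates):

/* Before: a chain of negated disjuncts; the benign combination is hard to see. */
if (grew_pE && (!full_pE || !eff_root || !real_root || !root_priv))
	audit = true;

/* After: audit unless ALL of the benign conditions hold together. */
if (grew_pE && !(full_pE && eff_root && real_root && root_priv))
	audit = true;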
Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Okay-ished-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index eac70e2b400b..0bd94d36e635 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -782,10 +782,10 @@ static inline bool nonroot_raised_pE(struct cred *cred, kuid_t root) bool ret = false; if (__cap_grew(effective, ambient, cred) && - (!__cap_full(effective, cred) || - !__is_eff(root, cred) || - !__is_real(root, cred) || - !root_privileged())) + !(__cap_full(effective, cred) && + __is_eff(root, cred) && + __is_real(root, cred) && + root_privileged())) ret = true; return ret; } From 588fb2c7e294753d3090a1dc2e7c34e7e3ce5aff Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:13 -0400 Subject: [PATCH 0494/1085] capabilities: fix logic for effective root or real root Now that the logic is inverted, it is much easier to see that both the real root and effective root conditions had to be met to avoid printing the BPRM_FCAPS record with audit syscalls. This meant that any setuid root application would print a full BPRM_FCAPS record when it wasn't necessary, cluttering the event output, since the SYSCALL and PATH records already indicated the presence of the setuid bit and the effective root user id. Require only one of effective root or real root to avoid printing the unnecessary record. Ref: commit 3fc689e96c0c ("Add audit_log_bprm_fcaps/AUDIT_BPRM_FCAPS") See: https://github.com/linux-audit/audit-kernel/issues/16 Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Acked-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 0bd94d36e635..ad7536d76820 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -770,7 +770,7 @@ static inline bool __is_setgid(struct cred *new, const struct cred *old) * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps - * 2) we are root + * 2) we became root *OR* were already root * 3) root is supposed to have all caps (SECURE_NOROOT) * Since this is just a normal root execing a process. * @@ -783,8 +783,7 @@ static inline bool nonroot_raised_pE(struct cred *cred, kuid_t root) if (__cap_grew(effective, ambient, cred) && !(__cap_full(effective, cred) && - __is_eff(root, cred) && - __is_real(root, cred) && + (__is_eff(root, cred) || __is_real(root, cred)) && root_privileged())) ret = true; return ret; From dbbbe1105ea6aa0c49d78a4ea0d924e0c02307eb Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Oct 2017 20:57:14 -0400 Subject: [PATCH 0495/1085] capabilities: audit log other surprising conditions The existing condition tested for process effective capabilities set by file attributes, but was intended to ignore the change when the result was, unsurprisingly, a full effective set in the case where root is special, the executable file is setuid root, and we are root. Stated again: - When you execute a setuid root application, it is no surprise and expected that it got all capabilities, so we do not want capabilities recorded.
if (pE_grew && !(pE_fullset && (eff_root || real_root) && root_privileged)) Now make sure we cover other cases: - If something prevented a setuid root app from getting all capabilities and it wound up with one capability only, then it is a surprise and should be logged. When it is a setuid root file, we only want capabilities when the process does not get full capabilities. root_privileged && setuid_root && !pE_fullset - Similarly, if a non-setuid program does pick up capabilities due to file system based capabilities, then we want to know what capabilities were picked up. When it has file system based capabilities we want the capabilities. !is_setuid && (has_fcap && pP_gained) - If it is a non-setuid file and it gets ambient capabilities, we want the capabilities. !is_setuid && pA_gained - These last two are combined into one due to the common first parameter. Related: https://github.com/linux-audit/audit-kernel/issues/16 Signed-off-by: Richard Guy Briggs Reviewed-by: Serge Hallyn Acked-by: James Morris Acked-by: Kees Cook Acked-by: Paul Moore Signed-off-by: James Morris --- security/commoncap.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index ad7536d76820..5fa839c7fb3f 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -766,7 +766,7 @@ static inline bool __is_setgid(struct cred *new, const struct cred *old) { return !gid_eq(new->egid, old->gid); } /* - * Audit candidate if current->cap_effective is set + * 1) Audit candidate if current->cap_effective is set * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps @@ -776,16 +776,31 @@ static inline bool __is_setgid(struct cred *new, const struct cred *old) * * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. + * + * A number of other conditions require logging: + * 2) something prevented setuid root getting all caps + * 3) non-setuid root gets fcaps + * 4) non-setuid root gets ambient */ -static inline bool nonroot_raised_pE(struct cred *cred, kuid_t root) +static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, + kuid_t root, bool has_fcap) { bool ret = false; - if (__cap_grew(effective, ambient, cred) && - !(__cap_full(effective, cred) && - (__is_eff(root, cred) || __is_real(root, cred)) && - root_privileged())) + if ((__cap_grew(effective, ambient, new) && + !(__cap_full(effective, new) && + (__is_eff(root, new) || __is_real(root, new)) && + root_privileged())) || + (root_privileged() && + __is_suid(root, new) && + !__cap_full(effective, new)) || + (!__is_setuid(new, old) && + ((has_fcap && + __cap_gained(permitted, new, old)) || + __cap_gained(ambient, new, old)))) + ret = true; + return ret; } @@ -865,7 +880,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; - if (nonroot_raised_pE(new, root_uid)) { + if (nonroot_raised_pE(new, old, root_uid, has_fcap)) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; From a30b85df7d599f626973e9cd3056fe755bd778e0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 20 Oct 2017 08:43:39 +0900 Subject: [PATCH 0496/1085] kprobes: Use synchronize_rcu_tasks() for optprobe with CONFIG_PREEMPT=y We want to wait for all potentially preempted kprobe trampoline executions to have completed. This guarantees that any freed trampoline memory is not in use by any task in the system anymore.
synchronize_rcu_tasks() gives such a guarantee, so use it. This also guarantees that we wait for all tasks that were potentially preempted on the instructions which will be replaced with a jump. Since this becomes a problem only when CONFIG_PREEMPT=y, enable CONFIG_TASKS_RCU=y for synchronize_rcu_tasks() in that case. Signed-off-by: Masami Hiramatsu Acked-by: Paul E. McKenney Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Naveen N. Rao Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150845661962.5443.17724352636247312231.stgit@devbox Signed-off-by: Ingo Molnar --- arch/Kconfig | 2 +- kernel/kprobes.c | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 1aafb4efbb51..f75c8e8a229b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -90,7 +90,7 @@ config STATIC_KEYS_SELFTEST config OPTPROBES def_bool y depends on KPROBES && HAVE_OPTPROBES - depends on !PREEMPT + select TASKS_RCU if PREEMPT config KPROBES_ON_FTRACE def_bool y diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 15fba7fe57c8..a8fc1492b308 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -573,13 +573,15 @@ static void kprobe_optimizer(struct work_struct *work) do_unoptimize_kprobes(); /* - * Step 2: Wait for quiesence period to ensure all running interrupts - * are done. Because optprobe may modify multiple instructions - * there is a chance that Nth instruction is interrupted. In that - * case, running interrupt can return to 2nd-Nth byte of jump - * instruction. This wait is for avoiding it. + * Step 2: Wait for a quiescence period to ensure that all potentially + * preempted tasks have normally scheduled. Because an optprobe + * may modify multiple instructions, there is a chance that the Nth + * instruction is preempted. In that case, such tasks can return + * to the 2nd-Nth byte of the jump instruction. This wait avoids that. + * Note that on a non-preemptive kernel, this is transparently converted + * to synchronize_sched() to wait for all interrupts to have completed. */ - synchronize_sched(); + synchronize_rcu_tasks(); /* Step 3: Optimize kprobes after quiesence period */ do_optimize_kprobes(); From da20ab35180780e4a6eadc804544f1fa967f3567 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 18 Oct 2017 10:21:07 -0700 Subject: [PATCH 0497/1085] x86/entry: Use SYSCALL_DEFINE() macros for sys_modify_ldt() We do not have tracepoints for sys_modify_ldt() because we define it directly instead of using the normal SYSCALL_DEFINEx() macros. However, there is a reason sys_modify_ldt() does not use the macros: it has an 'int' return type instead of 'unsigned long'. This is a bug, but it's a bug cemented in the ABI. What does this mean? If we return -EINVAL from a function that returns 'int', we have 0x00000000ffffffea in %rax. But, if we return -EINVAL from a function returning 'unsigned long', we end up with 0xffffffffffffffea in %rax, which is wrong. To work around this and maintain the 'int' behavior while using the SYSCALL_DEFINEx() macros, we add a cast to 'unsigned int' in both implementations of sys_modify_ldt().
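The zero- versus sign-extension difference the cast exploits is easy to reproduce in userspace on an LP64 target; a minimal sketch (the helper names are invented for illustration, and EINVAL's value is hard-coded to match the kernel's):

#include <stdio.h>

#define EINVAL 22	/* the kernel's errno value behind -EINVAL */

/* Error returned through a plain 'long': the negative value is
 * sign-extended through the full 64-bit register. */
static long ret_as_long(void) { return -EINVAL; }

/* The patch's approach: squeeze the value through 'unsigned int'
 * first, so only the low 32 bits survive -- the historical 'int'
 * ABI of sys_modify_ldt(). */
static long ret_as_uint(void) { return (unsigned int)-EINVAL; }

int main(void)
{
	printf("%#lx\n", (unsigned long)ret_as_long()); /* 0xffffffffffffffea */
	printf("%#lx\n", (unsigned long)ret_as_uint()); /* 0xffffffea */
	return 0;
}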
Signed-off-by: Dave Hansen Reviewed-by: Andy Lutomirski Reviewed-by: Brian Gerst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171018172107.1A79C532@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 2 +- arch/x86/kernel/ldt.c | 16 +++++++++++++--- arch/x86/um/ldt.c | 7 +++++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 91dfcafe27a6..bad25bb80679 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); asmlinkage long sys_iopl(unsigned int); /* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ asmlinkage long sys_rt_sigreturn(void); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index f0e64db18ac8..0402d44deb4d 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,6 +12,7 @@ #include #include #include +#include <linux/syscalls.h> #include #include #include @@ -294,8 +295,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int, func, void __user *, ptr, + unsigned long, bytecount) { int ret = -ENOSYS; @@ -313,5 +314,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr, ret = write_ldt(ptr, bytecount, 0); break; } - return ret; + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but the ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code. The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. + */ + return (unsigned int)ret; } diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 836a1eb5df43..3ee234b6234d 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -6,6 +6,7 @@ #include #include #include +#include <linux/syscalls.h> #include #include #include @@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm) mm->arch.ldt.entry_count = 0; } -int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int, func, void __user *, ptr, + unsigned long, bytecount) { - return do_modify_ldt_skas(func, ptr, bytecount); + /* See non-um modify_ldt() for why we do this cast */ + return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount); } From 590c845930457d25d27dc1fdd964a1ce18ef2d7d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Oct 2017 08:14:37 +0900 Subject: [PATCH 0498/1085] kprobes: Disable the jprobes APIs Disable the jprobes APIs and comment out the jprobes API function code. This is in preparation for removing all jprobes related code (including kprobe's break_handler). Nowadays ftrace and other tracing features are mature enough to replace the jprobes use-cases. Users can safely use ftrace and perf probe etc. for their use cases. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Ian McDonald Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E .
McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vlad Yasevich Link: http://lkml.kernel.org/r/150724527741.5014.15465541485637899227.stgit@devbox Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 40 ++++++++++++++++++---------------------- kernel/kprobes.c | 2 ++ 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index bd2684700b74..56b2e698dbad 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -391,10 +391,6 @@ int register_kprobes(struct kprobe **kps, int num); void unregister_kprobes(struct kprobe **kps, int num); int setjmp_pre_handler(struct kprobe *, struct pt_regs *); int longjmp_break_handler(struct kprobe *, struct pt_regs *); -int register_jprobe(struct jprobe *p); -void unregister_jprobe(struct jprobe *p); -int register_jprobes(struct jprobe **jps, int num); -void unregister_jprobes(struct jprobe **jps, int num); void jprobe_return(void); unsigned long arch_deref_entry_point(void *); @@ -443,20 +439,6 @@ static inline void unregister_kprobe(struct kprobe *p) static inline void unregister_kprobes(struct kprobe **kps, int num) { } -static inline int register_jprobe(struct jprobe *p) -{ - return -ENOSYS; -} -static inline int register_jprobes(struct jprobe **jps, int num) -{ - return -ENOSYS; -} -static inline void unregister_jprobe(struct jprobe *p) -{ -} -static inline void unregister_jprobes(struct jprobe **jps, int num) -{ -} static inline void jprobe_return(void) { } @@ -486,6 +468,20 @@ static inline int enable_kprobe(struct kprobe *kp) return -ENOSYS; } #endif /* CONFIG_KPROBES */ +static inline int __deprecated register_jprobe(struct jprobe *p) +{ + return -ENOSYS; +} +static inline int __deprecated register_jprobes(struct jprobe **jps, int num) +{ + return -ENOSYS; +} +static inline void __deprecated unregister_jprobe(struct jprobe *p) +{ +} +static inline void __deprecated unregister_jprobes(struct jprobe **jps, int num) +{ +} static inline int disable_kretprobe(struct kretprobe *rp) { return disable_kprobe(&rp->kp); @@ -494,13 +490,13 @@ static inline int enable_kretprobe(struct kretprobe *rp) { return enable_kprobe(&rp->kp); } -static inline int disable_jprobe(struct jprobe *jp) +static inline int __deprecated disable_jprobe(struct jprobe *jp) { - return disable_kprobe(&jp->kp); + return -ENOSYS; } -static inline int enable_jprobe(struct jprobe *jp) +static inline int __deprecated enable_jprobe(struct jprobe *jp) { - return enable_kprobe(&jp->kp); + return -ENOSYS; } #ifndef CONFIG_KPROBES diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a8fc1492b308..da2ccf142358 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1771,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } +#if 0 int register_jprobes(struct jprobe **jps, int num) { int ret = 0, i; @@ -1839,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num) } } EXPORT_SYMBOL_GPL(unregister_jprobes); +#endif #ifdef CONFIG_KRETPROBES /* From 2c7d662e2647aa55fa56dc449b3878ac24e17adf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Oct 2017 08:15:17 +0900 Subject: [PATCH 0499/1085] kprobes: Disable the jprobes test code Disable jprobes test code because jprobes are deprecated. This code will be completely removed when the jprobe code is removed. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . 
Miller Cc: Ian McDonald Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vlad Yasevich Link: http://lkml.kernel.org/r/150724531730.5014.6377596890962355763.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/test_kprobes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 47106a1e645a..dd53e354f630 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -22,7 +22,7 @@ #define div_factor 3 -static u32 rand1, preh_val, posth_val, jph_val; +static u32 rand1, preh_val, posth_val; static int errors, handler_errors, num_tests; static u32 (*target)(u32 value); static u32 (*target2)(u32 value); @@ -162,6 +162,9 @@ static int test_kprobes(void) } +#if 0 +static u32 jph_val; + static u32 j_kprobe_target(u32 value) { if (preemptible()) { @@ -239,6 +242,10 @@ static int test_jprobes(void) return 0; } +#else +#define test_jprobe() (0) +#define test_jprobes() (0) +#endif #ifdef CONFIG_KRETPROBES static u32 krph_val; From 9be95bdc53c12ada23e39027237fd05e1393d893 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Oct 2017 08:15:57 +0900 Subject: [PATCH 0500/1085] kprobes: Remove the jprobes sample code Remove the jprobes sample module because jprobes are deprecated. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Ian McDonald Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vlad Yasevich Link: http://lkml.kernel.org/r/150724535709.5014.7261513316230565780.stgit@devbox Signed-off-by: Ingo Molnar --- samples/kprobes/Makefile | 2 +- samples/kprobes/jprobe_example.c | 67 -------------------------------- 2 files changed, 1 insertion(+), 68 deletions(-) delete mode 100644 samples/kprobes/jprobe_example.c diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile index 68739bc4fc6a..880e54d2c082 100644 --- a/samples/kprobes/Makefile +++ b/samples/kprobes/Makefile @@ -1,5 +1,5 @@ # builds the kprobes example kernel modules; # then to use one (as root): insmod -obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o jprobe_example.o +obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c deleted file mode 100644 index e3c0a40909f7..000000000000 --- a/samples/kprobes/jprobe_example.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Here's a sample kernel module showing the use of jprobes to dump - * the arguments of _do_fork(). - * - * For more information on theory of operation of jprobes, see - * Documentation/kprobes.txt - * - * Build and insert the kernel module as done in the kprobe example. - * You will see the trace data in /var/log/messages and on the - * console whenever _do_fork() is invoked to create a new process. - * (Some messages may be suppressed if syslogd is configured to - * eliminate duplicate messages.) - */ - -#include -#include -#include - -/* - * Jumper probe for _do_fork. - * Mirror principle enables access to arguments of the probed routine - * from the probe handler. 
- */ - -/* Proxy routine having the same arguments as actual _do_fork() routine */ -static long j_do_fork(unsigned long clone_flags, unsigned long stack_start, - unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr, unsigned long tls) -{ - pr_info("jprobe: clone_flags = 0x%lx, stack_start = 0x%lx " - "stack_size = 0x%lx\n", clone_flags, stack_start, stack_size); - - /* Always end with a call to jprobe_return(). */ - jprobe_return(); - return 0; -} - -static struct jprobe my_jprobe = { - .entry = j_do_fork, - .kp = { - .symbol_name = "_do_fork", - }, -}; - -static int __init jprobe_init(void) -{ - int ret; - - ret = register_jprobe(&my_jprobe); - if (ret < 0) { - pr_err("register_jprobe failed, returned %d\n", ret); - return -1; - } - pr_info("Planted jprobe at %p, handler addr %p\n", - my_jprobe.kp.addr, my_jprobe.entry); - return 0; -} - -static void __exit jprobe_exit(void) -{ - unregister_jprobe(&my_jprobe); - pr_info("jprobe at %p unregistered\n", my_jprobe.kp.addr); -} - -module_init(jprobe_init) -module_exit(jprobe_exit) -MODULE_LICENSE("GPL"); From 9b17374e11c7ce2cf0b2b990fa4aa0360921aa2b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Oct 2017 08:16:37 +0900 Subject: [PATCH 0501/1085] kprobes/docs: Remove jprobes related documents Remove jprobes related documentation from kprobes.txt. Also add some migration advice for the people who are still using jprobes. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Ian McDonald Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vlad Yasevich Link: http://lkml.kernel.org/r/150724539698.5014.7300022363980503141.stgit@devbox [ Fixes to the new documentation. ] Signed-off-by: Ingo Molnar --- Documentation/kprobes.txt | 159 ++++++++++++++------------------------ 1 file changed, 57 insertions(+), 102 deletions(-) diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 2335715bf471..22208bf2386d 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -8,7 +8,7 @@ Kernel Probes (Kprobes) .. CONTENTS - 1. Concepts: Kprobes, Jprobes, Return Probes + 1. Concepts: Kprobes, and Return Probes 2. Architectures Supported 3. Configuring Kprobes 4. API Reference @@ -16,12 +16,12 @@ Kernel Probes (Kprobes) 6. Probe Overhead 7. TODO 8. Kprobes Example - 9. Jprobes Example - 10. Kretprobes Example + 9. Kretprobes Example + 10. Deprecated Features Appendix A: The kprobes debugfs interface Appendix B: The kprobes sysctl interface -Concepts: Kprobes, Jprobes, Return Probes +Concepts: Kprobes and Return Probes ========================================= Kprobes enables you to dynamically break into any kernel routine and @@ -32,12 +32,10 @@ routine to be invoked when the breakpoint is hit. .. [1] some parts of the kernel code can not be trapped, see :ref:`kprobes_blacklist`) -There are currently three types of probes: kprobes, jprobes, and -kretprobes (also called return probes). A kprobe can be inserted -on virtually any instruction in the kernel. A jprobe is inserted at -the entry to a kernel function, and provides convenient access to the -function's arguments. A return probe fires when a specified function -returns. +There are currently two types of probes: kprobes, and kretprobes +(also called return probes). A kprobe can be inserted on virtually +any instruction in the kernel. 
A return probe fires when a specified +function returns. In the typical case, Kprobes-based instrumentation is packaged as a kernel module. The module's init function installs ("registers") @@ -82,45 +80,6 @@ After the instruction is single-stepped, Kprobes executes the "post_handler," if any, that is associated with the kprobe. Execution then continues with the instruction following the probepoint. -How Does a Jprobe Work? ------------------------ - -A jprobe is implemented using a kprobe that is placed on a function's -entry point. It employs a simple mirroring principle to allow -seamless access to the probed function's arguments. The jprobe -handler routine should have the same signature (arg list and return -type) as the function being probed, and must always end by calling -the Kprobes function jprobe_return(). - -Here's how it works. When the probe is hit, Kprobes makes a copy of -the saved registers and a generous portion of the stack (see below). -Kprobes then points the saved instruction pointer at the jprobe's -handler routine, and returns from the trap. As a result, control -passes to the handler, which is presented with the same register and -stack contents as the probed function. When it is done, the handler -calls jprobe_return(), which traps again to restore the original stack -contents and processor state and switch to the probed function. - -By convention, the callee owns its arguments, so gcc may produce code -that unexpectedly modifies that portion of the stack. This is why -Kprobes saves a copy of the stack and restores it after the jprobe -handler has run. Up to MAX_STACK_SIZE bytes are copied -- e.g., -64 bytes on i386. - -Note that the probed function's args may be passed on the stack -or in registers. The jprobe will work in either case, so long as the -handler's prototype matches that of the probed function. - -Note that in some architectures (e.g.: arm64 and sparc64) the stack -copy is not done, as the actual location of stacked parameters may be -outside of a reasonable MAX_STACK_SIZE value and because that location -cannot be determined by the jprobes code. In this case the jprobes -user must be careful to make certain the calling signature of the -function does not cause parameters to be passed on the stack (e.g.: -more than eight function arguments, an argument of more than sixteen -bytes, or more than 64 bytes of argument data, depending on -architecture). - Return Probes ------------- @@ -245,8 +204,7 @@ Pre-optimization After preparing the detour buffer, Kprobes verifies that none of the following situations exist: -- The probe has either a break_handler (i.e., it's a jprobe) or a - post_handler. +- The probe has a post_handler. - Other instructions in the optimized region are probed. - The probe is disabled. @@ -331,7 +289,7 @@ rejects registering it, if the given address is in the blacklist. Architectures Supported ======================= -Kprobes, jprobes, and return probes are implemented on the following +Kprobes and return probes are implemented on the following architectures: - i386 (Supports jump optimization) @@ -446,27 +404,6 @@ architecture-specific trap number associated with the fault (e.g., on i386, 13 for a general protection fault or 14 for a page fault). Returns 1 if it successfully handled the exception. -register_jprobe ---------------- - -:: - - #include - int register_jprobe(struct jprobe *jp) - -Sets a breakpoint at the address jp->kp.addr, which must be the address -of the first instruction of a function. 
When the breakpoint is hit, -Kprobes runs the handler whose address is jp->entry. - -The handler should have the same arg list and return type as the probed -function; and just before it returns, it must call jprobe_return(). -(The handler never actually returns, since jprobe_return() returns -control to Kprobes.) If the probed function is declared asmlinkage -or anything else that affects how args are passed, the handler's -declaration must match. - -register_jprobe() returns 0 on success, or a negative errno otherwise. - register_kretprobe ------------------ @@ -513,7 +450,6 @@ unregister_*probe #include void unregister_kprobe(struct kprobe *kp); - void unregister_jprobe(struct jprobe *jp); void unregister_kretprobe(struct kretprobe *rp); Removes the specified probe. The unregister function can be called @@ -532,7 +468,6 @@ register_*probes #include int register_kprobes(struct kprobe **kps, int num); int register_kretprobes(struct kretprobe **rps, int num); - int register_jprobes(struct jprobe **jps, int num); Registers each of the num probes in the specified array. If any error occurs during registration, all probes in the array, up to @@ -555,7 +490,6 @@ unregister_*probes #include void unregister_kprobes(struct kprobe **kps, int num); void unregister_kretprobes(struct kretprobe **rps, int num); - void unregister_jprobes(struct jprobe **jps, int num); Removes each of the num probes in the specified array at once. @@ -574,7 +508,6 @@ disable_*probe #include int disable_kprobe(struct kprobe *kp); int disable_kretprobe(struct kretprobe *rp); - int disable_jprobe(struct jprobe *jp); Temporarily disables the specified ``*probe``. You can enable it again by using enable_*probe(). You must specify the probe which has been registered. @@ -587,7 +520,6 @@ enable_*probe #include int enable_kprobe(struct kprobe *kp); int enable_kretprobe(struct kretprobe *rp); - int enable_jprobe(struct jprobe *jp); Enables ``*probe`` which has been disabled by disable_*probe(). You must specify the probe which has been registered. @@ -595,12 +527,10 @@ the probe which has been registered. Kprobes Features and Limitations ================================ -Kprobes allows multiple probes at the same address. Currently, -however, there cannot be multiple jprobes on the same function at -the same time. Also, a probepoint for which there is a jprobe or -a post_handler cannot be optimized. So if you install a jprobe, -or a kprobe with a post_handler, at an optimized probepoint, the -probepoint will be unoptimized automatically. +Kprobes allows multiple probes at the same address. Also, +a probepoint for which there is a post_handler cannot be optimized. +So if you install a kprobe with a post_handler, at an optimized +probepoint, the probepoint will be unoptimized automatically. In general, you can install a probe anywhere in the kernel. In particular, you can probe interrupt handlers. Known exceptions @@ -662,7 +592,7 @@ We're unaware of other specific cases where this could be a problem. If, upon entry to or exit from a function, the CPU is running on a stack other than that of the current task, registering a return probe on that function may produce undesirable results. For this -reason, Kprobes doesn't support return probes (or kprobes or jprobes) +reason, Kprobes doesn't support return probes (or kprobes) on the x86_64 version of __switch_to(); the registration functions return -EINVAL. @@ -706,24 +636,24 @@ Probe Overhead On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0 microseconds to process. 
Specifically, a benchmark that hits the same probepoint repeatedly, firing a simple handler each time, reports 1-2 -million hits per second, depending on the architecture. A jprobe or -return-probe hit typically takes 50-75% longer than a kprobe hit. +million hits per second, depending on the architecture. A return-probe +hit typically takes 50-75% longer than a kprobe hit. When you have a return probe set on a function, adding a kprobe at the entry to that function adds essentially no overhead. Here are sample overhead figures (in usec) for different architectures:: - k = kprobe; j = jprobe; r = return probe; kr = kprobe + return probe - on same function; jr = jprobe + return probe on same function:: + k = kprobe; r = return probe; kr = kprobe + return probe + on same function i386: Intel Pentium M, 1495 MHz, 2957.31 bogomips - k = 0.57 usec; j = 1.00; r = 0.92; kr = 0.99; jr = 1.40 + k = 0.57 usec; r = 0.92; kr = 0.99 x86_64: AMD Opteron 246, 1994 MHz, 3971.48 bogomips - k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07 + k = 0.49 usec; r = 0.80; kr = 0.82 ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU) - k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99 + k = 0.77 usec; r = 1.26; kr = 1.45 Optimized Probe Overhead ------------------------ @@ -755,11 +685,6 @@ Kprobes Example See samples/kprobes/kprobe_example.c -Jprobes Example -=============== - -See samples/kprobes/jprobe_example.c - Kretprobes Example ================== @@ -772,6 +697,37 @@ For additional information on Kprobes, refer to the following URLs: - http://www-users.cs.umn.edu/~boutcher/kprobes/ - http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) +Deprecated Features +=================== + +Jprobes is now a deprecated feature. People who are depending on it should +migrate to other tracing features or use older kernels. Please consider +migrating your tool to one of the following options: + +- Use trace-event to trace the target function with arguments. + + trace-event is a low-overhead (and almost no visible overhead if it + is off) statically defined event interface. You can define new events + and trace them via ftrace or any other tracing tools. + + See the following URLs: + + - https://lwn.net/Articles/379903/ + - https://lwn.net/Articles/381064/ + - https://lwn.net/Articles/383362/ + +- Use ftrace dynamic events (kprobe event) with perf-probe. + + If you build your kernel with debug info (CONFIG_DEBUG_INFO=y), you can + find which register/stack slot is assigned to which local variable or + argument by using perf-probe and set up a new event to trace it. + + See the following documents: + + - Documentation/trace/kprobetrace.txt + - Documentation/trace/events.txt + - tools/perf/Documentation/perf-probe.txt + The kprobes debugfs interface ============================= @@ -783,14 +739,13 @@ under the /sys/kernel/debug/kprobes/ directory (assuming debugfs is mounted at / /sys/kernel/debug/kprobes/list: Lists all registered probes on the system:: c015d71a k vfs_read+0x0 - c011a316 j do_fork+0x0 c03dedc5 r tcp_v4_rcv+0x0 The first column provides the kernel address where the probe is inserted. -The second column identifies the type of probe (k - kprobe, r - kretprobe -and j - jprobe), while the third column specifies the symbol+offset of -the probe. If the probed function belongs to a module, the module name -is also specified. Following columns show probe status.
If the probe is on +The second column identifies the type of probe (k - kprobe and r - kretprobe) +while the third column specifies the symbol+offset of the probe. +If the probed function belongs to a module, the module name is also +specified. Following columns show probe status. If the probe is on a virtual address that is no longer valid (module init sections, module virtual addresses that correspond to modules that've been unloaded), such probes are marked with [GONE]. If the probe is temporarily disabled, From 52f737c2da40259ac9962170ce608b6fb1b55ee4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 13:28:38 -0700 Subject: [PATCH 0502/1085] timer: Provide wrappers safe for use with LOCKDEP Under LOCKDEP, the timer lock_class_key (set up in __setup_timer) needs to be tied to the caller's context, so an inline for timer_setup() won't work. We do, however, want to keep the inline version around for argument type checking, though, so this provides macro wrappers in the LOCKDEP case. This fixes the case of different timers sharing the same LOCKDEP instance, and producing a false positive warning: [ 580.840858] ====================================================== [ 580.842299] WARNING: possible circular locking dependency detected [ 580.843684] 4.14.0-rc4+ #17 Not tainted [ 580.844554] ------------------------------------------------------ [ 580.845945] swapper/9/0 is trying to acquire lock: [ 580.847024] (slock-AF_INET){+.-.}, at: [] tcp_write_timer+0x24/0xd0 [ 580.848834] but task is already holding lock: [ 580.850107] ((timer)#2){+.-.}, at: [] call_timer_fn+0x0/0x300 [ 580.851663] which lock already depends on the new lock. [ 580.853439] the existing dependency chain (in reverse order) is: [ 580.855311] -> #1 ((timer)#2){+.-.}: [ 580.856538] __lock_acquire+0x114d/0x11a0 [ 580.857506] lock_acquire+0xb0/0x1d0 [ 580.858373] del_timer_sync+0x3c/0xb0 [ 580.859260] inet_csk_reqsk_queue_drop+0x7f/0x1b0 ... -> #0 (slock-AF_INET){+.-.}: [ 580.884980] check_prev_add+0x666/0x700 [ 580.885790] __lock_acquire+0x114d/0x11a0 [ 580.886575] lock_acquire+0xb0/0x1d0 [ 580.887289] _raw_spin_lock+0x2c/0x40 [ 580.888021] tcp_write_timer+0x24/0xd0 ... [ 580.900055] Possible unsafe locking scenario: [ 580.901043] CPU0 CPU1 [ 580.901797] ---- ---- [ 580.902540] lock((timer)#2); [ 580.903046] lock(slock-AF_INET); [ 580.904006] lock((timer)#2); [ 580.904915] lock(slock-AF_INET); [ 580.905502] In this report, del_timer_sync() is from: inet_csk_reqsk_queue_drop() reqsk_queue_unlink() del_timer_sync(&req->rsk_timer) but tcp_write_timer()'s timer is attached to icsk_retransmit_timer. Both had the same lock_class_key, since they were using timer_setup(). Switching to a macro allows for a separate context, avoiding the false positive. Fixes: 686fef928bba ("timer: Prepare to change timer callback argument type") Reported-by: Craig Gallek Suggested-by: Eric Dumazet Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: netdev@vger.kernel.org Cc: "David S. 
Miller" Link: https://lkml.kernel.org/r/20171019202838.GA43223@beast --- include/linux/timer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/timer.h b/include/linux/timer.h index 10685c33e679..09950482309b 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -150,6 +150,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define TIMER_DATA_TYPE unsigned long #define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) +#ifndef CONFIG_LOCKDEP static inline void timer_setup(struct timer_list *timer, void (*callback)(struct timer_list *), unsigned int flags) @@ -165,6 +166,19 @@ static inline void timer_setup_on_stack(struct timer_list *timer, __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback, (TIMER_DATA_TYPE)timer, flags); } +#else +/* + * Under LOCKDEP, the timer lock_class_key (set up in __init_timer) needs + * to be tied to the caller's context, so an inline (above) won't work. We + * do want to keep the inline for argument type checking, though. + */ +# define timer_setup(timer, callback, flags) \ + __setup_timer(timer, (TIMER_FUNC_TYPE)callback, \ + (TIMER_DATA_TYPE)timer, flags) +# define timer_setup_on_stack(timer, callback, flags) \ + __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback,\ + (TIMER_DATA_TYPE)timer, flags) +#endif #define from_timer(var, callback_timer, timer_fieldname) \ container_of(callback_timer, typeof(*var), timer_fieldname) From d9ee91c1b1fe56b905b767f4a0063d3624e1a1cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 Oct 2017 11:15:36 +0200 Subject: [PATCH 0503/1085] irqchip/meson: Disable COMPILE_TEST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver fails to compile with CONFIG_COMPILE_TEST=y on x86: irq-meson-gpio.c: In function ‘meson_gpio_irq_parse_dt’: irq-meson-gpio.c:343:8: error: implicit declaration of function ‘of_property_read_variable_u32_array’ ret = of_property_read_variable_u32_array(node, Adding COMPILE_TEST to a driver requires at least compile testing it for x86.... Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Heiner Kallweit Cc: Jerome Brunet Cc: Marc Zyngier --- drivers/irqchip/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 1cb39405d7f8..85d0fb2b976c 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -326,7 +326,7 @@ config IRQ_UNIPHIER_AIDET config MESON_IRQ_GPIO bool "Meson GPIO Interrupt Multiplexer" - depends on ARCH_MESON || COMPILE_TEST + depends on ARCH_MESON select IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY help From 83e3c48729d9ebb7af5a31a504f3fd6aff0348c4 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 29 Sep 2017 17:08:16 +0300 Subject: [PATCH 0504/1085] mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y Size of the mem_section[] array depends on the size of the physical address space. In preparation for boot-time switching between paging modes on x86-64 we need to make the allocation of mem_section[] dynamic, because otherwise we waste a lot of RAM: with CONFIG_NODE_SHIFT=10, mem_section[] size is 32kB for 4-level paging and 2MB for 5-level paging mode. The patch allocates the array on the first call to sparse_memory_present_with_active_regions(). Signed-off-by: Kirill A. 
Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Cyrill Gorcunov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170929140821.37654-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/mmzone.h | 6 +++++- mm/page_alloc.c | 10 ++++++++++ mm/sparse.c | 17 +++++++++++------ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c8f89417740b..e796edf1296f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1150,13 +1150,17 @@ struct mem_section { #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME -extern struct mem_section *mem_section[NR_SECTION_ROOTS]; +extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline struct mem_section *__nr_to_section(unsigned long nr) { +#ifdef CONFIG_SPARSEMEM_EXTREME + if (!mem_section) + return NULL; +#endif if (!mem_section[SECTION_NR_TO_ROOT(nr)]) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77e4d3c5c57b..8dfd13f724d9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5646,6 +5646,16 @@ void __init sparse_memory_present_with_active_regions(int nid) unsigned long start_pfn, end_pfn; int i, this_nid; +#ifdef CONFIG_SPARSEMEM_EXTREME + if (!mem_section) { + unsigned long size, align; + + size = sizeof(struct mem_section) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_virt_alloc(size, align); + } +#endif + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) memory_present(this_nid, start_pfn, end_pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index 83b3bf6461af..b00a97398795 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -22,8 +22,7 @@ * 1) mem_section - memory sections, mem_map's for valid memory */ #ifdef CONFIG_SPARSEMEM_EXTREME -struct mem_section *mem_section[NR_SECTION_ROOTS] - ____cacheline_internodealigned_in_smp; +struct mem_section **mem_section; #else struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] ____cacheline_internodealigned_in_smp; @@ -100,7 +99,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) int __section_nr(struct mem_section* ms) { unsigned long root_nr; - struct mem_section* root; + struct mem_section *root = NULL; for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); @@ -111,7 +110,7 @@ int __section_nr(struct mem_section* ms) break; } - VM_BUG_ON(root_nr == NR_SECTION_ROOTS); + VM_BUG_ON(!root); return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } @@ -329,11 +328,17 @@ again: static void __init check_usemap_section_nr(int nid, unsigned long *usemap) { unsigned long usemap_snr, pgdat_snr; - static unsigned long old_usemap_snr = NR_MEM_SECTIONS; - static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; + static unsigned long old_usemap_snr; + static unsigned long old_pgdat_snr; struct pglist_data *pgdat = NODE_DATA(nid); int usemap_nid; + /* First call */ + if (!old_usemap_snr) { + old_usemap_snr = NR_MEM_SECTIONS; + old_pgdat_snr = NR_MEM_SECTIONS; + } + usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); if (usemap_snr == pgdat_snr) From 12a8cc7fcf54a8575f094be1e99032ec38aa045c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin 
Date: Fri, 29 Sep 2017 17:08:18 +0300 Subject: [PATCH 0505/1085] x86/kasan: Use the same shadow offset for 4- and 5-level paging We are going to support boot-time switching between 4- and 5-level paging. For KASAN it means we cannot have different KASAN_SHADOW_OFFSET for different paging modes: the constant is passed to gcc to generate code and cannot be changed at runtime. This patch changes the KASAN code to use 0xdffffc0000000000 as the shadow offset for both 4- and 5-level paging. For 5-level paging it means that the shadow memory region is not aligned to a PGD boundary anymore and we have to handle unaligned parts of the region properly. In addition, we have to exclude paravirt code from KASAN instrumentation as we now use set_pgd() before KASAN is fully ready. [kirill.shutemov@linux.intel.com: cleanup, changelog message] Signed-off-by: Andrey Ryabinin Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Cyrill Gorcunov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170929140821.37654-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 2 +- arch/x86/Kconfig | 1 - arch/x86/kernel/Makefile | 3 +- arch/x86/mm/kasan_init_64.c | 101 +++++++++++++++++++++++++------- 4 files changed, 83 insertions(+), 24 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index b0798e281aa6..3448e675b462 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -34,7 +34,7 @@ ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ... unused hole ... -ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB) +ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) ... unused hole ... ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ...
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 971feac13506..32779beb56e2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -302,7 +302,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KASAN_SHADOW_OFFSET hex depends on KASAN - default 0xdff8000000000000 if X86_5LEVEL default 0xdffffc0000000000 config HAVE_INTEL_TXT diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fd0a7895b63f..a97a6b611531 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,7 +24,8 @@ endif KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n -KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_paravirt.o := n OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index bc84b73684b7..fe5760db7b19 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -15,6 +15,8 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; +static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); + static int __init map_range(struct range *range) { unsigned long start; @@ -30,8 +32,10 @@ static void __init clear_pgds(unsigned long start, unsigned long end) { pgd_t *pgd; + /* See comment in kasan_init() */ + unsigned long pgd_end = end & PGDIR_MASK; - for (; start < end; start += PGDIR_SIZE) { + for (; start < pgd_end; start += PGDIR_SIZE) { pgd = pgd_offset_k(start); /* * With folded p4d, pgd_clear() is nop, use p4d_clear() @@ -42,29 +46,61 @@ static void __init clear_pgds(unsigned long start, else pgd_clear(pgd); } + + pgd = pgd_offset_k(start); + for (; start < end; start += P4D_SIZE) + p4d_clear(p4d_offset(pgd, start)); +} + +static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) +{ + unsigned long p4d; + + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + return (p4d_t *)pgd; + + p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; + p4d += __START_KERNEL_map - phys_base; + return (p4d_t *)p4d + p4d_index(addr); +} + +static void __init kasan_early_p4d_populate(pgd_t *pgd, + unsigned long addr, + unsigned long end) +{ + pgd_t pgd_entry; + p4d_t *p4d, p4d_entry; + unsigned long next; + + if (pgd_none(*pgd)) { + pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d)); + set_pgd(pgd, pgd_entry); + } + + p4d = early_p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (!p4d_none(*p4d)) + continue; + + p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud)); + set_p4d(p4d, p4d_entry); + } while (p4d++, addr = next, addr != end && p4d_none(*p4d)); } static void __init kasan_map_early_shadow(pgd_t *pgd) { - int i; - unsigned long start = KASAN_SHADOW_START; + /* See comment in kasan_init() */ + unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK; unsigned long end = KASAN_SHADOW_END; + unsigned long next; - for (i = pgd_index(start); start < end; i++) { - switch (CONFIG_PGTABLE_LEVELS) { - case 4: - pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | - _KERNPG_TABLE); - break; - case 5: - pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) | - _KERNPG_TABLE); - break; - default: - BUILD_BUG(); - } - start += PGDIR_SIZE; - } + pgd += pgd_index(addr); + do { + next = pgd_addr_end(addr, end); + kasan_early_p4d_populate(pgd, addr, next); + } while (pgd++, addr = next, addr != end); } #ifdef CONFIG_KASAN_INLINE @@ -101,7 +137,7 @@ void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PUD; i++) kasan_zero_pud[i] = __pud(pud_val); - 
for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) + for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) kasan_zero_p4d[i] = __p4d(p4d_val); kasan_map_early_shadow(early_top_pgt); @@ -117,12 +153,35 @@ void __init kasan_init(void) #endif memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); + + /* + * We use the same shadow offset for 4- and 5-level paging to + * facilitate boot-time switching between paging modes. + * As result in 5-level paging mode KASAN_SHADOW_START and + * KASAN_SHADOW_END are not aligned to PGD boundary. + * + * KASAN_SHADOW_START doesn't share PGD with anything else. + * We claim whole PGD entry to make things easier. + * + * KASAN_SHADOW_END lands in the last PGD entry and it collides with + * bunch of things like kernel code, modules, EFI mapping, etc. + * We need to take extra steps to not overwrite them. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + void *ptr; + + ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); + memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table)); + set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)], + __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE)); + } + load_cr3(early_top_pgt); __flush_tlb_all(); - clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END); - kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, + kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), kasan_mem_to_shadow((void *)PAGE_OFFSET)); for (i = 0; i < E820_MAX_ENTRIES; i++) { From 4375c29985f155d7eb2346615d84e62d1b673682 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 29 Sep 2017 17:08:19 +0300 Subject: [PATCH 0506/1085] x86/xen: Provide pre-built page tables only for CONFIG_XEN_PV=y and CONFIG_XEN_PVH=y Looks like we only need pre-built page tables in the CONFIG_XEN_PV=y and CONFIG_XEN_PVH=y cases. Let's not provide them for other configurations. Signed-off-by: Kirill A. Shutemov Reviewed-by: Juergen Gross Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Cyrill Gorcunov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170929140821.37654-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 513cbb012ecc..2be7d1e7fcf1 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -37,11 +37,12 @@ * */ -#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) PGD_START_KERNEL = pgd_index(__START_KERNEL_map) +#endif L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -361,10 +362,7 @@ NEXT_PAGE(early_dynamic_pgts) .data -#ifndef CONFIG_XEN -NEXT_PAGE(init_top_pgt) - .fill 512,8,0 -#else +#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) NEXT_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 @@ -381,6 +379,9 @@ NEXT_PAGE(level2_ident_pgt) * Don't set NX because code runs from these pages. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#else +NEXT_PAGE(init_top_pgt) + .fill 512,8,0 #endif #ifdef CONFIG_X86_5LEVEL From 773dd2fca581b0a80e5a33332cc8ee67e5a79cba Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Fri, 29 Sep 2017 17:08:20 +0300 Subject: [PATCH 0507/1085] x86/xen: Drop 5-level paging support code from the XEN_PV code It was decided 5-level paging is not going to be supported in XEN_PV. Let's drop the dead code from the XEN_PV code. Tested-by: Juergen Gross Signed-off-by: Kirill A. Shutemov Reviewed-by: Juergen Gross Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Cyrill Gorcunov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170929140821.37654-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu_pv.c | 159 ++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 99 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 71495f1a86d7..2ccdaba31a07 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -449,7 +449,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); -#if CONFIG_PGTABLE_LEVELS == 4 +#ifdef CONFIG_X86_64 __visible pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); @@ -538,7 +538,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) xen_mc_issue(PARAVIRT_LAZY_MMU); } -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ +#endif /* CONFIG_X86_64 */ static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, int (*func)(struct mm_struct *mm, struct page *, enum pt_level), @@ -580,21 +580,17 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, int (*func)(struct mm_struct *mm, struct page *, enum pt_level), bool last, unsigned long limit) { - int i, nr, flush = 0; + int flush = 0; + pud_t *pud; - nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; - for (i = 0; i < nr; i++) { - pud_t *pud; - if (p4d_none(p4d[i])) - continue; + if (p4d_none(*p4d)) + return flush; - pud = pud_offset(&p4d[i], 0); - if (PTRS_PER_PUD > 1) - flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - flush |= xen_pud_walk(mm, pud, func, - last && i == nr - 1, limit); - } + pud = pud_offset(p4d, 0); + if (PTRS_PER_PUD > 1) + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); + flush |= xen_pud_walk(mm, pud, func, last, limit); return flush; } @@ -644,8 +640,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, continue; p4d = p4d_offset(&pgd[i], 0); - if (PTRS_PER_P4D > 1) - flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); } @@ -1176,22 +1170,14 @@ static void __init xen_cleanmfnmap(unsigned long vaddr) { pgd_t *pgd; p4d_t *p4d; - unsigned int i; bool unpin; unpin = (vaddr == 2 * PGDIR_SIZE); vaddr &= PMD_MASK; pgd = pgd_offset_k(vaddr); p4d = p4d_offset(pgd, 0); - for (i = 0; i < PTRS_PER_P4D; i++) { - if (p4d_none(p4d[i])) - continue; - xen_cleanmfnmap_p4d(p4d + i, unpin); - } - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(0)); - xen_cleanmfnmap_free_pgtbl(p4d, unpin); - } + if (!p4d_none(*p4d)) + xen_cleanmfnmap_p4d(p4d, unpin); } static void __init xen_pagetable_p2m_free(void) @@ -1692,7 +1678,7 @@ static void xen_release_pmd(unsigned long pfn) xen_release_ptpage(pfn, PT_PMD); } -#if CONFIG_PGTABLE_LEVELS >= 4 +#ifdef CONFIG_X86_64 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); @@ -2029,13 +2015,12 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) */ void __init xen_relocate_p2m(void) { - phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; unsigned long 
p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; - int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; + int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; pte_t *pt; pmd_t *pmd; pud_t *pud; - p4d_t *p4d = NULL; pgd_t *pgd; unsigned long *new_p2m; int save_pud; @@ -2045,11 +2030,7 @@ void __init xen_relocate_p2m(void) n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; - if (PTRS_PER_P4D > 1) - n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; - else - n_p4d = 0; - n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; + n_frames = n_pte + n_pt + n_pmd + n_pud; new_area = xen_find_free_area(PFN_PHYS(n_frames)); if (!new_area) { @@ -2065,76 +2046,56 @@ void __init xen_relocate_p2m(void) * To avoid any possible virtual address collision, just use * 2 * PUD_SIZE for the new area. */ - p4d_phys = new_area; - pud_phys = p4d_phys + PFN_PHYS(n_p4d); + pud_phys = new_area; pmd_phys = pud_phys + PFN_PHYS(n_pud); pt_phys = pmd_phys + PFN_PHYS(n_pmd); p2m_pfn = PFN_DOWN(pt_phys) + n_pt; pgd = __va(read_cr3_pa()); new_p2m = (unsigned long *)(2 * PGDIR_SIZE); - idx_p4d = 0; save_pud = n_pud; - do { - if (n_p4d > 0) { - p4d = early_memremap(p4d_phys, PAGE_SIZE); - clear_page(p4d); - n_pud = min(save_pud, PTRS_PER_P4D); - } - for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { - pud = early_memremap(pud_phys, PAGE_SIZE); - clear_page(pud); - for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); - idx_pmd++) { - pmd = early_memremap(pmd_phys, PAGE_SIZE); - clear_page(pmd); - for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); - idx_pt++) { - pt = early_memremap(pt_phys, PAGE_SIZE); - clear_page(pt); - for (idx_pte = 0; - idx_pte < min(n_pte, PTRS_PER_PTE); - idx_pte++) { - set_pte(pt + idx_pte, - pfn_pte(p2m_pfn, PAGE_KERNEL)); - p2m_pfn++; - } - n_pte -= PTRS_PER_PTE; - early_memunmap(pt, PAGE_SIZE); - make_lowmem_page_readonly(__va(pt_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, - PFN_DOWN(pt_phys)); - set_pmd(pmd + idx_pt, - __pmd(_PAGE_TABLE | pt_phys)); - pt_phys += PAGE_SIZE; + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; } - n_pt -= PTRS_PER_PMD; - early_memunmap(pmd, PAGE_SIZE); - make_lowmem_page_readonly(__va(pmd_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, - PFN_DOWN(pmd_phys)); - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); - pmd_phys += PAGE_SIZE; + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; } - n_pmd -= PTRS_PER_PUD; - early_memunmap(pud, PAGE_SIZE); - make_lowmem_page_readonly(__va(pud_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); - if (n_p4d > 0) - set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); - else - set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); - pud_phys += PAGE_SIZE; + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + 
make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; } - if (n_p4d > 0) { - save_pud -= PTRS_PER_P4D; - early_memunmap(p4d, PAGE_SIZE); - make_lowmem_page_readonly(__va(p4d_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); - set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); - p4d_phys += PAGE_SIZE; - } - } while (++idx_p4d < n_p4d); + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } /* Now copy the old p2m info to the new area. */ memcpy(new_p2m, xen_p2m_addr, size); @@ -2361,7 +2322,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -#if CONFIG_PGTABLE_LEVELS >= 4 +#ifdef CONFIG_X86_64 pv_mmu_ops.set_p4d = xen_set_p4d; #endif @@ -2371,7 +2332,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; -#if CONFIG_PGTABLE_LEVELS >= 4 +#ifdef CONFIG_X86_64 pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif @@ -2435,14 +2396,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), -#if CONFIG_PGTABLE_LEVELS >= 4 +#ifdef CONFIG_X86_64 .pud_val = PV_CALLEE_SAVE(xen_pud_val), .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_p4d = xen_set_p4d_hyper, .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ +#endif /* CONFIG_X86_64 */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap,
From d92f842bb30f52beedad63a4a850b39ca0dbc45f Mon Sep 17 00:00:00 2001 From: Scott Tsai Date: Wed, 20 Sep 2017 02:16:00 +0800 Subject: [PATCH 0508/1085] memory-barriers.txt: Fix typo in pairing example In the "general barrier pairing with implicit control dependency" example, the last write by CPU 1 was meant to change variable x and not y. The example would be pretty uninteresting if no CPU ever changes x and the variable was initialized to zero. Signed-off-by: Scott Tsai Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 7deee1441640..f37375544d71 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -947,7 +947,7 @@ Or even: =============== =============================== r1 = READ_ONCE(y); - WRITE_ONCE(y, 1); if (r2 = READ_ONCE(x)) { + WRITE_ONCE(x, 1); if (r2 = READ_ONCE(x)) { WRITE_ONCE(y, 1); }
From 5692fcc671ac8a10bfd0e75d52f0259fe767b56c Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 21 Sep 2017 16:29:01 -0300 Subject: [PATCH 0509/1085] doc: Rewrite confusing statement about memory barriers The "Write (or store) memory barriers" bullet of the "Variety of memory barriers" section calls out a sequential order of stores, which is confusing since sequential ordering is not guaranteed. This commit therefore rewords the bullet to avoid mentioning a sequence of stores, clarifying the intent. Cc: Paul E. McKenney Signed-off-by: Guilherme G.
Piccoli Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f37375544d71..519940ec767f 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -383,8 +383,8 @@ Memory barriers come in four basic varieties: to have any effect on loads. A CPU can be viewed as committing a sequence of store operations to the - memory system as time progresses. All stores before a write barrier will - occur in the sequence _before_ all the stores after the write barrier. + memory system as time progresses. All stores _before_ a write barrier + will occur _before_ all the stores after the write barrier. [!] Note that write barriers should normally be paired with read or data dependency barriers; see the "SMP barrier pairing" subsection. From e4d0b679a8467b6d0c169642f0b5f57d8d6eacc2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Sep 2017 14:30:06 -0700 Subject: [PATCH 0510/1085] srcu: Add parameters to SRCU docbook comments Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + kernel/rcu/srcutree.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 39af9bc0f653..62be8966e837 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -78,6 +78,7 @@ void synchronize_srcu(struct srcu_struct *sp); /** * srcu_read_lock_held - might we be in SRCU read-side critical section? + * @sp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 729a8706751d..6d5880089ff6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, /** * call_srcu() - Queue a callback for invocation after an SRCU grace period * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. + * @rhp: structure to be used for queueing the SRCU callback. * @func: function to be invoked after the SRCU grace period * * The callback function will be invoked some time after a full SRCU From 927340926ed61477e34f960eec64b7532e35d2f0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 14:29:04 +0200 Subject: [PATCH 0511/1085] tomoyo: fix timestamping for y2038 Tomoyo uses an open-coded version of time_to_tm() to create a timestamp from the current time as read by get_seconds(). This will overflow and give wrong results on 32-bit systems in 2038. To correct this, this changes the code to use ktime_get_real_seconds() and the generic time64_to_tm() function that are both y2038-safe. Using the library function avoids adding an expensive 64-bit division in this code and can benefit from any optimizations we do in common code. 
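To make the overflow concrete: a signed 32-bit time value runs out on 19 January 2038, which is why the open-coded conversion below is dropped. A minimal sketch of the y2038-safe pattern the patch adopts, using the same in-kernel helpers; the wrapper function and the printed format are illustrative, not part of the patch:

#include <linux/printk.h>
#include <linux/time.h>		/* struct tm, time64_to_tm() */
#include <linux/timekeeping.h>	/* ktime_get_real_seconds() */

static void example_print_timestamp(void)
{
	time64_t now = ktime_get_real_seconds();	/* 64-bit, y2038-safe */
	struct tm tm;

	time64_to_tm(now, 0, &tm);			/* offset 0 = UTC */
	pr_info("%04ld/%02d/%02d %02d:%02d:%02d\n",
		tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
		tm.tm_hour, tm.tm_min, tm.tm_sec);
}

time64_to_tm() does exactly the calendar math that tomoyo_convert_time() used to open-code, without any 32-bit arithmetic on the seconds value.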
Acked-by: Tetsuo Handa Signed-off-by: Arnd Bergmann Signed-off-by: James Morris --- security/tomoyo/audit.c | 2 +- security/tomoyo/common.c | 4 ++-- security/tomoyo/common.h | 2 +- security/tomoyo/util.c | 39 +++++++++------------------------------ 4 files changed, 13 insertions(+), 34 deletions(-) diff --git a/security/tomoyo/audit.c b/security/tomoyo/audit.c index 3ffa4f5509d8..a51edfbe593b 100644 --- a/security/tomoyo/audit.c +++ b/security/tomoyo/audit.c @@ -156,7 +156,7 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r) if (!buffer) return NULL; - tomoyo_convert_time(get_seconds(), &stamp); + tomoyo_convert_time(ktime_get_real_seconds(), &stamp); pos = snprintf(buffer, tomoyo_buffer_len - 1, "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s " diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index e0fb75052550..c19970db89c4 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -2256,7 +2256,7 @@ static const char * const tomoyo_memory_headers[TOMOYO_MAX_MEMORY_STAT] = { /* Timestamp counter for last updated. */ static unsigned int tomoyo_stat_updated[TOMOYO_MAX_POLICY_STAT]; /* Counter for number of updates. */ -static unsigned int tomoyo_stat_modified[TOMOYO_MAX_POLICY_STAT]; +static time64_t tomoyo_stat_modified[TOMOYO_MAX_POLICY_STAT]; /** * tomoyo_update_stat - Update statistic counters. @@ -2271,7 +2271,7 @@ void tomoyo_update_stat(const u8 index) * I don't use atomic operations because race condition is not fatal. */ tomoyo_stat_updated[index]++; - tomoyo_stat_modified[index] = get_seconds(); + tomoyo_stat_modified[index] = ktime_get_real_seconds(); } /** diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h index 361e7a284699..d9628d1635b2 100644 --- a/security/tomoyo/common.h +++ b/security/tomoyo/common.h @@ -1036,7 +1036,7 @@ void tomoyo_check_acl(struct tomoyo_request_info *r, bool (*check_entry) (struct tomoyo_request_info *, const struct tomoyo_acl_info *)); void tomoyo_check_profile(void); -void tomoyo_convert_time(time_t time, struct tomoyo_time *stamp); +void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp); void tomoyo_del_condition(struct list_head *element); void tomoyo_fill_path_info(struct tomoyo_path_info *ptr); void tomoyo_get_attributes(struct tomoyo_obj_info *obj); diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c index 848317fea704..cac431d381d2 100644 --- a/security/tomoyo/util.c +++ b/security/tomoyo/util.c @@ -86,38 +86,17 @@ const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = { * @stamp: Pointer to "struct tomoyo_time". * * Returns nothing. - * - * This function does not handle Y2038 problem. */ -void tomoyo_convert_time(time_t time, struct tomoyo_time *stamp) +void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp) { - static const u16 tomoyo_eom[2][12] = { - { 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 }, - { 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 } - }; - u16 y; - u8 m; - bool r; - stamp->sec = time % 60; - time /= 60; - stamp->min = time % 60; - time /= 60; - stamp->hour = time % 24; - time /= 24; - for (y = 1970; ; y++) { - const unsigned short days = (y & 3) ? 
365 : 366; - if (time < days) - break; - time -= days; - } - r = (y & 3) == 0; - for (m = 0; m < 11 && time >= tomoyo_eom[r][m]; m++) - ; - if (m) - time -= tomoyo_eom[r][m - 1]; - stamp->year = y; - stamp->month = ++m; - stamp->day = ++time; + struct tm tm; + time64_to_tm(time64, 0, &tm); + stamp->sec = tm.tm_sec; + stamp->min = tm.tm_min; + stamp->hour = tm.tm_hour; + stamp->day = tm.tm_mday; + stamp->month = tm.tm_mon + 1; + stamp->year = tm.tm_year + 1900; } /** From 95953034fb24c16ad0047a98b16427e5935830c4 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 20 Oct 2017 02:16:57 -0700 Subject: [PATCH 0512/1085] x86/intel_rdt: Initialize bitmask of shareable resource if CDP enabled The platform informs via CPUID.(EAX=0x10, ECX=res#):EBX[31:0] (valid res# are only 1 for L3 and 2 for L2) which unit of the allocation may be used by other entities in the platform. This information is valid whether CDP (Code and Data Prioritization) is enabled or not. Ensure that the bitmask of shareable resource is initialized when CDP is enabled. Fixes: 0dd2d7494cd8 ("x86/intel_rdt: Show bitmask of shareable resource with other executing units" Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Acked-by: Fenghua Yu Acked-by: Vikas Shivappa Acked-by: Tony Luck Link: https://lkml.kernel.org/r/815747bddc820ca221a8924edaf4d1a7324547e4.1508490116.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index cd5fc61ba450..88dcf8479013 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -267,6 +267,7 @@ static void rdt_get_cdp_l3_config(int type) r->num_closid = r_l3->num_closid / 2; r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; + r->cache.shareable_bits = r_l3->cache.shareable_bits; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; /* From 36b6f9fcb8928c06b6638a4cf91bc9d69bb49aa2 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 20 Oct 2017 02:16:58 -0700 Subject: [PATCH 0513/1085] x86/intel_rdt: Fix potential deadlock during resctrl unmount Lockdep warns about a potential deadlock: [ 66.782842] ====================================================== [ 66.782888] WARNING: possible circular locking dependency detected [ 66.782937] 4.14.0-rc2-test-test+ #48 Not tainted [ 66.782983] ------------------------------------------------------ [ 66.783052] umount/336 is trying to acquire lock: [ 66.783117] (cpu_hotplug_lock.rw_sem){++++}, at: [] rdt_kill_sb+0x215/0x390 [ 66.783193] but task is already holding lock: [ 66.783244] (rdtgroup_mutex){+.+.}, at: [] rdt_kill_sb+0x36/0x390 [ 66.783305] which lock already depends on the new lock. 
[ 66.783364] the existing dependency chain (in reverse order) is: [ 66.783419] -> #3 (rdtgroup_mutex){+.+.}: [ 66.783467] __lock_acquire+0x1293/0x13f0 [ 66.783509] lock_acquire+0xaf/0x220 [ 66.783543] __mutex_lock+0x71/0x9b0 [ 66.783575] mutex_lock_nested+0x1b/0x20 [ 66.783610] intel_rdt_online_cpu+0x3b/0x430 [ 66.783649] cpuhp_invoke_callback+0xab/0x8e0 [ 66.783687] cpuhp_thread_fun+0x7a/0x150 [ 66.783722] smpboot_thread_fn+0x1cc/0x270 [ 66.783764] kthread+0x16e/0x190 [ 66.783794] ret_from_fork+0x27/0x40 [ 66.783825] -> #2 (cpuhp_state){+.+.}: [ 66.783870] __lock_acquire+0x1293/0x13f0 [ 66.783906] lock_acquire+0xaf/0x220 [ 66.783938] cpuhp_issue_call+0x102/0x170 [ 66.783974] __cpuhp_setup_state_cpuslocked+0x154/0x2a0 [ 66.784023] __cpuhp_setup_state+0xc7/0x170 [ 66.784061] page_writeback_init+0x43/0x67 [ 66.784097] pagecache_init+0x43/0x4a [ 66.784131] start_kernel+0x3ad/0x3f7 [ 66.784165] x86_64_start_reservations+0x2a/0x2c [ 66.784204] x86_64_start_kernel+0x72/0x75 [ 66.784241] verify_cpu+0x0/0xfb [ 66.784270] -> #1 (cpuhp_state_mutex){+.+.}: [ 66.784319] __lock_acquire+0x1293/0x13f0 [ 66.784355] lock_acquire+0xaf/0x220 [ 66.784387] __mutex_lock+0x71/0x9b0 [ 66.784419] mutex_lock_nested+0x1b/0x20 [ 66.784454] __cpuhp_setup_state_cpuslocked+0x52/0x2a0 [ 66.784497] __cpuhp_setup_state+0xc7/0x170 [ 66.784535] page_alloc_init+0x28/0x30 [ 66.784569] start_kernel+0x148/0x3f7 [ 66.784602] x86_64_start_reservations+0x2a/0x2c [ 66.784642] x86_64_start_kernel+0x72/0x75 [ 66.784678] verify_cpu+0x0/0xfb [ 66.784707] -> #0 (cpu_hotplug_lock.rw_sem){++++}: [ 66.784759] check_prev_add+0x32f/0x6e0 [ 66.784794] __lock_acquire+0x1293/0x13f0 [ 66.784830] lock_acquire+0xaf/0x220 [ 66.784863] cpus_read_lock+0x3d/0xb0 [ 66.784896] rdt_kill_sb+0x215/0x390 [ 66.784930] deactivate_locked_super+0x3e/0x70 [ 66.784968] deactivate_super+0x40/0x60 [ 66.785003] cleanup_mnt+0x3f/0x80 [ 66.785034] __cleanup_mnt+0x12/0x20 [ 66.785070] task_work_run+0x8b/0xc0 [ 66.785103] exit_to_usermode_loop+0x94/0xa0 [ 66.786804] syscall_return_slowpath+0xe8/0x150 [ 66.788502] entry_SYSCALL_64_fastpath+0xab/0xad [ 66.790194] other info that might help us debug this: [ 66.795139] Chain exists of: cpu_hotplug_lock.rw_sem --> cpuhp_state --> rdtgroup_mutex [ 66.800035] Possible unsafe locking scenario: [ 66.803267] CPU0 CPU1 [ 66.804867] ---- ---- [ 66.806443] lock(rdtgroup_mutex); [ 66.808002] lock(cpuhp_state); [ 66.809565] lock(rdtgroup_mutex); [ 66.811110] lock(cpu_hotplug_lock.rw_sem); [ 66.812608] *** DEADLOCK *** [ 66.816983] 2 locks held by umount/336: [ 66.818418] #0: (&type->s_umount_key#35){+.+.}, at: [] deactivate_super+0x38/0x60 [ 66.819922] #1: (rdtgroup_mutex){+.+.}, at: [] rdt_kill_sb+0x36/0x390 When the resctrl filesystem is unmounted the locks should be obtained in the same order as was done when the cpus came online: cpu_hotplug_lock before rdtgroup_mutex. This also requires to switch the static_branch_disable() calls to the _cpulocked variant because now cpu hotplug lock is held already.
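Reduced to its essence, the fix is the classic lock-ordering rule: take cpu_hotplug_lock (via cpus_read_lock()) before the subsystem mutex, exactly as the hotplug callback path already does, and switch to the _cpuslocked static-key helpers once the hotplug lock is held. A hedged sketch of the pattern with illustrative names, not the actual resctrl code:

#include <linux/cpu.h>		/* cpus_read_lock()/cpus_read_unlock() */
#include <linux/jump_label.h>	/* static key helpers */
#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);
static DEFINE_STATIC_KEY_FALSE(example_key);

/* Sketch: a teardown path that nests its private mutex inside the CPU
 * hotplug lock, matching the order the online/offline callbacks use,
 * so lockdep only ever sees one consistent ordering.
 */
static void example_teardown(void)
{
	cpus_read_lock();		/* outer: cpu_hotplug_lock */
	mutex_lock(&example_mutex);	/* inner: subsystem mutex */

	/* hotplug lock already held, so use the _cpuslocked variant */
	static_branch_disable_cpuslocked(&example_key);

	mutex_unlock(&example_mutex);
	cpus_read_unlock();
}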
[ tglx: Switched to cpus_read_[un]lock ] Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Tested-by: Sai Praneeth Prakhya Acked-by: Vikas Shivappa Acked-by: Fenghua Yu Acked-by: Tony Luck Link: https://lkml.kernel.org/r/cc292e76be073f7260604651711c47b09fd0dc81.1508490116.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 8a61b20c7e51..8ce5d038c43b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1364,9 +1364,7 @@ static void rmdir_all_sub(void) kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - get_online_cpus(); update_closid_rmid(cpu_online_mask, &rdtgroup_default); - put_online_cpus(); kernfs_remove(kn_info); kernfs_remove(kn_mongrp); @@ -1377,6 +1375,7 @@ static void rdt_kill_sb(struct super_block *sb) { struct rdt_resource *r; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. */ @@ -1384,11 +1383,12 @@ static void rdt_kill_sb(struct super_block *sb) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); - static_branch_disable(&rdt_alloc_enable_key); - static_branch_disable(&rdt_mon_enable_key); - static_branch_disable(&rdt_enable_key); + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_disable_cpuslocked(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } static struct file_system_type rdt_fs_type = { From 87943db7dfb0c5ee5aa74a9ac06346fadd9695c8 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 20 Oct 2017 02:16:59 -0700 Subject: [PATCH 0514/1085] x86/intel_rdt: Fix potential deadlock during resctrl mount Sai reported a warning during some MBA tests: [ 236.755559] ====================================================== [ 236.762443] WARNING: possible circular locking dependency detected [ 236.769328] 4.14.0-rc4-yocto-standard #8 Not tainted [ 236.774857] ------------------------------------------------------ [ 236.781738] mount/10091 is trying to acquire lock: [ 236.787071] (cpu_hotplug_lock.rw_sem){++++}, at: [] static_key_enable+0x12/0x30 [ 236.797058] but task is already holding lock: [ 236.803552] (&type->s_umount_key#37/1){+.+.}, at: [] sget_userns+0x32f/0x520 [ 236.813247] which lock already depends on the new lock. 
[ 236.822353] the existing dependency chain (in reverse order) is: [ 236.830686] -> #4 (&type->s_umount_key#37/1){+.+.}: [ 236.837756] __lock_acquire+0x1100/0x11a0 [ 236.842799] lock_acquire+0xdf/0x1d0 [ 236.847363] down_write_nested+0x46/0x80 [ 236.852310] sget_userns+0x32f/0x520 [ 236.856873] kernfs_mount_ns+0x7e/0x1f0 [ 236.861728] rdt_mount+0x30c/0x440 [ 236.866096] mount_fs+0x38/0x150 [ 236.870262] vfs_kern_mount+0x67/0x150 [ 236.875015] do_mount+0x1df/0xd50 [ 236.879286] SyS_mount+0x95/0xe0 [ 236.883464] entry_SYSCALL_64_fastpath+0x18/0xad [ 236.889183] -> #3 (rdtgroup_mutex){+.+.}: [ 236.895292] __lock_acquire+0x1100/0x11a0 [ 236.900337] lock_acquire+0xdf/0x1d0 [ 236.904899] __mutex_lock+0x80/0x8f0 [ 236.909459] mutex_lock_nested+0x1b/0x20 [ 236.914407] intel_rdt_online_cpu+0x3b/0x4a0 [ 236.919745] cpuhp_invoke_callback+0xce/0xb80 [ 236.925177] cpuhp_thread_fun+0x1c5/0x230 [ 236.930222] smpboot_thread_fn+0x11a/0x1e0 [ 236.935362] kthread+0x152/0x190 [ 236.939536] ret_from_fork+0x27/0x40 [ 236.944097] -> #2 (cpuhp_state-up){+.+.}: [ 236.950199] __lock_acquire+0x1100/0x11a0 [ 236.955241] lock_acquire+0xdf/0x1d0 [ 236.959800] cpuhp_issue_call+0x12e/0x1c0 [ 236.964845] __cpuhp_setup_state_cpuslocked+0x13b/0x2f0 [ 236.971242] __cpuhp_setup_state+0xa7/0x120 [ 236.976483] page_writeback_init+0x43/0x67 [ 236.981623] pagecache_init+0x38/0x3b [ 236.986281] start_kernel+0x3c6/0x41a [ 236.990931] x86_64_start_reservations+0x2a/0x2c [ 236.996650] x86_64_start_kernel+0x72/0x75 [ 237.001793] verify_cpu+0x0/0xfb [ 237.005966] -> #1 (cpuhp_state_mutex){+.+.}: [ 237.012364] __lock_acquire+0x1100/0x11a0 [ 237.017408] lock_acquire+0xdf/0x1d0 [ 237.021969] __mutex_lock+0x80/0x8f0 [ 237.026527] mutex_lock_nested+0x1b/0x20 [ 237.031475] __cpuhp_setup_state_cpuslocked+0x54/0x2f0 [ 237.037777] __cpuhp_setup_state+0xa7/0x120 [ 237.043013] page_alloc_init+0x28/0x30 [ 237.047769] start_kernel+0x148/0x41a [ 237.052425] x86_64_start_reservations+0x2a/0x2c [ 237.058145] x86_64_start_kernel+0x72/0x75 [ 237.063284] verify_cpu+0x0/0xfb [ 237.067456] -> #0 (cpu_hotplug_lock.rw_sem){++++}: [ 237.074436] check_prev_add+0x401/0x800 [ 237.079286] __lock_acquire+0x1100/0x11a0 [ 237.084330] lock_acquire+0xdf/0x1d0 [ 237.088890] cpus_read_lock+0x42/0x90 [ 237.093548] static_key_enable+0x12/0x30 [ 237.098496] rdt_mount+0x406/0x440 [ 237.102862] mount_fs+0x38/0x150 [ 237.107035] vfs_kern_mount+0x67/0x150 [ 237.111787] do_mount+0x1df/0xd50 [ 237.116058] SyS_mount+0x95/0xe0 [ 237.120233] entry_SYSCALL_64_fastpath+0x18/0xad [ 237.125952] other info that might help us debug this: [ 237.134867] Chain exists of: cpu_hotplug_lock.rw_sem --> rdtgroup_mutex --> &type->s_umount_key#37/1 [ 237.148425] Possible unsafe locking scenario: [ 237.155015] CPU0 CPU1 [ 237.160057] ---- ---- [ 237.165100] lock(&type->s_umount_key#37/1); [ 237.169952] lock(rdtgroup_mutex); [ 237.176641] lock(&type->s_umount_key#37/1); [ 237.184287] lock(cpu_hotplug_lock.rw_sem); [ 237.189041] *** DEADLOCK *** When the resctrl filesystem is mounted the locks must be acquired in the same order as was done when the cpus came online: cpu_hotplug_lock before rdtgroup_mutex. This also requires to switch the static_branch_enable() calls to the _cpulocked variant because now cpu hotplug lock is held already. 
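The mount path is the mirror image of the unmount fix, with static_branch_enable_cpuslocked() taking the place of the disable calls (see the sketch after the previous patch). One related defensive idiom, sketched here with illustrative names: a helper that relies on the caller already holding the hotplug lock, which is the contract the _cpuslocked variants assume, can assert it so a caller that forgets cpus_read_lock() fails loudly under lockdep instead of deadlocking rarely:

#include <linux/cpu.h>		/* lockdep_assert_cpus_held() */
#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(example_key);

/* Sketch: make the "caller must hold cpu_hotplug_lock" contract of the
 * _cpuslocked helpers explicit and checkable during testing.
 */
static void example_enable_key(void)
{
	lockdep_assert_cpus_held();
	static_branch_enable_cpuslocked(&example_key);
}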
[ tglx: Switched to cpus_read_[un]lock ] Reported-by: Sai Praneeth Prakhya Signed-off-by: Reinette Chatre Tested-by: Sai Praneeth Prakhya Acked-by: Vikas Shivappa Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Link: https://lkml.kernel.org/r/9c41b91bc2f47d9e95b62b213ecdb45623c47a9f.1508490116.git.reinette.chatre@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 8ce5d038c43b..64c5ff97ee0d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1149,6 +1149,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, struct dentry *dentry; int ret; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* * resctrl file system can only be mounted once. @@ -1198,12 +1199,12 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_mondata; if (rdt_alloc_capable) - static_branch_enable(&rdt_alloc_enable_key); + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); if (rdt_mon_capable) - static_branch_enable(&rdt_mon_enable_key); + static_branch_enable_cpuslocked(&rdt_mon_enable_key); if (rdt_alloc_capable || rdt_mon_capable) - static_branch_enable(&rdt_enable_key); + static_branch_enable_cpuslocked(&rdt_enable_key); if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3]; @@ -1226,6 +1227,7 @@ out_cdp: out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return dentry; } From 21214b042d51b056b4eaa332b4cf426250d0e9e2 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Tue, 10 Oct 2017 11:25:06 +0200 Subject: [PATCH 0515/1085] s390/zcrypt: CEX6S exploitation This patch adds the full CEX6S card support to the zcrypt device driver. A CEX6A/C/P is detected and displayed as such, the card and queue device driver code is updated to recognize it and the relative weight values for CEX4, CEX5 and CEX6 have been updated. 
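The "relative weight values" are the per-request-type speed indices visible in the diff below, one entry per request type. For context, here is a generic sketch of how such a table can drive card selection; treating lower values as faster matches how these indices appear to be used, but both that reading and the code are illustrative assumptions, not the zcrypt dispatcher itself:

/* Sketch: pick the card whose speed rating for the requested function
 * type is lowest, i.e. fastest.  The eight slots correspond to the
 * request types listed in the patch below (MEX_1k ... RNG, SECKEY).
 */
struct example_card {
	int speed_rating[8];
};

static struct example_card *example_pick(struct example_card **cards,
					 int ncards, int func)
{
	struct example_card *best = NULL;
	int i;

	for (i = 0; i < ncards; i++) {
		if (!best ||
		    cards[i]->speed_rating[func] < best->speed_rating[func])
			best = cards[i];
	}
	return best;
}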
Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/ap_card.c | 3 -- drivers/s390/crypto/ap_queue.c | 3 -- drivers/s390/crypto/zcrypt_api.h | 1 + drivers/s390/crypto/zcrypt_cex4.c | 48 +++++++++++++++++++++++++------ 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c index 836efac96813..e8300630e197 100644 --- a/drivers/s390/crypto/ap_card.c +++ b/drivers/s390/crypto/ap_card.c @@ -182,9 +182,6 @@ struct ap_card *ap_card_create(int id, int queue_depth, int device_type, ac->ap_dev.device.release = ap_card_device_release; ac->ap_dev.device.type = &ap_card_type; ac->ap_dev.device_type = device_type; - /* CEX6 toleration: map to CEX5 */ - if (device_type == AP_DEVICE_TYPE_CEX6) - ac->ap_dev.device_type = AP_DEVICE_TYPE_CEX5; ac->raw_hwtype = device_type; ac->queue_depth = queue_depth; ac->functions = functions; diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 56b96edffd5b..d15903926f46 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -626,9 +626,6 @@ struct ap_queue *ap_queue_create(ap_qid_t qid, int device_type) aq->ap_dev.device.release = ap_queue_device_release; aq->ap_dev.device.type = &ap_queue_type; aq->ap_dev.device_type = device_type; - /* CEX6 toleration: map to CEX5 */ - if (device_type == AP_DEVICE_TYPE_CEX6) - aq->ap_dev.device_type = AP_DEVICE_TYPE_CEX5; aq->qid = qid; aq->state = AP_STATE_RESET_START; aq->interrupt = AP_INTR_DISABLED; diff --git a/drivers/s390/crypto/zcrypt_api.h b/drivers/s390/crypto/zcrypt_api.h index 6c94efd23eac..73541a798db7 100644 --- a/drivers/s390/crypto/zcrypt_api.h +++ b/drivers/s390/crypto/zcrypt_api.h @@ -76,6 +76,7 @@ struct ica_z90_status { #define ZCRYPT_CEX3A 8 #define ZCRYPT_CEX4 10 #define ZCRYPT_CEX5 11 +#define ZCRYPT_CEX6 12 /** * Large random numbers are pulled in 4096 byte chunks from the crypto cards diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c index 4e91163d70a6..e2eebc775a37 100644 --- a/drivers/s390/crypto/zcrypt_cex4.c +++ b/drivers/s390/crypto/zcrypt_cex4.c @@ -45,6 +45,8 @@ static struct ap_device_id zcrypt_cex4_card_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX5, .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX6, + .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, { /* end of list */ }, }; @@ -55,6 +57,8 @@ static struct ap_device_id zcrypt_cex4_queue_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX5, .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX6, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { /* end of list */ }, }; @@ -72,17 +76,25 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY */ static const int CEX4A_SPEED_IDX[] = { - 5, 6, 59, 20, 115, 581, 0, 0}; + 14, 19, 249, 42, 228, 1458, 0, 0}; static const int CEX5A_SPEED_IDX[] = { - 3, 3, 6, 8, 32, 218, 0, 0}; + 8, 9, 20, 18, 66, 458, 0, 0}; + static const int CEX6A_SPEED_IDX[] = { + 6, 9, 20, 17, 65, 438, 0, 0}; + static const int CEX4C_SPEED_IDX[] = { - 24, 25, 82, 41, 138, 1111, 79, 8}; + 59, 69, 308, 83, 278, 2204, 209, 40}; static const int CEX5C_SPEED_IDX[] = { - 10, 14, 23, 17, 45, 242, 63, 4}; + 24, 31, 50, 37, 90, 479, 27, 10}; + static const int CEX6C_SPEED_IDX[] = { + 16, 20, 32, 27, 77, 455, 23, 9}; + static const int 
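The actual change is the two-line condition in the diff below. Read as a predicate, it says: CPRBs whose two-byte function code is "US" or, new with CEX6, "AU" (the request that derives protected keys from secure keys) must carry the special bit in the AP message header. A stand-alone sketch of that test; the helper name is invented for illustration, the real change is open-coded in XCRB_msg_to_type6CPRB_msgX():

#include <linux/string.h>
#include <linux/types.h>

/* Sketch: both the "US" and the new "AU" sub-function codes require
 * ap_msg->special = 1; every other function code clears it.
 */
static bool example_fc_needs_special(const unsigned char *fc)
{
	return memcmp(fc, "US", 2) == 0 || memcmp(fc, "AU", 2) == 0;
}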
CEX4P_SPEED_IDX[] = { - 142, 198, 1852, 203, 331, 1563, 0, 8}; + 224, 313, 3560, 359, 605, 2827, 0, 50}; static const int CEX5P_SPEED_IDX[] = { - 49, 67, 131, 52, 85, 287, 0, 4}; + 63, 84, 156, 83, 142, 533, 0, 10}; + static const int CEX6P_SPEED_IDX[] = { + 55, 70, 121, 73, 129, 522, 0, 9}; struct ap_card *ac = to_ap_card(&ap_dev->device); struct zcrypt_card *zc; @@ -99,11 +111,16 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX4; memcpy(zc->speed_rating, CEX4A_SPEED_IDX, sizeof(CEX4A_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX5) { zc->type_string = "CEX5A"; zc->user_space_type = ZCRYPT_CEX5; memcpy(zc->speed_rating, CEX5A_SPEED_IDX, sizeof(CEX5A_SPEED_IDX)); + } else { + zc->type_string = "CEX6A"; + zc->user_space_type = ZCRYPT_CEX6; + memcpy(zc->speed_rating, CEX6A_SPEED_IDX, + sizeof(CEX6A_SPEED_IDX)); } zc->min_mod_size = CEX4A_MIN_MOD_SIZE; if (ap_test_bit(&ac->functions, AP_FUNC_MEX4K) && @@ -125,7 +142,7 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX3C; memcpy(zc->speed_rating, CEX4C_SPEED_IDX, sizeof(CEX4C_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX5) { zc->type_string = "CEX5C"; /* wrong user space type, must be CEX5 * just keep it for cca compatibility @@ -133,6 +150,14 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX3C; memcpy(zc->speed_rating, CEX5C_SPEED_IDX, sizeof(CEX5C_SPEED_IDX)); + } else { + zc->type_string = "CEX6C"; + /* wrong user space type, must be CEX6 + * just keep it for cca compatibility + */ + zc->user_space_type = ZCRYPT_CEX3C; + memcpy(zc->speed_rating, CEX6C_SPEED_IDX, + sizeof(CEX6C_SPEED_IDX)); } zc->min_mod_size = CEX4C_MIN_MOD_SIZE; zc->max_mod_size = CEX4C_MAX_MOD_SIZE; @@ -143,11 +168,16 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX4; memcpy(zc->speed_rating, CEX4P_SPEED_IDX, sizeof(CEX4P_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX5) { zc->type_string = "CEX5P"; zc->user_space_type = ZCRYPT_CEX5; memcpy(zc->speed_rating, CEX5P_SPEED_IDX, sizeof(CEX5P_SPEED_IDX)); + } else { + zc->type_string = "CEX6P"; + zc->user_space_type = ZCRYPT_CEX6; + memcpy(zc->speed_rating, CEX6P_SPEED_IDX, + sizeof(CEX6P_SPEED_IDX)); } zc->min_mod_size = CEX4C_MIN_MOD_SIZE; zc->max_mod_size = CEX4C_MAX_MOD_SIZE; From 0acb1665aa9929e5334bc7359ee3cecb6ece3604 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 6 Oct 2017 11:31:43 +0200 Subject: [PATCH 0516/1085] s390/zcrypt: Enable special header file flag for AU CPRP With the CEX6 there is a new CPRB (subfunction AU) used to generate protected keys from secure keys. This new CPRB needs to have the special flag set in the queue message header struct which is introduced with this fix. 
Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_msgtype6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index afd20cee7ea0..785620d30504 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -474,7 +474,8 @@ static int XCRB_msg_to_type6CPRB_msgX(struct ap_message *ap_msg, *fcode = (msg->hdr.function_code[0] << 8) | msg->hdr.function_code[1]; *dom = (unsigned short *)&msg->cprbx.domain; - if (memcmp(function_code, "US", 2) == 0) + if (memcmp(function_code, "US", 2) == 0 + || memcmp(function_code, "AU", 2) == 0) ap_msg->special = 1; else ap_msg->special = 0;
From 9a5641080bf433e195730e47a13de58dcd70f47f Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Mon, 16 Oct 2017 12:28:35 +0200 Subject: [PATCH 0517/1085] s390/zcrypt: Introduce QACT support for AP bus devices. This patch introduces a new ap_qact() function which exploits the PQAP(QACT) subfunction. QACT is a new interface to Query the AP Compatibility Type based on a given AP qid, type, mode and version. Based on this new function the AP bus scan code is slightly reworked to use this new interface for querying the compatible type for each new AP queue device detected. So new and unknown devices can get automatically mapped to a compatible type and handled without the need for toleration patches for every new hardware. The currently highest known hardware is CEX6S. With this patch a possible successor can get queried for a compatible type known by the device driver without the need for a toleration patch. Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/ap_asm.h | 43 ++++++++++++++++++++ drivers/s390/crypto/ap_bus.c | 74 +++++++++++++++++++++++++++++----- drivers/s390/crypto/ap_bus.h | 4 +- drivers/s390/crypto/ap_card.c | 9 +++-- drivers/s390/crypto/ap_queue.c | 1 + 5 files changed, 116 insertions(+), 15 deletions(-) diff --git a/drivers/s390/crypto/ap_asm.h b/drivers/s390/crypto/ap_asm.h index cd350345b3d2..0c0c02f26f5b 100644 --- a/drivers/s390/crypto/ap_asm.h +++ b/drivers/s390/crypto/ap_asm.h @@ -116,6 +116,49 @@ static inline int ap_qci(void *config) return reg1; } +/* + * struct ap_qact_ap_info - used together with the + * ap_aqic() function to provide a convenient way + * to handle the ap info needed by the qact function. + */ +struct ap_qact_ap_info { + unsigned int _res1 : 3; + unsigned int mode : 3; + unsigned int _res2 : 26; + unsigned int cat : 8; + unsigned int _res3 : 8; + unsigned char ver[2]; +}; + +/** + * ap_qact(): Query AP combatibility type. + * @qid: The AP queue number + * @apinfo: On input the info about the AP queue (content of GR1 + * according to the AR). On output the alternate AP queue + * info provided by the qact function in GR2 is stored in. + * + * Returns AP queue status. Check response_code field for failures.
+ */ +static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, + struct ap_qact_ap_info *apinfo) +{ + register unsigned long reg0 asm ("0") = qid | (5UL << 24) + | ((ifbit & 0x01) << 22); + register struct ap_qact_ap_info reg1_in asm ("1") = *apinfo; + register struct ap_queue_status reg1_out asm ("1"); + register unsigned long reg2_in asm ("2") = 0; + register struct ap_qact_ap_info reg2_out asm ("2"); + + asm volatile( + ".long 0xb2af0000" /* PQAP(QACT) */ + : "+d" (reg0), "+d" (reg1_in), "=d" (reg1_out), + "+d" (reg2_in), "=d" (reg2_out) + : + : "cc"); + *apinfo = reg2_out; + return reg1_out; +} + /** * ap_nqap(): Send message to adjunct processor queue. * @qid: The AP queue number diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 5f0be2040272..0c1c48c476b7 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -176,6 +176,18 @@ static int ap_apft_available(void) return test_facility(15); } +/* + * ap_qact_available(): Test if the PQAP(QACT) subfunction is available. + * + * Returns 1 if the QACT subfunction is available. + */ +static inline int ap_qact_available(void) +{ + if (ap_configuration) + return ap_configuration->qact; + return 0; +} + /** * ap_test_queue(): Test adjunct processor queue. * @qid: The AP queue number @@ -987,6 +999,47 @@ static int ap_select_domain(void) return -ENODEV; } +/* + * This function checks the type and returns either 0 for not + * supported or the highest compatible type value (which may + * include the input type value). + */ +static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) +{ + int comp_type = 0; + + /* < CEX2A is not supported */ + if (rawtype < AP_DEVICE_TYPE_CEX2A) + return 0; + /* up to CEX6 known and fully supported */ + if (rawtype <= AP_DEVICE_TYPE_CEX6) + return rawtype; + /* + * unknown new type > CEX6, check for compatibility + * to the highest known and supported type which is + * currently CEX6 with the help of the QACT function. 
+ */ + if (ap_qact_available()) { + struct ap_queue_status status; + struct ap_qact_ap_info apinfo = {0}; + + apinfo.mode = (func >> 26) & 0x07; + apinfo.cat = AP_DEVICE_TYPE_CEX6; + status = ap_qact(qid, 0, &apinfo); + if (status.response_code == AP_RESPONSE_NORMAL + && apinfo.cat >= AP_DEVICE_TYPE_CEX2A + && apinfo.cat <= AP_DEVICE_TYPE_CEX6) + comp_type = apinfo.cat; + } + if (!comp_type) + AP_DBF(DBF_WARN, "queue=%02x.%04x unable to map type %d\n", + AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype); + else if (comp_type != rawtype) + AP_DBF(DBF_INFO, "queue=%02x.%04x map type %d to %d\n", + AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype, comp_type); + return comp_type; +} + /* * helper function to be used with bus_find_dev * matches for the card device with the given id @@ -1014,8 +1067,8 @@ static void ap_scan_bus(struct work_struct *unused) struct ap_card *ac; struct device *dev; ap_qid_t qid; - int depth = 0, type = 0; - unsigned int functions = 0; + int comp_type, depth = 0, type = 0; + unsigned int func = 0; int rc, id, dom, borked, domains, defdomdevs = 0; AP_DBF(DBF_DEBUG, "ap_scan_bus running\n"); @@ -1066,12 +1119,12 @@ static void ap_scan_bus(struct work_struct *unused) } continue; } - rc = ap_query_queue(qid, &depth, &type, &functions); + rc = ap_query_queue(qid, &depth, &type, &func); if (dev) { spin_lock_bh(&aq->lock); if (rc == -ENODEV || /* adapter reconfiguration */ - (ac && ac->functions != functions)) + (ac && ac->functions != func)) aq->state = AP_STATE_BORKED; borked = aq->state == AP_STATE_BORKED; spin_unlock_bh(&aq->lock); @@ -1087,11 +1140,14 @@ static void ap_scan_bus(struct work_struct *unused) } if (rc) continue; - /* new queue device needed */ + /* a new queue device is needed, check out comp type */ + comp_type = ap_get_compatible_type(qid, type, func); + if (!comp_type) + continue; + /* maybe a card device needs to be created first */ if (!ac) { - /* but first create the card device */ - ac = ap_card_create(id, depth, - type, functions); + ac = ap_card_create(id, depth, type, + comp_type, func); if (!ac) continue; ac->ap_dev.device.bus = &ap_bus_type; @@ -1109,7 +1165,7 @@ static void ap_scan_bus(struct work_struct *unused) get_device(&ac->ap_dev.device); } /* now create the new queue device */ - aq = ap_queue_create(qid, type); + aq = ap_queue_create(qid, comp_type); if (!aq) continue; aq->card = ac; diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 754cf2223cfb..3a0e19d87e7c 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -250,8 +250,8 @@ void ap_queue_remove(struct ap_queue *aq); void ap_queue_suspend(struct ap_device *ap_dev); void ap_queue_resume(struct ap_device *ap_dev); -struct ap_card *ap_card_create(int id, int queue_depth, int device_type, - unsigned int device_functions); +struct ap_card *ap_card_create(int id, int queue_depth, int raw_device_type, + int comp_device_type, unsigned int functions); int ap_module_init(void); void ap_module_exit(void); diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c index e8300630e197..428623fa517e 100644 --- a/drivers/s390/crypto/ap_card.c +++ b/drivers/s390/crypto/ap_card.c @@ -170,19 +170,20 @@ static void ap_card_device_release(struct device *dev) kfree(ac); } -struct ap_card *ap_card_create(int id, int queue_depth, int device_type, - unsigned int functions) +struct ap_card *ap_card_create(int id, int queue_depth, int raw_type, + int comp_type, unsigned int functions) { struct ap_card *ac; ac = kzalloc(sizeof(*ac), GFP_KERNEL); if 
(!ac) return NULL; + INIT_LIST_HEAD(&ac->list); INIT_LIST_HEAD(&ac->queues); ac->ap_dev.device.release = ap_card_device_release; ac->ap_dev.device.type = &ap_card_type; - ac->ap_dev.device_type = device_type; - ac->raw_hwtype = device_type; + ac->ap_dev.device_type = comp_type; + ac->raw_hwtype = raw_type; ac->queue_depth = queue_depth; ac->functions = functions; ac->id = id; diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index d15903926f46..6f3fcc959796 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -630,6 +630,7 @@ struct ap_queue *ap_queue_create(ap_qid_t qid, int device_type) aq->state = AP_STATE_RESET_START; aq->interrupt = AP_INTR_DISABLED; spin_lock_init(&aq->lock); + INIT_LIST_HEAD(&aq->list); INIT_LIST_HEAD(&aq->pendingq); INIT_LIST_HEAD(&aq->requestq); setup_timer(&aq->timeout, ap_request_timeout, (unsigned long) aq); From 82c62fa0c49aa305104013cee4468772799bb391 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 20 Oct 2017 11:21:35 -0500 Subject: [PATCH 0518/1085] x86/asm: Don't use the confusing '.ifeq' directive I find the '.ifeq ' directive to be confusing. Reading it quickly seems to suggest its opposite meaning, or that it's missing an argument. Improve readability by replacing all of its x86 uses with '.if == 0'. Signed-off-by: Josh Poimboeuf Cc: Andrei Vagin Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/757da028e802c7e98d23fbab8d234b1063e161cf.1508516398.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/head_32.S | 2 +- arch/x86/kernel/head_64.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f6cdb7a1455e..846e84a1d1f7 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -817,7 +817,7 @@ ENTRY(\sym) ASM_CLAC - .ifeq \has_error_code + .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9ed3074d0d27..6e50f87765e5 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -401,7 +401,7 @@ ENTRY(early_idt_handler_array) # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 42e32c2e51bb..311db1a73c11 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -273,7 +273,7 @@ ENDPROC(start_cpu0) ENTRY(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 UNWIND_HINT_IRET_REGS pushq $0 # Dummy error code, to make stack frame uniform .else From 8776fe75dc0e263ed2056ea9896c2267599dc447 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Oct 2017 06:31:27 -0700 Subject: [PATCH 0519/1085] lkdtm, kprobes: Convert from jprobes to kprobes The jprobes subsystem is being removed, so convert to using kprobes instead. 
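For context on the API the conversion below targets: a kprobe pre-handler has the signature int (*)(struct kprobe *, struct pt_regs *), and returning 0 lets the probed instruction execute normally. A minimal, self-contained sketch of registering one; the probed symbol and the message are arbitrary illustrative choices, not taken from the patch:

#include <linux/kprobes.h>
#include <linux/module.h>

static int example_pre_handler(struct kprobe *kp, struct pt_regs *regs)
{
	pr_info("probe hit at %s\n", kp->symbol_name);
	return 0;	/* 0 = continue with the probed instruction */
}

static struct kprobe example_kp = {
	.symbol_name	= "do_sys_open",	/* illustrative target */
	.pre_handler	= example_pre_handler,
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Unlike a jprobe, the pre-handler receives pt_regs rather than mirroring the probed function's own signature, which is what lets the patch below replace eight per-symbol stubs with one handler.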
Signed-off-by: Kees Cook Acked-by: Masami Hiramatsu Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171020133127.GA18360@beast Signed-off-by: Ingo Molnar --- drivers/misc/lkdtm_core.c | 154 +++++++++++--------------------------- 1 file changed, 45 insertions(+), 109 deletions(-) diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c index 981b3ef71e47..ed7f0c61c59a 100644 --- a/drivers/misc/lkdtm_core.c +++ b/drivers/misc/lkdtm_core.c @@ -56,122 +56,54 @@ static ssize_t direct_entry(struct file *f, const char __user *user_buf, size_t count, loff_t *off); #ifdef CONFIG_KPROBES -static void lkdtm_handler(void); +static int lkdtm_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); static ssize_t lkdtm_debugfs_entry(struct file *f, const char __user *user_buf, size_t count, loff_t *off); - - -/* jprobe entry point handlers. */ -static unsigned int jp_do_irq(unsigned int irq) -{ - lkdtm_handler(); - jprobe_return(); - return 0; -} - -static irqreturn_t jp_handle_irq_event(unsigned int irq, - struct irqaction *action) -{ - lkdtm_handler(); - jprobe_return(); - return 0; -} - -static void jp_tasklet_action(struct softirq_action *a) -{ - lkdtm_handler(); - jprobe_return(); -} - -static void jp_ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) -{ - lkdtm_handler(); - jprobe_return(); -} - -struct scan_control; - -static unsigned long jp_shrink_inactive_list(unsigned long max_scan, - struct zone *zone, - struct scan_control *sc) -{ - lkdtm_handler(); - jprobe_return(); - return 0; -} - -static int jp_hrtimer_start(struct hrtimer *timer, ktime_t tim, - const enum hrtimer_mode mode) -{ - lkdtm_handler(); - jprobe_return(); - return 0; -} - -static int jp_scsi_dispatch_cmd(struct scsi_cmnd *cmd) -{ - lkdtm_handler(); - jprobe_return(); - return 0; -} - -# ifdef CONFIG_IDE -static int jp_generic_ide_ioctl(ide_drive_t *drive, struct file *file, - struct block_device *bdev, unsigned int cmd, - unsigned long arg) -{ - lkdtm_handler(); - jprobe_return(); - return 0; -} -# endif +# define CRASHPOINT_KPROBE(_symbol) \ + .kprobe = { \ + .symbol_name = (_symbol), \ + .pre_handler = lkdtm_kprobe_handler, \ + }, +# define CRASHPOINT_WRITE(_symbol) \ + (_symbol) ? lkdtm_debugfs_entry : direct_entry +#else +# define CRASHPOINT_KPROBE(_symbol) +# define CRASHPOINT_WRITE(_symbol) direct_entry #endif /* Crash points */ struct crashpoint { const char *name; const struct file_operations fops; - struct jprobe jprobe; + struct kprobe kprobe; }; -#define CRASHPOINT(_name, _write, _symbol, _entry) \ +#define CRASHPOINT(_name, _symbol) \ { \ .name = _name, \ .fops = { \ .read = lkdtm_debugfs_read, \ .llseek = generic_file_llseek, \ .open = lkdtm_debugfs_open, \ - .write = _write, \ - }, \ - .jprobe = { \ - .kp.symbol_name = _symbol, \ - .entry = (kprobe_opcode_t *)_entry, \ + .write = CRASHPOINT_WRITE(_symbol) \ }, \ + CRASHPOINT_KPROBE(_symbol) \ } /* Define the possible places where we can trigger a crash point. 
*/ -struct crashpoint crashpoints[] = { - CRASHPOINT("DIRECT", direct_entry, - NULL, NULL), +static struct crashpoint crashpoints[] = { + CRASHPOINT("DIRECT", NULL), #ifdef CONFIG_KPROBES - CRASHPOINT("INT_HARDWARE_ENTRY", lkdtm_debugfs_entry, - "do_IRQ", jp_do_irq), - CRASHPOINT("INT_HW_IRQ_EN", lkdtm_debugfs_entry, - "handle_IRQ_event", jp_handle_irq_event), - CRASHPOINT("INT_TASKLET_ENTRY", lkdtm_debugfs_entry, - "tasklet_action", jp_tasklet_action), - CRASHPOINT("FS_DEVRW", lkdtm_debugfs_entry, - "ll_rw_block", jp_ll_rw_block), - CRASHPOINT("MEM_SWAPOUT", lkdtm_debugfs_entry, - "shrink_inactive_list", jp_shrink_inactive_list), - CRASHPOINT("TIMERADD", lkdtm_debugfs_entry, - "hrtimer_start", jp_hrtimer_start), - CRASHPOINT("SCSI_DISPATCH_CMD", lkdtm_debugfs_entry, - "scsi_dispatch_cmd", jp_scsi_dispatch_cmd), + CRASHPOINT("INT_HARDWARE_ENTRY", "do_IRQ"), + CRASHPOINT("INT_HW_IRQ_EN", "handle_IRQ_event"), + CRASHPOINT("INT_TASKLET_ENTRY", "tasklet_action"), + CRASHPOINT("FS_DEVRW", "ll_rw_block"), + CRASHPOINT("MEM_SWAPOUT", "shrink_inactive_list"), + CRASHPOINT("TIMERADD", "hrtimer_start"), + CRASHPOINT("SCSI_DISPATCH_CMD", "scsi_dispatch_cmd"), # ifdef CONFIG_IDE - CRASHPOINT("IDE_CORE_CP", lkdtm_debugfs_entry, - "generic_ide_ioctl", jp_generic_ide_ioctl), + CRASHPOINT("IDE_CORE_CP", "generic_ide_ioctl"), # endif #endif }; @@ -254,8 +186,8 @@ struct crashtype crashtypes[] = { }; -/* Global jprobe entry and crashtype. */ -static struct jprobe *lkdtm_jprobe; +/* Global kprobe entry and crashtype. */ +static struct kprobe *lkdtm_kprobe; struct crashpoint *lkdtm_crashpoint; struct crashtype *lkdtm_crashtype; @@ -298,7 +230,8 @@ static struct crashtype *find_crashtype(const char *name) */ static noinline void lkdtm_do_action(struct crashtype *crashtype) { - BUG_ON(!crashtype || !crashtype->func); + if (WARN_ON(!crashtype || !crashtype->func)) + return; crashtype->func(); } @@ -308,22 +241,22 @@ static int lkdtm_register_cpoint(struct crashpoint *crashpoint, int ret; /* If this doesn't have a symbol, just call immediately. */ - if (!crashpoint->jprobe.kp.symbol_name) { + if (!crashpoint->kprobe.symbol_name) { lkdtm_do_action(crashtype); return 0; } - if (lkdtm_jprobe != NULL) - unregister_jprobe(lkdtm_jprobe); + if (lkdtm_kprobe != NULL) + unregister_kprobe(lkdtm_kprobe); lkdtm_crashpoint = crashpoint; lkdtm_crashtype = crashtype; - lkdtm_jprobe = &crashpoint->jprobe; - ret = register_jprobe(lkdtm_jprobe); + lkdtm_kprobe = &crashpoint->kprobe; + ret = register_kprobe(lkdtm_kprobe); if (ret < 0) { - pr_info("Couldn't register jprobe %s\n", - crashpoint->jprobe.kp.symbol_name); - lkdtm_jprobe = NULL; + pr_info("Couldn't register kprobe %s\n", + crashpoint->kprobe.symbol_name); + lkdtm_kprobe = NULL; lkdtm_crashpoint = NULL; lkdtm_crashtype = NULL; } @@ -336,13 +269,14 @@ static int lkdtm_register_cpoint(struct crashpoint *crashpoint, static int crash_count = DEFAULT_COUNT; static DEFINE_SPINLOCK(crash_count_lock); -/* Called by jprobe entry points. */ -static void lkdtm_handler(void) +/* Called by kprobe entry points. 
*/ +static int lkdtm_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) { unsigned long flags; bool do_it = false; - BUG_ON(!lkdtm_crashpoint || !lkdtm_crashtype); + if (WARN_ON(!lkdtm_crashpoint || !lkdtm_crashtype)) + return 0; spin_lock_irqsave(&crash_count_lock, flags); crash_count--; @@ -357,6 +291,8 @@ static void lkdtm_handler(void) if (do_it) lkdtm_do_action(lkdtm_crashtype); + + return 0; } static ssize_t lkdtm_debugfs_entry(struct file *f, @@ -556,8 +492,8 @@ static void __exit lkdtm_module_exit(void) /* Handle test-specific clean-up. */ lkdtm_usercopy_exit(); - if (lkdtm_jprobe != NULL) - unregister_jprobe(lkdtm_jprobe); + if (lkdtm_kprobe != NULL) + unregister_kprobe(lkdtm_kprobe); pr_info("Crash point unregistered\n"); } From 45653c845b0c670b7a194acc6a90bc70aa049252 Mon Sep 17 00:00:00 2001 From: sayli karnik Date: Sat, 9 Sep 2017 22:26:18 +0530 Subject: [PATCH 0520/1085] scripts: Add a script to find unused documentation Add a script that finds files with kernel-doc comments for imported functions that are not included anywhere in documentation. Signed-off-by: sayli karnik Signed-off-by: Jonathan Corbet --- scripts/find-unused-docs.sh | 62 +++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100755 scripts/find-unused-docs.sh diff --git a/scripts/find-unused-docs.sh b/scripts/find-unused-docs.sh new file mode 100755 index 000000000000..3f46f8977dc4 --- /dev/null +++ b/scripts/find-unused-docs.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# (c) 2017, Jonathan Corbet +# sayli karnik +# +# This script detects files with kernel-doc comments for exported functions +# that are not included in documentation. +# +# usage: Run 'scripts/find-unused-docs.sh directory' from top level of kernel +# tree. +# +# example: $scripts/find-unused-docs.sh drivers/scsi +# +# Licensed under the terms of the GNU GPL License + +if ! [ -d "Documentation" ]; then + echo "Run from top level of kernel tree" + exit 1 +fi + +if [ "$#" -ne 1 ]; then + echo "Usage: scripts/find-unused-docs.sh directory" + exit 1 +fi + +if ! [ -d "$1" ]; then + echo "Directory $1 doesn't exist" + exit 1 +fi + +cd "$( dirname "${BASH_SOURCE[0]}" )" +cd .. + +cd Documentation/ + +echo "The following files contain kerneldoc comments for exported functions \ +that are not used in the formatted documentation" + +# FILES INCLUDED + +files_included=($(grep -rHR ".. kernel-doc" --include \*.rst | cut -d " " -f 3)) + +declare -A FILES_INCLUDED + +for each in "${files_included[@]}"; do + FILES_INCLUDED[$each]="$each" + done + +cd .. 
+ +# FILES NOT INCLUDED + +for file in `find $1 -name '*.c'`; do + + if [[ ${FILES_INCLUDED[$file]+_} ]]; then + continue; + fi + str=$(scripts/kernel-doc -text -export "$file" 2>/dev/null) + if [[ -n "$str" ]]; then + echo "$file" + fi + done + From 008de6c69ccb9b62110188fb76b9e425cca686e4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 12:59:55 -0700 Subject: [PATCH 0521/1085] perf vendor events: Update JSON metrics for Broadwell Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/broadwell/bdw-metrics.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json index 49c5f123d811..7372c3207092 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,19 +49,19 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED ) ) / RS_EVENTS.EMPTY_END", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION:c1 + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, @@ -79,13 +79,13 @@ }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", - "MetricExpr": 
"L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)", "MetricGroup": "Memory_Bound;Memory_BW", "MetricName": "MLP" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION:c1 + DTLB_LOAD_MISSES.WALK_DURATION:c1 + DTLB_STORE_MISSES.WALK_DURATION:c1 + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, @@ -97,7 +97,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time", "MetricGroup": "FLOPS;Summary", "MetricName": "GFLOPs" }, From 663ad445640feeeffb3d405bd60a4b1e44019c56 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:00:07 -0700 Subject: [PATCH 0522/1085] perf vendor events: Update JSON metrics for Broadwell Server Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/broadwellx/bdx-metrics.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json index 4248b029d199..4d9a1175a5cc 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if 
#SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,19 +49,19 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED ) ) / RS_EVENTS.EMPTY_END", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION:c1 + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, @@ -79,13 +79,13 @@ }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)", "MetricGroup": "Memory_Bound;Memory_BW", "MetricName": "MLP" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED) ) / (2*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED) ) / (2*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles))", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, @@ -97,7 +97,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time", "MetricGroup": "FLOPS;Summary", "MetricName": "GFLOPs" }, From 
0fba08e2492027ce2c016172ba326c40c32a6ec6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:00:16 -0700 Subject: [PATCH 0523/1085] perf vendor events: Update JSON metrics for Haswell Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/haswell/hsw-metrics.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json index 03d894988f0f..5ab5c78fe580 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,19 +49,19 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / ( cpu@uops_executed.core\\,cmask\\=1@ / 2)) if #SMT_on else (UOPS_EXECUTED.CORE / cpu@uops_executed.core\\,cmask\\=1@)", + "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION ) ) / RS_EVENTS.EMPTY_END", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION )) ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, @@ -79,13 +79,13 @@ }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricExpr": "L1D_PEND_MISS.PENDING / (( 
cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)", "MetricGroup": "Memory_Bound;Memory_BW", "MetricName": "MLP" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, From 9cd6d864662dfe7a0c5c64fe37dc243c0aea7483 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:00:29 -0700 Subject: [PATCH 0524/1085] perf vendor events: Update JSON metrics for Haswell Server Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/haswellx/hsx-metrics.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json index 0bae5eda9fb3..5ab5c78fe580 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", + "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,19 +49,19 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / ( cpu@uops_executed.core\\,cmask\\=1@ / 2)) if #SMT_on else UOPS_EXECUTED.CORE / cpu@uops_executed.core\\,cmask\\=1@", + "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - 
"MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - ( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION ) ) / RS_EVENTS.EMPTY_END", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION )) ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, @@ -79,13 +79,13 @@ }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)", "MetricGroup": "Memory_Bound;Memory_BW", "MetricName": "MLP" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, From 1de315249415863dca167623909fb7951569cf8d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:00:37 -0700 Subject: [PATCH 0525/1085] perf vendor events: Update JSON metrics for IvyBridge Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/ivybridge/ivb-metrics.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json index 057a832f1a6a..7c2679514efb 100644 --- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,19 +49,19 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { 
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, @@ -79,13 +79,13 @@ }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)", "MetricGroup": "Memory_Bound;Memory_BW", "MetricName": "MLP" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, @@ -97,7 +97,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time", "MetricGroup": "FLOPS;Summary", "MetricName": "GFLOPs" }, From 7347bba5552f479d4292ffd008d18d41a965f021 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:00:44 -0700 Subject: [PATCH 0526/1085] perf vendor events: Update JSON metrics for IvyTown Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/ivytown/ivt-metrics.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json index 057a832f1a6a..7c2679514efb 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES 
) / 4) )", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,19 +49,19 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", + "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, @@ -79,13 +79,13 @@ }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)", "MetricGroup": "Memory_Bound;Memory_BW", "MetricName": "MLP" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, @@ -97,7 +97,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / 
duration_time", "MetricGroup": "FLOPS;Summary", "MetricName": "GFLOPs" }, From 984d91f4c62f64026cbfef51f609971025934cec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:00:51 -0700 Subject: [PATCH 0527/1085] perf vendor events: Update JSON metrics for JakeTown Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/jaketown/jkt-metrics.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json index b35b1c153c8a..2b18008cf482 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,13 +49,13 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_DISPATCHED.THREAD / ( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@", + "MetricExpr": "UOPS_DISPATCHED.THREAD / (( UOPS_DISPATCHED.CORE:c1 / 2) if #SMT_on else UOPS_DISPATCHED.CORE:c1)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, @@ -73,7 +73,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time", "MetricGroup": "FLOPS;Summary", "MetricName": "GFLOPs" }, From 41a13b74a0406955c488000aeca4021c0efdd243 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:01:01 -0700 Subject: [PATCH 0528/1085] perf vendor events: Update JSON metrics for Sandy Bridge 
Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/sandybridge/snb-metrics.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json index b35b1c153c8a..2b18008cf482 100644 --- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,13 +49,13 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_DISPATCHED.THREAD / ( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@", + "MetricExpr": "UOPS_DISPATCHED.THREAD / (( UOPS_DISPATCHED.CORE:c1 / 2) if #SMT_on else UOPS_DISPATCHED.CORE:c1)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, @@ -73,7 +73,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time", "MetricGroup": "FLOPS;Summary", "MetricName": "GFLOPs" }, From e3f2dadf76591a085e61db45736c3a4a9369b314 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:01:08 -0700 Subject: [PATCH 0529/1085] perf vendor events: Update JSON metrics for Skylake Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/skylake/skl-metrics.json | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git 
a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index 411f9411ec9e..36c903faed0b 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -13,7 +13,7 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ((UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, @@ -25,7 +25,7 @@ }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -37,7 +37,7 @@ }, { "BriefDescription": "Total issue-pipeline slots", - "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", "MetricName": "SLOTS" }, @@ -49,19 +49,19 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_EXECUTED.THREAD / ( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1", + "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE_16B.IFDATA_STALL - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE_16B.IFDATA_STALL - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, @@ -73,19 +73,19 @@ }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS_PS + MEM_LOAD_RETIRED.FB_HIT_PS )", "MetricGroup": "Memory_Bound;Memory_Lat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricExpr": "L1D_PEND_MISS.PENDING / (( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)", "MetricGroup": "Memory_Bound;Memory_BW", "MetricName": "MLP" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + 
DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles )", + "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles) )", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, @@ -97,7 +97,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time", + "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time", "MetricGroup": "FLOPS;Summary", "MetricName": "GFLOPs" }, From ead81ee4f887c2e6205dacb83ab2e24367a003c1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Sep 2017 13:02:28 -0700 Subject: [PATCH 0530/1085] perf vendor events: Update JSON metrics for Skylake Server Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/20170914200748.GA13837@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/skylakex/skx-metrics.json | 42 ++++++------------- 1 file changed, 12 insertions(+), 30 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 68f12a26c816..36c903faed0b 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -13,19 +13,19 @@ }, { "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions", - "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )", + "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ((UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )", "MetricGroup": "Frontend", "MetricName": "IFetch_Line_Utilization" }, { - "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )", "MetricGroup": "DSB; Frontend_Bandwidth", "MetricName": "DSB_Coverage" }, { "BriefDescription": "Cycles Per Instruction (threaded)", - "MetricExpr": "1 / INST_RETIRED.ANY / cycles", + "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)", "MetricGroup": "Pipeline;Summary", "MetricName": "CPI" }, @@ -36,8 +36,8 @@ "MetricName": "CLKS" }, { - "BriefDescription": "Total issue-pipeline slots (per-core)", - "MetricExpr": "4*cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )", + "BriefDescription": "Total issue-pipeline slots", + "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "TopDownL1", 
"MetricName": "SLOTS" }, @@ -49,25 +49,25 @@ }, { "BriefDescription": "Instructions Per Cycle (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )", + "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", "MetricGroup": "SMT", "MetricName": "CoreIPC" }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_EXECUTED.THREAD / ( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1", + "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "( RS_EVENTS.EMPTY_CYCLES - (ICACHE_16B.IFDATA_STALL +2* ICACHE_16B.IFDATA_STALL:c1:e1) - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE_16B.IFDATA_STALL - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, { "BriefDescription": "Core actual clocks when any thread is active on the physical core", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if 1 else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )", + "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -79,34 +79,16 @@ }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES", + "MetricExpr": "L1D_PEND_MISS.PENDING / (( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)", "MetricGroup": "Memory_Bound;Memory_BW", "MetricName": "MLP" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) )", + "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles) )", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "L1 cache miss per kilo instruction for demand loads", - "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS_PS / INST_RETIRED.ANY", - "MetricGroup": "Cache_Misses;", - "MetricName": "L1MPKI" - }, - { - "BriefDescription": "L2 cache miss per kilo instruction for demand loads", - "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS_PS / INST_RETIRED.ANY", - "MetricGroup": "Cache_Misses;", - "MetricName": "L2MPKI" - }, - { - "BriefDescription": "L3 cache miss per kilo instruction for demand loads", - "MetricExpr": "1000 * 
MEM_LOAD_RETIRED.L3_MISS_PS / INST_RETIRED.ANY", - "MetricGroup": "Cache_Misses;", - "MetricName": "L3MPKI" - }, { "BriefDescription": "Average CPU Utilization", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", @@ -115,7 +97,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16* FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1000000000 / duration_time", + "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time", "MetricGroup": "FLOPS;Summary", "MetricName": "GFLOPs" }, From 1695849735752d2ace22d8c424ba579e33df691c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 6 Oct 2017 10:31:47 -0300 Subject: [PATCH 0531/1085] perf mmap: Move perf_mmap and methods to separate mmap.[ch] files To better organize the sources, and we may end up even using it directly, without evlists and evsels. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-oiqrm7grflurnnzo2ovfnslg@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/evlist.c | 248 ---------------------------- tools/perf/util/evlist.h | 76 +-------- tools/perf/util/mmap.c | 252 +++++++++++++++++++++++++++++ tools/perf/util/mmap.h | 94 +++++++++++ tools/perf/util/python-ext-sources | 1 + 6 files changed, 349 insertions(+), 323 deletions(-) create mode 100644 tools/perf/util/mmap.c create mode 100644 tools/perf/util/mmap.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 369c3163e68c..a3de7916fe63 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -13,6 +13,7 @@ libperf-y += find_bit.o libperf-y += kallsyms.o libperf-y += levenshtein.o libperf-y += llvm-utils.o +libperf-y += mmap.o libperf-y += memswap.o libperf-y += parse-events.o libperf-y += perf_regs.o diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6a0d7ffbeba0..c6c891e154a6 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -33,9 +33,6 @@ #include #include -static void perf_mmap__munmap(struct perf_mmap *map); -static void perf_mmap__put(struct perf_mmap *map); - #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) #define SID(e, x, y) xyarray__entry(e->sample_id, x, y) @@ -704,129 +701,6 @@ static int perf_evlist__resume(struct perf_evlist *evlist) return perf_evlist__set_paused(evlist, false); } -/* When check_messup is true, 'end' must points to a good entry */ -static union perf_event * -perf_mmap__read(struct perf_mmap *md, bool check_messup, u64 start, - u64 end, u64 *prev) -{ - unsigned char *data = md->base + page_size; - union perf_event *event = NULL; - int diff = end - start; - - if (check_messup) { - /* - * If we're further behind than half the buffer, there's a chance - * the writer will bite our tail and mess up the samples under us. - * - * If we somehow ended up ahead of the 'end', we got messed up. - * - * In either case, truncate and restart at 'end'. 
- */ - if (diff > md->mask / 2 || diff < 0) { - fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); - - /* - * 'end' points to a known good entry, start there. - */ - start = end; - diff = 0; - } - } - - if (diff >= (int)sizeof(event->header)) { - size_t size; - - event = (union perf_event *)&data[start & md->mask]; - size = event->header.size; - - if (size < sizeof(event->header) || diff < (int)size) { - event = NULL; - goto broken_event; - } - - /* - * Event straddles the mmap boundary -- header should always - * be inside due to u64 alignment of output. - */ - if ((start & md->mask) + size != ((start + size) & md->mask)) { - unsigned int offset = start; - unsigned int len = min(sizeof(*event), size), cpy; - void *dst = md->event_copy; - - do { - cpy = min(md->mask + 1 - (offset & md->mask), len); - memcpy(dst, &data[offset & md->mask], cpy); - offset += cpy; - dst += cpy; - len -= cpy; - } while (len); - - event = (union perf_event *) md->event_copy; - } - - start += size; - } - -broken_event: - if (prev) - *prev = start; - - return event; -} - -union perf_event *perf_mmap__read_forward(struct perf_mmap *md, bool check_messup) -{ - u64 head; - u64 old = md->prev; - - /* - * Check if event was unmapped due to a POLLHUP/POLLERR. - */ - if (!refcount_read(&md->refcnt)) - return NULL; - - head = perf_mmap__read_head(md); - - return perf_mmap__read(md, check_messup, old, head, &md->prev); -} - -union perf_event * -perf_mmap__read_backward(struct perf_mmap *md) -{ - u64 head, end; - u64 start = md->prev; - - /* - * Check if event was unmapped due to a POLLHUP/POLLERR. - */ - if (!refcount_read(&md->refcnt)) - return NULL; - - head = perf_mmap__read_head(md); - if (!head) - return NULL; - - /* - * 'head' pointer starts from 0. Kernel minus sizeof(record) form - * it each time when kernel writes to it, so in fact 'head' is - * negative. 'end' pointer is made manually by adding the size of - * the ring buffer to 'head' pointer, means the validate data can - * read is the whole ring buffer. If 'end' is positive, the ring - * buffer has not fully filled, so we must adjust 'end' to 0. - * - * However, since both 'head' and 'end' is unsigned, we can't - * simply compare 'end' against 0. Here we compare '-head' and - * the size of the ring buffer, where -head is the number of bytes - * kernel write to the ring buffer. 
- */ - if (-head < (u64)(md->mask + 1)) - end = 0; - else - end = head + md->mask + 1; - - return perf_mmap__read(md, false, start, end, &md->prev); -} - union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int idx) { struct perf_mmap *md = &evlist->mmap[idx]; @@ -857,96 +731,16 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) return perf_evlist__mmap_read_forward(evlist, idx); } -void perf_mmap__read_catchup(struct perf_mmap *md) -{ - u64 head; - - if (!refcount_read(&md->refcnt)) - return; - - head = perf_mmap__read_head(md); - md->prev = head; -} - void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx) { perf_mmap__read_catchup(&evlist->mmap[idx]); } -static bool perf_mmap__empty(struct perf_mmap *md) -{ - return perf_mmap__read_head(md) == md->prev && !md->auxtrace_mmap.base; -} - -static void perf_mmap__get(struct perf_mmap *map) -{ - refcount_inc(&map->refcnt); -} - -static void perf_mmap__put(struct perf_mmap *md) -{ - BUG_ON(md->base && refcount_read(&md->refcnt) == 0); - - if (refcount_dec_and_test(&md->refcnt)) - perf_mmap__munmap(md); -} - -void perf_mmap__consume(struct perf_mmap *md, bool overwrite) -{ - if (!overwrite) { - u64 old = md->prev; - - perf_mmap__write_tail(md, old); - } - - if (refcount_read(&md->refcnt) == 1 && perf_mmap__empty(md)) - perf_mmap__put(md); -} - void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx) { perf_mmap__consume(&evlist->mmap[idx], evlist->overwrite); } -int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused, - struct auxtrace_mmap_params *mp __maybe_unused, - void *userpg __maybe_unused, - int fd __maybe_unused) -{ - return 0; -} - -void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused) -{ -} - -void __weak auxtrace_mmap_params__init( - struct auxtrace_mmap_params *mp __maybe_unused, - off_t auxtrace_offset __maybe_unused, - unsigned int auxtrace_pages __maybe_unused, - bool auxtrace_overwrite __maybe_unused) -{ -} - -void __weak auxtrace_mmap_params__set_idx( - struct auxtrace_mmap_params *mp __maybe_unused, - struct perf_evlist *evlist __maybe_unused, - int idx __maybe_unused, - bool per_cpu __maybe_unused) -{ -} - -static void perf_mmap__munmap(struct perf_mmap *map) -{ - if (map->base != NULL) { - munmap(map->base, perf_mmap__mmap_len(map)); - map->base = NULL; - map->fd = -1; - refcount_set(&map->refcnt, 0); - } - auxtrace_mmap__munmap(&map->auxtrace_mmap); -} - static void perf_evlist__munmap_nofree(struct perf_evlist *evlist) { int i; @@ -995,48 +789,6 @@ static struct perf_mmap *perf_evlist__alloc_mmap(struct perf_evlist *evlist) return map; } -struct mmap_params { - int prot; - int mask; - struct auxtrace_mmap_params auxtrace_mp; -}; - -static int perf_mmap__mmap(struct perf_mmap *map, - struct mmap_params *mp, int fd) -{ - /* - * The last one will be done at perf_evlist__mmap_consume(), so that we - * make sure we don't prevent tools from consuming every last event in - * the ring buffer. - * - * I.e. we can get the POLLHUP meaning that the fd doesn't exist - * anymore, but the last events for it are still in the ring buffer, - * waiting to be consumed. - * - * Tools can chose to ignore this at their own discretion, but the - * evlist layer can't just drop it when filtering events in - * perf_evlist__filter_pollfd(). 
- */ - refcount_set(&map->refcnt, 2); - map->prev = 0; - map->mask = mp->mask; - map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, - MAP_SHARED, fd, 0); - if (map->base == MAP_FAILED) { - pr_debug2("failed to mmap perf event ring buffer, error %d\n", - errno); - map->base = NULL; - return -1; - } - map->fd = fd; - - if (auxtrace_mmap__mmap(&map->auxtrace_mmap, - &mp->auxtrace_mp, map->base, fd)) - return -1; - - return 0; -} - static bool perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused, struct perf_evsel *evsel) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index b1c14f1fdc27..8c433e95bd9a 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -11,8 +11,8 @@ #include "../perf.h" #include "event.h" #include "evsel.h" +#include "mmap.h" #include "util.h" -#include "auxtrace.h" #include #include @@ -24,55 +24,6 @@ struct record_opts; #define PERF_EVLIST__HLIST_BITS 8 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) -/** - * struct perf_mmap - perf's ring buffer mmap details - * - * @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this - */ -struct perf_mmap { - void *base; - int mask; - int fd; - refcount_t refcnt; - u64 prev; - struct auxtrace_mmap auxtrace_mmap; - char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); -}; - -static inline size_t -perf_mmap__mmap_len(struct perf_mmap *map) -{ - return map->mask + 1 + page_size; -} - -/* - * State machine of bkw_mmap_state: - * - * .________________(forbid)_____________. - * | V - * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY - * ^ ^ | ^ | - * | |__(forbid)____/ |___(forbid)___/| - * | | - * \_________________(3)_______________/ - * - * NOTREADY : Backward ring buffers are not ready - * RUNNING : Backward ring buffers are recording - * DATA_PENDING : We are required to collect data from backward ring buffers - * EMPTY : We have collected data from backward ring buffers. 
- * - * (0): Setup backward ring buffer - * (1): Pause ring buffers for reading - * (2): Read from ring buffers - * (3): Resume ring buffers for recording - */ -enum bkw_mmap_state { - BKW_MMAP_NOTREADY, - BKW_MMAP_RUNNING, - BKW_MMAP_DATA_PENDING, - BKW_MMAP_EMPTY, -}; - struct perf_evlist { struct list_head entries; struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; @@ -177,12 +128,6 @@ struct perf_sample_id *perf_evlist__id2sid(struct perf_evlist *evlist, u64 id); void perf_evlist__toggle_bkw_mmap(struct perf_evlist *evlist, enum bkw_mmap_state state); -union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup); -union perf_event *perf_mmap__read_backward(struct perf_mmap *map); - -void perf_mmap__read_catchup(struct perf_mmap *md); -void perf_mmap__consume(struct perf_mmap *md, bool overwrite); - union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx); union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, @@ -286,25 +231,6 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp); int perf_evlist__strerror_open(struct perf_evlist *evlist, int err, char *buf, size_t size); int perf_evlist__strerror_mmap(struct perf_evlist *evlist, int err, char *buf, size_t size); -static inline u64 perf_mmap__read_head(struct perf_mmap *mm) -{ - struct perf_event_mmap_page *pc = mm->base; - u64 head = ACCESS_ONCE(pc->data_head); - rmb(); - return head; -} - -static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) -{ - struct perf_event_mmap_page *pc = md->base; - - /* - * ensure all reads are done before we write the tail out. - */ - mb(); - pc->data_tail = tail; -} - bool perf_evlist__can_select_event(struct perf_evlist *evlist, const char *str); void perf_evlist__to_front(struct perf_evlist *evlist, struct perf_evsel *move_evsel); diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c new file mode 100644 index 000000000000..dfc4a007f2c6 --- /dev/null +++ b/tools/perf/util/mmap.c @@ -0,0 +1,252 @@ +/* + * Copyright (C) 2011-2017, Red Hat Inc, Arnaldo Carvalho de Melo + * + * Parts came from evlist.c builtin-{top,stat,record}.c, see those files for further + * copyright notes. + * + * Released under the GPL v2. (and only v2, not any later version) + */ + +#include +#include "event.h" +#include "mmap.h" +#include "util.h" /* page_size */ + +size_t perf_mmap__mmap_len(struct perf_mmap *map) +{ + return map->mask + 1 + page_size; +} + +/* When check_messup is true, 'end' must points to a good entry */ +static union perf_event *perf_mmap__read(struct perf_mmap *map, bool check_messup, + u64 start, u64 end, u64 *prev) +{ + unsigned char *data = map->base + page_size; + union perf_event *event = NULL; + int diff = end - start; + + if (check_messup) { + /* + * If we're further behind than half the buffer, there's a chance + * the writer will bite our tail and mess up the samples under us. + * + * If we somehow ended up ahead of the 'end', we got messed up. + * + * In either case, truncate and restart at 'end'. + */ + if (diff > map->mask / 2 || diff < 0) { + fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); + + /* + * 'end' points to a known good entry, start there. 
+ */ + start = end; + diff = 0; + } + } + + if (diff >= (int)sizeof(event->header)) { + size_t size; + + event = (union perf_event *)&data[start & map->mask]; + size = event->header.size; + + if (size < sizeof(event->header) || diff < (int)size) { + event = NULL; + goto broken_event; + } + + /* + * Event straddles the mmap boundary -- header should always + * be inside due to u64 alignment of output. + */ + if ((start & map->mask) + size != ((start + size) & map->mask)) { + unsigned int offset = start; + unsigned int len = min(sizeof(*event), size), cpy; + void *dst = map->event_copy; + + do { + cpy = min(map->mask + 1 - (offset & map->mask), len); + memcpy(dst, &data[offset & map->mask], cpy); + offset += cpy; + dst += cpy; + len -= cpy; + } while (len); + + event = (union perf_event *)map->event_copy; + } + + start += size; + } + +broken_event: + if (prev) + *prev = start; + + return event; +} + +union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup) +{ + u64 head; + u64 old = map->prev; + + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!refcount_read(&map->refcnt)) + return NULL; + + head = perf_mmap__read_head(map); + + return perf_mmap__read(map, check_messup, old, head, &map->prev); +} + +union perf_event *perf_mmap__read_backward(struct perf_mmap *map) +{ + u64 head, end; + u64 start = map->prev; + + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!refcount_read(&map->refcnt)) + return NULL; + + head = perf_mmap__read_head(map); + if (!head) + return NULL; + + /* + * 'head' pointer starts from 0. Kernel minus sizeof(record) form + * it each time when kernel writes to it, so in fact 'head' is + * negative. 'end' pointer is made manually by adding the size of + * the ring buffer to 'head' pointer, means the validate data can + * read is the whole ring buffer. If 'end' is positive, the ring + * buffer has not fully filled, so we must adjust 'end' to 0. + * + * However, since both 'head' and 'end' is unsigned, we can't + * simply compare 'end' against 0. Here we compare '-head' and + * the size of the ring buffer, where -head is the number of bytes + * kernel write to the ring buffer. 
+ */ + if (-head < (u64)(map->mask + 1)) + end = 0; + else + end = head + map->mask + 1; + + return perf_mmap__read(map, false, start, end, &map->prev); +} + +void perf_mmap__read_catchup(struct perf_mmap *map) +{ + u64 head; + + if (!refcount_read(&map->refcnt)) + return; + + head = perf_mmap__read_head(map); + map->prev = head; +} + +static bool perf_mmap__empty(struct perf_mmap *map) +{ + return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base; +} + +void perf_mmap__get(struct perf_mmap *map) +{ + refcount_inc(&map->refcnt); +} + +void perf_mmap__put(struct perf_mmap *map) +{ + BUG_ON(map->base && refcount_read(&map->refcnt) == 0); + + if (refcount_dec_and_test(&map->refcnt)) + perf_mmap__munmap(map); +} + +void perf_mmap__consume(struct perf_mmap *map, bool overwrite) +{ + if (!overwrite) { + u64 old = map->prev; + + perf_mmap__write_tail(map, old); + } + + if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map)) + perf_mmap__put(map); +} + +int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused, + struct auxtrace_mmap_params *mp __maybe_unused, + void *userpg __maybe_unused, + int fd __maybe_unused) +{ + return 0; +} + +void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused) +{ +} + +void __weak auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp __maybe_unused, + off_t auxtrace_offset __maybe_unused, + unsigned int auxtrace_pages __maybe_unused, + bool auxtrace_overwrite __maybe_unused) +{ +} + +void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __maybe_unused, + struct perf_evlist *evlist __maybe_unused, + int idx __maybe_unused, + bool per_cpu __maybe_unused) +{ +} + +void perf_mmap__munmap(struct perf_mmap *map) +{ + if (map->base != NULL) { + munmap(map->base, perf_mmap__mmap_len(map)); + map->base = NULL; + map->fd = -1; + refcount_set(&map->refcnt, 0); + } + auxtrace_mmap__munmap(&map->auxtrace_mmap); +} + +int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd) +{ + /* + * The last one will be done at perf_evlist__mmap_consume(), so that we + * make sure we don't prevent tools from consuming every last event in + * the ring buffer. + * + * I.e. we can get the POLLHUP meaning that the fd doesn't exist + * anymore, but the last events for it are still in the ring buffer, + * waiting to be consumed. + * + * Tools can chose to ignore this at their own discretion, but the + * evlist layer can't just drop it when filtering events in + * perf_evlist__filter_pollfd(). + */ + refcount_set(&map->refcnt, 2); + map->prev = 0; + map->mask = mp->mask; + map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, + MAP_SHARED, fd, 0); + if (map->base == MAP_FAILED) { + pr_debug2("failed to mmap perf event ring buffer, error %d\n", + errno); + map->base = NULL; + return -1; + } + map->fd = fd; + + if (auxtrace_mmap__mmap(&map->auxtrace_mmap, + &mp->auxtrace_mp, map->base, fd)) + return -1; + + return 0; +} diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h new file mode 100644 index 000000000000..f37ff45c8ec1 --- /dev/null +++ b/tools/perf/util/mmap.h @@ -0,0 +1,94 @@ +#ifndef __PERF_MMAP_H +#define __PERF_MMAP_H 1 + +#include +#include +#include +#include +#include +#include "auxtrace.h" +#include "event.h" + +/** + * struct perf_mmap - perf's ring buffer mmap details + * + * @refcnt - e.g. 
code using PERF_EVENT_IOC_SET_OUTPUT to share this + */ +struct perf_mmap { + void *base; + int mask; + int fd; + refcount_t refcnt; + u64 prev; + struct auxtrace_mmap auxtrace_mmap; + char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); +}; + +/* + * State machine of bkw_mmap_state: + * + * .________________(forbid)_____________. + * | V + * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY + * ^ ^ | ^ | + * | |__(forbid)____/ |___(forbid)___/| + * | | + * \_________________(3)_______________/ + * + * NOTREADY : Backward ring buffers are not ready + * RUNNING : Backward ring buffers are recording + * DATA_PENDING : We are required to collect data from backward ring buffers + * EMPTY : We have collected data from backward ring buffers. + * + * (0): Setup backward ring buffer + * (1): Pause ring buffers for reading + * (2): Read from ring buffers + * (3): Resume ring buffers for recording + */ +enum bkw_mmap_state { + BKW_MMAP_NOTREADY, + BKW_MMAP_RUNNING, + BKW_MMAP_DATA_PENDING, + BKW_MMAP_EMPTY, +}; + +struct mmap_params { + int prot, mask; + struct auxtrace_mmap_params auxtrace_mp; +}; + +int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd); +void perf_mmap__munmap(struct perf_mmap *map); + +void perf_mmap__get(struct perf_mmap *map); +void perf_mmap__put(struct perf_mmap *map); + +void perf_mmap__consume(struct perf_mmap *map, bool overwrite); + +void perf_mmap__read_catchup(struct perf_mmap *md); + +static inline u64 perf_mmap__read_head(struct perf_mmap *mm) +{ + struct perf_event_mmap_page *pc = mm->base; + u64 head = ACCESS_ONCE(pc->data_head); + rmb(); + return head; +} + +static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) +{ + struct perf_event_mmap_page *pc = md->base; + + /* + * ensure all reads are done before we write the tail out. + */ + mb(); + pc->data_tail = tail; +} + +union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup); +union perf_event *perf_mmap__read_backward(struct perf_mmap *map); + +size_t perf_mmap__mmap_len(struct perf_mmap *map); + +#endif /*__PERF_MMAP_H */ diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index e66dc495809a..b4f2f06722a7 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -10,6 +10,7 @@ util/ctype.c util/evlist.c util/evsel.c util/cpumap.c +util/mmap.c util/namespaces.c ../lib/bitmap.c ../lib/find_bit.c From d37f1586d0023443e2e64937769bbb1bc078866a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Oct 2017 16:39:55 -0300 Subject: [PATCH 0532/1085] perf record: Make record__mmap_read generic It becomes a perf_mmap method, "push", that reads from a mmap and "pushes" the data to a consumer, which in the initial case, for 'perf record', just writes it to the perf.data file descriptor, but may also be used by 'top', etc.
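To make the "push" contract concrete before the diff, here is a minimal standalone sketch -- not code from the patch; struct ring, drain() and file_push() are invented stand-ins -- reproducing the two-chunk drain shape of perf_mmap__push() with a caller-supplied callback:

#include <stdio.h>
#include <string.h>

/* Toy stand-in for the ring state kept in struct perf_mmap. */
struct ring {
	unsigned char data[16];	/* power-of-two sized buffer */
	unsigned int mask;	/* size - 1, as in md->mask */
	unsigned long prev;	/* last position already consumed */
	unsigned long head;	/* write position published by the producer */
};

static int drain(struct ring *r, void *to,
		 int push(void *to, void *buf, size_t size))
{
	unsigned long start = r->prev, end = r->head;
	size_t size = end - start;

	if (!size)
		return 0;

	/* Wrapped data goes out as two contiguous chunks, as in perf_mmap__push(). */
	if ((start & r->mask) + size != (end & r->mask)) {
		size_t first = r->mask + 1 - (start & r->mask);

		if (push(to, &r->data[start & r->mask], first) < 0)
			return -1;
		start += first;
	}

	if (push(to, &r->data[start & r->mask], end - start) < 0)
		return -1;

	r->prev = end;
	return 0;
}

/* A record__pushfn-style consumer: 'to' carries the output FILE. */
static int file_push(void *to, void *buf, size_t size)
{
	return fwrite(buf, 1, size, to) == size ? 0 : -1;
}

int main(void)
{
	struct ring r = { .mask = 15 };

	memcpy(r.data, "0123456789abcdef", 16);
	r.prev = 12;	/* 12 bytes already consumed... */
	r.head = 20;	/* ...and 8 more written, wrapping the ring */

	return drain(&r, stdout, file_push) ? 1 : 0;	/* prints "cdef0123" */
}

The point of this shape is that the mmap layer keeps all of the wrap-around arithmetic while consumers only ever see contiguous buffers, which is what lets 'perf top' and friends reuse the same drain loop later.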
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-u4l1qjbi6l76r2k0nv99220n@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 234fdf4734f6..47a761133aed 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -176,9 +176,16 @@ rb_find_range(void *data, int mask, u64 head, u64 old, return backward_rb_find_range(data, mask, head, start, end); } -static int -record__mmap_read(struct record *rec, struct perf_mmap *md, - bool overwrite, bool backward) +static int record__pushfn(void *to, void *bf, size_t size) +{ + struct record *rec = to; + + rec->samples++; + return record__write(rec, bf, size); +} + +static int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward, + void *to, int push(void *to, void *buf, size_t size)) { u64 head = perf_mmap__read_head(md); u64 old = md->prev; @@ -195,8 +202,6 @@ record__mmap_read(struct record *rec, struct perf_mmap *md, if (start == end) return 0; - rec->samples++; - size = end - start; if (size > (unsigned long)(md->mask) + 1) { WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); @@ -211,7 +216,7 @@ record__mmap_read(struct record *rec, struct perf_mmap *md, size = md->mask + 1 - (start & md->mask); start += size; - if (record__write(rec, buf, size) < 0) { + if (push(to, buf, size) < 0) { rc = -1; goto out; } @@ -221,7 +226,7 @@ record__mmap_read(struct record *rec, struct perf_mmap *md, size = end - start; start += size; - if (record__write(rec, buf, size) < 0) { + if (push(to, buf, size) < 0) { rc = -1; goto out; } @@ -576,8 +581,7 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap; if (maps[i].base) { - if (record__mmap_read(rec, &maps[i], - evlist->overwrite, backward) != 0) { + if (perf_mmap__push(&maps[i], evlist->overwrite, backward, rec, record__pushfn) != 0) { rc = -1; goto out; } From 73c17d815000e425aea108226bcb57491a04f534 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 6 Oct 2017 10:46:01 -0300 Subject: [PATCH 0533/1085] perf mmap: Adopt push method from builtin-record.c The previous prep patch was just to show exactly what changed in that function; now it's time to move that method and the things only it uses to the right place, mmap.[ch] Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-aaxywfgw3d44x6xlu8zm1avu@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 100 ------------------------------------ tools/perf/util/mmap.c | 100 ++++++++++++++++++++++++++++++++++++ tools/perf/util/mmap.h | 3 ++ 3 files changed, 103 insertions(+), 100 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 47a761133aed..a6cbf1640269 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -129,53 +129,6 @@ static int process_synthesized_event(struct perf_tool *tool, return record__write(rec, event, event->header.size); } -static int -backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end) -{ - struct perf_event_header *pheader; - u64 evt_head = head; - int size = mask + 1; - - pr_debug2("backward_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head); - pheader = (struct perf_event_header
*)(buf + (head & mask)); - *start = head; - while (true) { - if (evt_head - head >= (unsigned int)size) { - pr_debug("Finished reading backward ring buffer: rewind\n"); - if (evt_head - head > (unsigned int)size) - evt_head -= pheader->size; - *end = evt_head; - return 0; - } - - pheader = (struct perf_event_header *)(buf + (evt_head & mask)); - - if (pheader->size == 0) { - pr_debug("Finished reading backward ring buffer: get start\n"); - *end = evt_head; - return 0; - } - - evt_head += pheader->size; - pr_debug3("move evt_head: %"PRIx64"\n", evt_head); - } - WARN_ONCE(1, "Shouldn't get here\n"); - return -1; -} - -static int -rb_find_range(void *data, int mask, u64 head, u64 old, - u64 *start, u64 *end, bool backward) -{ - if (!backward) { - *start = old; - *end = head; - return 0; - } - - return backward_rb_find_range(data, mask, head, start, end); -} - static int record__pushfn(void *to, void *bf, size_t size) { struct record *rec = to; @@ -184,59 +137,6 @@ static int record__pushfn(void *to, void *bf, size_t size) return record__write(rec, bf, size); } -static int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward, - void *to, int push(void *to, void *buf, size_t size)) -{ - u64 head = perf_mmap__read_head(md); - u64 old = md->prev; - u64 end = head, start = old; - unsigned char *data = md->base + page_size; - unsigned long size; - void *buf; - int rc = 0; - - if (rb_find_range(data, md->mask, head, - old, &start, &end, backward)) - return -1; - - if (start == end) - return 0; - - size = end - start; - if (size > (unsigned long)(md->mask) + 1) { - WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); - - md->prev = head; - perf_mmap__consume(md, overwrite || backward); - return 0; - } - - if ((start & md->mask) + size != (end & md->mask)) { - buf = &data[start & md->mask]; - size = md->mask + 1 - (start & md->mask); - start += size; - - if (push(to, buf, size) < 0) { - rc = -1; - goto out; - } - } - - buf = &data[start & md->mask]; - size = end - start; - start += size; - - if (push(to, buf, size) < 0) { - rc = -1; - goto out; - } - - md->prev = head; - perf_mmap__consume(md, overwrite || backward); -out: - return rc; -} - static volatile int done; static volatile int signr = -1; static volatile int child_finished; diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index dfc4a007f2c6..9fe5f9c7d577 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -8,6 +8,9 @@ */ #include +#include +#include +#include "debug.h" #include "event.h" #include "mmap.h" #include "util.h" /* page_size */ @@ -250,3 +253,100 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd) return 0; } + +static int backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end) +{ + struct perf_event_header *pheader; + u64 evt_head = head; + int size = mask + 1; + + pr_debug2("backward_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head); + pheader = (struct perf_event_header *)(buf + (head & mask)); + *start = head; + while (true) { + if (evt_head - head >= (unsigned int)size) { + pr_debug("Finished reading backward ring buffer: rewind\n"); + if (evt_head - head > (unsigned int)size) + evt_head -= pheader->size; + *end = evt_head; + return 0; + } + + pheader = (struct perf_event_header *)(buf + (evt_head & mask)); + + if (pheader->size == 0) { + pr_debug("Finished reading backward ring buffer: get start\n"); + *end = evt_head; + return 0; + } + + evt_head += pheader->size; + pr_debug3("move evt_head: %"PRIx64"\n", evt_head); + 
} + WARN_ONCE(1, "Shouldn't get here\n"); + return -1; +} + +static int rb_find_range(void *data, int mask, u64 head, u64 old, + u64 *start, u64 *end, bool backward) +{ + if (!backward) { + *start = old; + *end = head; + return 0; + } + + return backward_rb_find_range(data, mask, head, start, end); +} + +int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward, + void *to, int push(void *to, void *buf, size_t size)) +{ + u64 head = perf_mmap__read_head(md); + u64 old = md->prev; + u64 end = head, start = old; + unsigned char *data = md->base + page_size; + unsigned long size; + void *buf; + int rc = 0; + + if (rb_find_range(data, md->mask, head, old, &start, &end, backward)) + return -1; + + if (start == end) + return 0; + + size = end - start; + if (size > (unsigned long)(md->mask) + 1) { + WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); + + md->prev = head; + perf_mmap__consume(md, overwrite || backward); + return 0; + } + + if ((start & md->mask) + size != (end & md->mask)) { + buf = &data[start & md->mask]; + size = md->mask + 1 - (start & md->mask); + start += size; + + if (push(to, buf, size) < 0) { + rc = -1; + goto out; + } + } + + buf = &data[start & md->mask]; + size = end - start; + start += size; + + if (push(to, buf, size) < 0) { + rc = -1; + goto out; + } + + md->prev = head; + perf_mmap__consume(md, overwrite || backward); +out: + return rc; +} diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index f37ff45c8ec1..efd78b827b05 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -89,6 +89,9 @@ static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup); union perf_event *perf_mmap__read_backward(struct perf_mmap *map); +int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward, + void *to, int push(void *to, void *buf, size_t size)); + size_t perf_mmap__mmap_len(struct perf_mmap *map); #endif /*__PERF_MMAP_H */ From 692f5a22cd284bb8233a38e3ed86881d2d9c89d4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 9 Oct 2017 15:07:12 +0200 Subject: [PATCH 0534/1085] perf tests attr: Make hw events optional Otherwise we fail on virtual machines with no support for specific HW events. 
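As background for why optional=1 is needed, the sketch below -- standalone, not part of the patch; the messages and the choice of event are illustrative assumptions -- probes one of the affected events with perf_event_open(2). On a guest without a virtualized PMU the syscall itself fails, so an attr test that demands the event can only skip it, not fix it:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

/* Thin wrapper; glibc provides no perf_event_open() symbol. */
static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;		/* type=0 in the test files */
	attr.config = PERF_COUNT_HW_CPU_CYCLES;	/* config=0 */

	fd = perf_event_open(&attr, 0, -1, -1, 0);	/* this process, any CPU */
	if (fd < 0) {
		printf("cycles not countable (%s): event must be optional\n",
		       strerror(errno));
		return 0;
	}
	close(fd);
	printf("cycles countable\n");
	return 0;
}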
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171009130712.14747-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr/test-stat-C0 | 1 + tools/perf/tests/attr/test-stat-basic | 1 + tools/perf/tests/attr/test-stat-default | 4 ++++ tools/perf/tests/attr/test-stat-detailed-1 | 8 ++++++++ tools/perf/tests/attr/test-stat-detailed-2 | 13 +++++++++++++ tools/perf/tests/attr/test-stat-detailed-3 | 13 +++++++++++++ tools/perf/tests/attr/test-stat-no-inherit | 1 + 7 files changed, 41 insertions(+) diff --git a/tools/perf/tests/attr/test-stat-C0 b/tools/perf/tests/attr/test-stat-C0 index 67717fe6a65d..a2c76d10b2bb 100644 --- a/tools/perf/tests/attr/test-stat-C0 +++ b/tools/perf/tests/attr/test-stat-C0 @@ -7,3 +7,4 @@ ret = 1 # events are disabled by default when attached to cpu disabled=1 enable_on_exec=0 +optional=1 diff --git a/tools/perf/tests/attr/test-stat-basic b/tools/perf/tests/attr/test-stat-basic index 74e17881f2ba..69867d049fda 100644 --- a/tools/perf/tests/attr/test-stat-basic +++ b/tools/perf/tests/attr/test-stat-basic @@ -4,3 +4,4 @@ args = -e cycles kill >/dev/null 2>&1 ret = 1 [event:base-stat] +optional=1 diff --git a/tools/perf/tests/attr/test-stat-default b/tools/perf/tests/attr/test-stat-default index e911dbd4eb47..d9e99b3f77e6 100644 --- a/tools/perf/tests/attr/test-stat-default +++ b/tools/perf/tests/attr/test-stat-default @@ -32,6 +32,7 @@ config=2 fd=5 type=0 config=0 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_FRONTEND [event6:base-stat] @@ -52,15 +53,18 @@ optional=1 fd=8 type=0 config=1 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_INSTRUCTIONS [event9:base-stat] fd=9 type=0 config=4 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_MISSES [event10:base-stat] fd=10 type=0 config=5 +optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-1 b/tools/perf/tests/attr/test-stat-detailed-1 index b39270a08e74..8b04a055d154 100644 --- a/tools/perf/tests/attr/test-stat-detailed-1 +++ b/tools/perf/tests/attr/test-stat-detailed-1 @@ -33,6 +33,7 @@ config=2 fd=5 type=0 config=0 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_FRONTEND [event6:base-stat] @@ -53,18 +54,21 @@ optional=1 fd=8 type=0 config=1 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_INSTRUCTIONS [event9:base-stat] fd=9 type=0 config=4 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_MISSES [event10:base-stat] fd=10 type=0 config=5 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | @@ -74,6 +78,7 @@ config=5 fd=11 type=3 config=0 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | @@ -83,6 +88,7 @@ config=0 fd=12 type=3 config=65536 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_LL << 0 | @@ -92,6 +98,7 @@ config=65536 fd=13 type=3 config=2 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_LL << 0 | @@ -101,3 +108,4 @@ config=2 fd=14 type=3 config=65538 +optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-2 b/tools/perf/tests/attr/test-stat-detailed-2 index 45f8e6ea34f8..4fca9f1bfbf8 100644 --- a/tools/perf/tests/attr/test-stat-detailed-2 +++ b/tools/perf/tests/attr/test-stat-detailed-2 @@ -33,6 +33,7 @@ config=2 fd=5 type=0 config=0 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_FRONTEND [event6:base-stat] @@ -53,18 +54,21 @@ optional=1 fd=8 type=0 config=1 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_INSTRUCTIONS [event9:base-stat] fd=9 type=0 config=4 
+optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_MISSES [event10:base-stat] fd=10 type=0 config=5 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | @@ -74,6 +78,7 @@ config=5 fd=11 type=3 config=0 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | @@ -83,6 +88,7 @@ config=0 fd=12 type=3 config=65536 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_LL << 0 | @@ -92,6 +98,7 @@ config=65536 fd=13 type=3 config=2 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_LL << 0 | @@ -101,6 +108,7 @@ config=2 fd=14 type=3 config=65538 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_L1I << 0 | @@ -120,6 +128,7 @@ optional=1 fd=16 type=3 config=65537 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_DTLB << 0 | @@ -129,6 +138,7 @@ config=65537 fd=17 type=3 config=3 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_DTLB << 0 | @@ -138,6 +148,7 @@ config=3 fd=18 type=3 config=65539 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_ITLB << 0 | @@ -147,6 +158,7 @@ config=65539 fd=19 type=3 config=4 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_ITLB << 0 | @@ -156,3 +168,4 @@ config=4 fd=20 type=3 config=65540 +optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-3 b/tools/perf/tests/attr/test-stat-detailed-3 index 30ae0fb7a3fd..4bb58e1c82a6 100644 --- a/tools/perf/tests/attr/test-stat-detailed-3 +++ b/tools/perf/tests/attr/test-stat-detailed-3 @@ -33,6 +33,7 @@ config=2 fd=5 type=0 config=0 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_FRONTEND [event6:base-stat] @@ -53,18 +54,21 @@ optional=1 fd=8 type=0 config=1 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_INSTRUCTIONS [event9:base-stat] fd=9 type=0 config=4 +optional=1 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_MISSES [event10:base-stat] fd=10 type=0 config=5 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | @@ -74,6 +78,7 @@ config=5 fd=11 type=3 config=0 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | @@ -83,6 +88,7 @@ config=0 fd=12 type=3 config=65536 +optional=1 # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_LL << 0 | @@ -92,6 +98,7 @@ config=65536 fd=13 type=3 config=2 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_LL << 0 | @@ -101,6 +108,7 @@ config=2 fd=14 type=3 config=65538 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_L1I << 0 | @@ -120,6 +128,7 @@ optional=1 fd=16 type=3 config=65537 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_DTLB << 0 | @@ -129,6 +138,7 @@ config=65537 fd=17 type=3 config=3 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_DTLB << 0 | @@ -138,6 +148,7 @@ config=3 fd=18 type=3 config=65539 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_ITLB << 0 | @@ -147,6 +158,7 @@ config=65539 fd=19 type=3 config=4 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_ITLB << 0 | @@ -156,6 +168,7 @@ config=4 fd=20 type=3 config=65540 +optional=1 # PERF_TYPE_HW_CACHE, # PERF_COUNT_HW_CACHE_L1D << 0 | diff --git a/tools/perf/tests/attr/test-stat-no-inherit b/tools/perf/tests/attr/test-stat-no-inherit index d54b2a1e3e28..924fbb9300d1 100644 --- a/tools/perf/tests/attr/test-stat-no-inherit +++ b/tools/perf/tests/attr/test-stat-no-inherit @@ -5,3 +5,4 @@ ret = 1 [event:base-stat] inherit=0 +optional=1 From 98ad761bd3989a71be2b100002be923466617394 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 10 Oct 2017 15:43:22 -0700 Subject: [PATCH 0535/1085] perf list: Fix group description in the man page Fix an incorrect description 
in the 'perf list' manpage. When a group does not fit into the hardware it is partially scheduled, but does not error out. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20171010224322.15861-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 24679aed90b7..e2a897ae3596 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -204,7 +204,7 @@ For example Intel Core CPUs typically have four generic performance counters for the core, plus three fixed counters for instructions, cycles and ref-cycles. Some special events have restrictions on which counter they can schedule, and may not support multiple instances in a single group. -When too many events are specified in the group none of them will not +When too many events are specified in the group some of them will not be measured. Globally pinned events can limit the number of counters available for From 696e2457e9fd285034cd30cd8c93ece5e6cfe35a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 11 Oct 2017 17:01:24 +0200 Subject: [PATCH 0536/1085] perf annotate: Remove arch::cpuid_parse callback There's no need for extra cpuid_parse arch callback, it can be handled directly in init callback. Adding the init function to x86 to cover the cpuid initialization. Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171011150158.11895-2-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/annotate/instructions.c | 3 ++- tools/perf/arch/arm64/annotate/instructions.c | 3 ++- tools/perf/arch/powerpc/annotate/instructions.c | 4 +++- tools/perf/arch/s390/annotate/instructions.c | 4 +++- tools/perf/arch/x86/annotate/instructions.c | 14 ++++++++++++++ tools/perf/util/annotate.c | 10 +++------- 6 files changed, 27 insertions(+), 11 deletions(-) diff --git a/tools/perf/arch/arm/annotate/instructions.c b/tools/perf/arch/arm/annotate/instructions.c index 1ce0872b1726..6dfec7c23696 100644 --- a/tools/perf/arch/arm/annotate/instructions.c +++ b/tools/perf/arch/arm/annotate/instructions.c @@ -1,3 +1,4 @@ +#include #include #include @@ -23,7 +24,7 @@ static struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const c return ops; } -static int arm__annotate_init(struct arch *arch) +static int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused) { struct arm_annotate *arm; int err; diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 8f1908756cb6..a2c32be4132a 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -1,3 +1,4 @@ +#include #include #include @@ -25,7 +26,7 @@ static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const return ops; } -static int arm64__annotate_init(struct arch *arch) +static int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) { struct arm64_annotate *arm; int err; diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index 3c4004db81b9..b6b0ef5952d0 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -1,3 +1,5 @@ +#include + static struct ins_ops 
*powerpc__associate_instruction_ops(struct arch *arch, const char *name) { int i; @@ -46,7 +48,7 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con return ops; } -static int powerpc__annotate_init(struct arch *arch) +static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) { if (!arch->initialized) { arch->initialized = true; diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index 745b4b1b8b21..b8676ccbed76 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -1,3 +1,5 @@ +#include + static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name) { struct ins_ops *ops = NULL; @@ -19,7 +21,7 @@ static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *na return ops; } -static int s390__annotate_init(struct arch *arch) +static int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused) { if (!arch->initialized) { arch->initialized = true; diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index d84b72063a30..563cd4564041 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -122,3 +122,17 @@ static int x86__cpuid_parse(struct arch *arch, char *cpuid) return -1; } + +static int x86__annotate_init(struct arch *arch, char *cpuid) +{ + int err = 0; + + if (arch->initialized) + return 0; + + if (cpuid) + err = x86__cpuid_parse(arch, cpuid); + + arch->initialized = true; + return err; +} diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 4397a8b6e6cd..08164162c345 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -49,10 +49,9 @@ struct arch { void *priv; unsigned int model; unsigned int family; - int (*init)(struct arch *arch); + int (*init)(struct arch *arch, char *cpuid); bool (*ins_is_fused)(struct arch *arch, const char *ins1, const char *ins2); - int (*cpuid_parse)(struct arch *arch, char *cpuid); struct { char comment_char; char skip_functions_char; @@ -132,10 +131,10 @@ static struct arch architectures[] = { }, { .name = "x86", + .init = x86__annotate_init, .instructions = x86__instructions, .nr_instructions = ARRAY_SIZE(x86__instructions), .ins_is_fused = x86__ins_is_fused, - .cpuid_parse = x86__cpuid_parse, .objdump = { .comment_char = '#', }, @@ -1447,16 +1446,13 @@ int symbol__disassemble(struct symbol *sym, struct map *map, *parch = arch; if (arch->init) { - err = arch->init(arch); + err = arch->init(arch, cpuid); if (err) { pr_err("%s: failed to initialize %s arch priv area\n", __func__, arch->name); return err; } } - if (arch->cpuid_parse && cpuid) - arch->cpuid_parse(arch, cpuid); - pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__, symfs_filename, sym->name, map->unmap_ip(map, sym->start), map->unmap_ip(map, sym->end)); From d7e05ceaa93417f6f0077444eb111f64df823d25 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 16 Oct 2017 15:53:08 -0300 Subject: [PATCH 0537/1085] perf tools: Do not check ABI headers in a detached tarball build When we use one of: [acme@jouet linux]$ make help | grep perf perf-tar-src-pkg - Build perf-4.14.0-rc3.tar source tarball perf-targz-src-pkg - Build perf-4.14.0-rc3.tar.gz source tarball perf-tarbz2-src-pkg - Build perf-4.14.0-rc3.tar.bz2 source tarball perf-tarxz-src-pkg - Build perf-4.14.0-rc3.tar.xz source tarball [acme@jouet 
linux]$ I.e. when we create a detached tarball to build perf outside the enveloping kernel sources (from a kernel tarball or a checked out linux.git directory) we by definition can't check for differences among the tools/{include,arch}, etc. files we originally copied from the kernel, so bail out in that case, to avoid warnings when doing the detached builds. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-vbrga0mhplv7niwxr3ghjyxv@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/check-headers.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 932fda54b8a6..322629423b49 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -57,6 +57,11 @@ check () { } +# Check if we have the kernel headers (tools/perf/../../include), else +# we're probably on a detached tarball, so no point in trying to check +# differences. +test -d ../../include || exit 0 + # simple diff check for i in $HEADERS; do check $i -B From 7958e541495d7f99cefef2300b2c388865626c2e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 16:31:56 -0700 Subject: [PATCH 0538/1085] perf vendor events: Fix incorrect cmask syntax for some Intel metrics Some of the metrics use an incorrect syntax for specifying the cmask for an event. Convert to perf syntax so that they can be resolved. Fixes metrics on Broadwell, SandyBridge. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/n/tip-3k3fkfj8obek9dkmryyrqzhu@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json | 4 ++-- tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json | 2 +- tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json | 2 +- tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json index 7372c3207092..00bfdb5c5acb 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json @@ -61,7 +61,7 @@ }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION:c1 + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - (( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\,cmask\=1@ + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, @@ -85,7 +85,7 @@ }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION:c1 + DTLB_LOAD_MISSES.WALK_DURATION:c1 + DTLB_STORE_MISSES.WALK_DURATION:c1 + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)", + "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\,cmask\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\,cmask\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\,cmask\=1@ + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2
) if #SMT_on else cycles)", "MetricGroup": "TLB", "MetricName": "Page_Walks_Utilization" }, diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json index 4d9a1175a5cc..5a7f1ec24200 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -61,7 +61,7 @@ }, { "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)", - "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION:c1 + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)", + "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL - (( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)", "MetricGroup": "Unknown_Branches", "MetricName": "BAClear_Cost" }, diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json index 2b18008cf482..fd7d7c438226 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json @@ -55,7 +55,7 @@ }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_DISPATCHED.THREAD / (( UOPS_DISPATCHED.CORE:c1 / 2) if #SMT_on else UOPS_DISPATCHED.CORE:c1)", + "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json index 2b18008cf482..fd7d7c438226 100644 --- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json @@ -55,7 +55,7 @@ }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", - "MetricExpr": "UOPS_DISPATCHED.THREAD / (( UOPS_DISPATCHED.CORE:c1 / 2) if #SMT_on else UOPS_DISPATCHED.CORE:c1)", + "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", "MetricGroup": "Pipeline;Ports_Utilization", "MetricName": "ILP" }, From 923d0c9ae570c3f33a0b5a9517c23ccc816d9b75 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 17 Oct 2017 10:35:00 -0300 Subject: [PATCH 0539/1085] perf tools: Introduce binary__fprintf() Out of print_binary() but receiving a fp pointer and expecting that the printer be a fprintf like function, i.e. receive a FILE pointer and return the number of characters printed. 
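A reduced sketch of the resulting contract -- the three-op enum and all names here are simplifications, not the real binary_printer_ops -- in which every callback invocation returns the character count it wrote to the given FILE pointer and the driver sums them, fprintf-style:

#include <stdio.h>

enum op { OP_ADDR, OP_BYTE, OP_LINE_END };

typedef int (*printer_t)(enum op op, unsigned int val, void *extra, FILE *fp);

static int hex_printer(enum op op, unsigned int val, void *extra, FILE *fp)
{
	(void)extra;	/* consumer context, unused in this printer */
	switch (op) {
	case OP_ADDR:		return fprintf(fp, "%04x:", val);
	case OP_BYTE:		return fprintf(fp, " %02x", val);
	case OP_LINE_END:	return fprintf(fp, "\n");
	}
	return 0;
}

/* Driver in the style of binary__fprintf(): accumulate what the ops print. */
static int dump(unsigned char *data, size_t len, size_t per_line,
		printer_t printer, void *extra, FILE *fp)
{
	int printed = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		if (i % per_line == 0)
			printed += printer(OP_ADDR, i, extra, fp);
		printed += printer(OP_BYTE, data[i], extra, fp);
		if (i % per_line == per_line - 1 || i == len - 1)
			printed += printer(OP_LINE_END, 0, extra, fp);
	}
	return printed;	/* total characters written, like an fprintf */
}

int main(void)
{
	unsigned char buf[] = "perf";

	return dump(buf, sizeof(buf), 8, hex_printer, NULL, stdout) < 0;
}

Making the return value the printed width is what lets callers keep a running "printed" total with no extra bookkeeping, as the converted functions in the next patch do.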
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Cc: yuzhoujian Link: http://lkml.kernel.org/n/tip-6oqnxr6lmgqe6q6p3iugnscx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 32 +++++++++++++++++--------------- tools/perf/builtin-trace.c | 14 +++++++------- tools/perf/util/debug.c | 31 +++++++++++++++++-------------- tools/perf/util/print_binary.c | 30 ++++++++++++++++-------------- tools/perf/util/print_binary.h | 18 +++++++++++++----- 5 files changed, 70 insertions(+), 55 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 7167df215a42..5a7dfc5691a1 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1164,40 +1164,40 @@ struct printer_data { bool is_printable; }; -static void -print_sample_bpf_output_printer(enum binary_printer_ops op, - unsigned int val, - void *extra) +static int sample__fprintf_bpf_output(enum binary_printer_ops op, + unsigned int val, + void *extra, FILE *fp) { unsigned char ch = (unsigned char)val; struct printer_data *printer_data = extra; + int printed = 0; switch (op) { case BINARY_PRINT_DATA_BEGIN: - printf("\n"); + printed += fprintf(fp, "\n"); break; case BINARY_PRINT_LINE_BEGIN: - printf("%17s", !printer_data->line_no ? "BPF output:" : + printed += fprintf(fp, "%17s", !printer_data->line_no ? "BPF output:" : " "); break; case BINARY_PRINT_ADDR: - printf(" %04x:", val); + printed += fprintf(fp, " %04x:", val); break; case BINARY_PRINT_NUM_DATA: - printf(" %02x", val); + printed += fprintf(fp, " %02x", val); break; case BINARY_PRINT_NUM_PAD: - printf(" "); + printed += fprintf(fp, " "); break; case BINARY_PRINT_SEP: - printf(" "); + printed += fprintf(fp, " "); break; case BINARY_PRINT_CHAR_DATA: if (printer_data->hit_nul && ch) printer_data->is_printable = false; if (!isprint(ch)) { - printf("%c", '.'); + printed += fprintf(fp, "%c", '.'); if (!printer_data->is_printable) break; @@ -1207,20 +1207,22 @@ print_sample_bpf_output_printer(enum binary_printer_ops op, else printer_data->is_printable = false; } else { - printf("%c", ch); + printed += fprintf(fp, "%c", ch); } break; case BINARY_PRINT_CHAR_PAD: - printf(" "); + printed += fprintf(fp, " "); break; case BINARY_PRINT_LINE_END: - printf("\n"); + printed += fprintf(fp, "\n"); printer_data->line_no++; break; case BINARY_PRINT_DATA_END: default: break; } + + return printed; } static void print_sample_bpf_output(struct perf_sample *sample) @@ -1229,7 +1231,7 @@ static void print_sample_bpf_output(struct perf_sample *sample) struct printer_data printer_data = {0, false, true}; print_binary(sample->raw_data, nr_bytes, 8, - print_sample_bpf_output_printer, &printer_data); + sample__fprintf_bpf_output, &printer_data); if (printer_data.is_printable && printer_data.hit_nul) printf("%17s \"%s\"\n", "BPF string:", diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index afef6fe46c45..8b23982dd9f2 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1828,16 +1828,14 @@ out_dump: goto out_put; } -static void bpf_output__printer(enum binary_printer_ops op, - unsigned int val, void *extra) +static int bpf_output__printer(enum binary_printer_ops op, + unsigned int val, void *extra __maybe_unused, FILE *fp) { - FILE *output = extra; unsigned char ch = (unsigned char)val; switch (op) { case BINARY_PRINT_CHAR_DATA: - fprintf(output, "%c", isprint(ch) ? ch : '.'); - break; + return fprintf(fp, "%c", isprint(ch) ? 
ch : '.'); case BINARY_PRINT_DATA_BEGIN: case BINARY_PRINT_LINE_BEGIN: case BINARY_PRINT_ADDR: @@ -1850,13 +1848,15 @@ static void bpf_output__printer(enum binary_printer_ops op, default: break; } + + return 0; } static void bpf_output__fprintf(struct trace *trace, struct perf_sample *sample) { - print_binary(sample->raw_data, sample->raw_size, 8, - bpf_output__printer, trace->output); + binary__fprintf(sample->raw_data, sample->raw_size, 8, + bpf_output__printer, NULL, trace->output); } static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index a5b3777ffee6..cd24ebf0da2f 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -111,50 +111,53 @@ int dump_printf(const char *fmt, ...) return ret; } -static void trace_event_printer(enum binary_printer_ops op, - unsigned int val, void *extra) +static int trace_event_printer(enum binary_printer_ops op, + unsigned int val, void *extra, FILE *fp) { const char *color = PERF_COLOR_BLUE; union perf_event *event = (union perf_event *)extra; unsigned char ch = (unsigned char)val; + int printed = 0; switch (op) { case BINARY_PRINT_DATA_BEGIN: - printf("."); - color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n", - event->header.size); + printed += fprintf(fp, "."); + printed += color_fprintf(fp, color, "\n. ... raw event: size %d bytes\n", + event->header.size); break; case BINARY_PRINT_LINE_BEGIN: - printf("."); + printed += fprintf(fp, "."); break; case BINARY_PRINT_ADDR: - color_fprintf(stdout, color, " %04x: ", val); + printed += color_fprintf(fp, color, " %04x: ", val); break; case BINARY_PRINT_NUM_DATA: - color_fprintf(stdout, color, " %02x", val); + printed += color_fprintf(fp, color, " %02x", val); break; case BINARY_PRINT_NUM_PAD: - color_fprintf(stdout, color, " "); + printed += color_fprintf(fp, color, " "); break; case BINARY_PRINT_SEP: - color_fprintf(stdout, color, " "); + printed += color_fprintf(fp, color, " "); break; case BINARY_PRINT_CHAR_DATA: - color_fprintf(stdout, color, "%c", + printed += color_fprintf(fp, color, "%c", isprint(ch) ? 
ch : '.'); break; case BINARY_PRINT_CHAR_PAD: - color_fprintf(stdout, color, " "); + printed += color_fprintf(fp, color, " "); break; case BINARY_PRINT_LINE_END: - color_fprintf(stdout, color, "\n"); + printed += color_fprintf(fp, color, "\n"); break; case BINARY_PRINT_DATA_END: - printf("\n"); + printed += fprintf(fp, "\n"); break; default: break; } + + return printed; } void trace_event(union perf_event *event) diff --git a/tools/perf/util/print_binary.c b/tools/perf/util/print_binary.c index e908177b9976..df55ad3b47a0 100644 --- a/tools/perf/util/print_binary.c +++ b/tools/perf/util/print_binary.c @@ -2,40 +2,42 @@ #include #include "sane_ctype.h" -void print_binary(unsigned char *data, size_t len, - size_t bytes_per_line, print_binary_t printer, - void *extra) +int binary__fprintf(unsigned char *data, size_t len, + size_t bytes_per_line, binary__fprintf_t printer, + void *extra, FILE *fp) { size_t i, j, mask; + int printed = 0; if (!printer) - return; + return 0; bytes_per_line = roundup_pow_of_two(bytes_per_line); mask = bytes_per_line - 1; - printer(BINARY_PRINT_DATA_BEGIN, 0, extra); + printed += printer(BINARY_PRINT_DATA_BEGIN, 0, extra, fp); for (i = 0; i < len; i++) { if ((i & mask) == 0) { - printer(BINARY_PRINT_LINE_BEGIN, -1, extra); - printer(BINARY_PRINT_ADDR, i, extra); + printed += printer(BINARY_PRINT_LINE_BEGIN, -1, extra, fp); + printed += printer(BINARY_PRINT_ADDR, i, extra, fp); } - printer(BINARY_PRINT_NUM_DATA, data[i], extra); + printed += printer(BINARY_PRINT_NUM_DATA, data[i], extra, fp); if (((i & mask) == mask) || i == len - 1) { for (j = 0; j < mask-(i & mask); j++) - printer(BINARY_PRINT_NUM_PAD, -1, extra); + printed += printer(BINARY_PRINT_NUM_PAD, -1, extra, fp); - printer(BINARY_PRINT_SEP, i, extra); + printer(BINARY_PRINT_SEP, i, extra, fp); for (j = i & ~mask; j <= i; j++) - printer(BINARY_PRINT_CHAR_DATA, data[j], extra); + printed += printer(BINARY_PRINT_CHAR_DATA, data[j], extra, fp); for (j = 0; j < mask-(i & mask); j++) - printer(BINARY_PRINT_CHAR_PAD, i, extra); - printer(BINARY_PRINT_LINE_END, -1, extra); + printed += printer(BINARY_PRINT_CHAR_PAD, i, extra, fp); + printed += printer(BINARY_PRINT_LINE_END, -1, extra, fp); } } - printer(BINARY_PRINT_DATA_END, -1, extra); + printed += printer(BINARY_PRINT_DATA_END, -1, extra, fp); + return printed; } int is_printable_array(char *p, unsigned int len) diff --git a/tools/perf/util/print_binary.h b/tools/perf/util/print_binary.h index da0427263d2d..f97918a179db 100644 --- a/tools/perf/util/print_binary.h +++ b/tools/perf/util/print_binary.h @@ -2,6 +2,7 @@ #define PERF_PRINT_BINARY_H #include +#include enum binary_printer_ops { BINARY_PRINT_DATA_BEGIN, @@ -16,12 +17,19 @@ enum binary_printer_ops { BINARY_PRINT_DATA_END, }; -typedef void (*print_binary_t)(enum binary_printer_ops op, - unsigned int val, void *extra); +typedef int (*binary__fprintf_t)(enum binary_printer_ops op, + unsigned int val, void *extra, FILE *fp); -void print_binary(unsigned char *data, size_t len, - size_t bytes_per_line, print_binary_t printer, - void *extra); +int binary__fprintf(unsigned char *data, size_t len, + size_t bytes_per_line, binary__fprintf_t printer, + void *extra, FILE *fp); + +static inline void print_binary(unsigned char *data, size_t len, + size_t bytes_per_line, binary__fprintf_t printer, + void *extra) +{ + binary__fprintf(data, len, bytes_per_line, printer, extra, stdout); +} int is_printable_array(char *p, unsigned int len); From a1a587073ccdc6ffae414259f65d0a94cadd0a72 Mon Sep 17 00:00:00 2001 From: 
Arnaldo Carvalho de Melo Date: Tue, 17 Oct 2017 10:54:24 -0300 Subject: [PATCH 0540/1085] perf script: Use fprintf like printing uniformly We've been mixing print() with fprintf() style printing for a while, but now we need to use fprintf() like syntax uniformly as a preparatory patch for supporting printing to different files, one per event. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Cc: yuzhoujian Link: http://lkml.kernel.org/n/tip-kv5z3v8ptfghbarv3a9usvin@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 491 +++++++++++++++++++----------------- 1 file changed, 259 insertions(+), 232 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 5a7dfc5691a1..3e83f4735b21 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -500,69 +500,76 @@ out: return 0; } -static void print_sample_iregs(struct perf_sample *sample, - struct perf_event_attr *attr) +static int perf_sample__fprintf_iregs(struct perf_sample *sample, + struct perf_event_attr *attr, FILE *fp) { struct regs_dump *regs = &sample->intr_regs; uint64_t mask = attr->sample_regs_intr; unsigned i = 0, r; + int printed = 0; if (!regs) - return; + return 0; for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) { u64 val = regs->regs[i++]; - printf("%5s:0x%"PRIx64" ", perf_reg_name(r), val); + printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r), val); } + + return printed; } -static void print_sample_uregs(struct perf_sample *sample, - struct perf_event_attr *attr) +static int perf_sample__fprintf_uregs(struct perf_sample *sample, + struct perf_event_attr *attr, FILE *fp) { struct regs_dump *regs = &sample->user_regs; uint64_t mask = attr->sample_regs_user; unsigned i = 0, r; + int printed = 0; if (!regs || !regs->regs) - return; + return 0; - printf(" ABI:%" PRIu64 " ", regs->abi); + printed += fprintf(fp, " ABI:%" PRIu64 " ", regs->abi); for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) { u64 val = regs->regs[i++]; - printf("%5s:0x%"PRIx64" ", perf_reg_name(r), val); + printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r), val); } + + return printed; } -static void print_sample_start(struct perf_sample *sample, - struct thread *thread, - struct perf_evsel *evsel) +static int perf_sample__fprintf_start(struct perf_sample *sample, + struct thread *thread, + struct perf_evsel *evsel, FILE *fp) { struct perf_event_attr *attr = &evsel->attr; unsigned long secs; unsigned long long nsecs; + int printed = 0; if (PRINT_FIELD(COMM)) { if (latency_format) - printf("%8.8s ", thread__comm_str(thread)); + printed += fprintf(fp, "%8.8s ", thread__comm_str(thread)); else if (PRINT_FIELD(IP) && symbol_conf.use_callchain) - printf("%s ", thread__comm_str(thread)); + printed += fprintf(fp, "%s ", thread__comm_str(thread)); else - printf("%16s ", thread__comm_str(thread)); + printed += fprintf(fp, "%16s ", thread__comm_str(thread)); } if (PRINT_FIELD(PID) && PRINT_FIELD(TID)) - printf("%5d/%-5d ", sample->pid, sample->tid); + printed += fprintf(fp, "%5d/%-5d ", sample->pid, sample->tid); else if (PRINT_FIELD(PID)) - printf("%5d ", sample->pid); + printed += fprintf(fp, "%5d ", sample->pid); else if (PRINT_FIELD(TID)) - printf("%5d ", sample->tid); + printed += fprintf(fp, "%5d ", sample->tid); if (PRINT_FIELD(CPU)) { if (latency_format) - printf("%3d ", sample->cpu); + printed += fprintf(fp, "%3d ", sample->cpu); else - printf("[%03d] ", sample->cpu); + printed += fprintf(fp, "[%03d] ", 
sample->cpu); } if (PRINT_FIELD(TIME)) { @@ -571,13 +578,15 @@ static void print_sample_start(struct perf_sample *sample, nsecs -= secs * NSEC_PER_SEC; if (nanosecs) - printf("%5lu.%09llu: ", secs, nsecs); + printed += fprintf(fp, "%5lu.%09llu: ", secs, nsecs); else { char sample_time[32]; timestamp__scnprintf_usec(sample->time, sample_time, sizeof(sample_time)); - printf("%12s: ", sample_time); + printed += fprintf(fp, "%12s: ", sample_time); } } + + return printed; } static inline char @@ -589,16 +598,17 @@ mispred_str(struct branch_entry *br) return br->flags.predicted ? 'P' : 'M'; } -static void print_sample_brstack(struct perf_sample *sample, - struct thread *thread, - struct perf_event_attr *attr) +static int perf_sample__fprintf_brstack(struct perf_sample *sample, + struct thread *thread, + struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; struct addr_location alf, alt; u64 i, from, to; + int printed = 0; if (!(br && br->nr)) - return; + return 0; for (i = 0; i < br->nr; i++) { from = br->entries[i].from; @@ -611,38 +621,41 @@ static void print_sample_brstack(struct perf_sample *sample, thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt); } - printf(" 0x%"PRIx64, from); + printed += fprintf(fp, " 0x%"PRIx64, from); if (PRINT_FIELD(DSO)) { - printf("("); - map__fprintf_dsoname(alf.map, stdout); - printf(")"); + printed += fprintf(fp, "("); + printed += map__fprintf_dsoname(alf.map, fp); + printed += fprintf(fp, ")"); } - printf("/0x%"PRIx64, to); + printed += fprintf(fp, "/0x%"PRIx64, to); if (PRINT_FIELD(DSO)) { - printf("("); - map__fprintf_dsoname(alt.map, stdout); - printf(")"); + printed += fprintf(fp, "("); + printed += map__fprintf_dsoname(alt.map, fp); + printed += fprintf(fp, ")"); } - printf("/%c/%c/%c/%d ", + printed += fprintf(fp, "/%c/%c/%c/%d ", mispred_str( br->entries + i), br->entries[i].flags.in_tx? 'X' : '-', br->entries[i].flags.abort? 'A' : '-', br->entries[i].flags.cycles); } + + return printed; } -static void print_sample_brstacksym(struct perf_sample *sample, - struct thread *thread, - struct perf_event_attr *attr) +static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, + struct thread *thread, + struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; struct addr_location alf, alt; u64 i, from, to; + int printed = 0; if (!(br && br->nr)) - return; + return 0; for (i = 0; i < br->nr; i++) { @@ -659,37 +672,40 @@ static void print_sample_brstacksym(struct perf_sample *sample, if (alt.map) alt.sym = map__find_symbol(alt.map, alt.addr); - symbol__fprintf_symname_offs(alf.sym, &alf, stdout); + printed += symbol__fprintf_symname_offs(alf.sym, &alf, fp); if (PRINT_FIELD(DSO)) { - printf("("); - map__fprintf_dsoname(alf.map, stdout); - printf(")"); + printed += fprintf(fp, "("); + printed += map__fprintf_dsoname(alf.map, fp); + printed += fprintf(fp, ")"); } - putchar('/'); - symbol__fprintf_symname_offs(alt.sym, &alt, stdout); + printed += fprintf(fp, "%c", '/'); + printed += symbol__fprintf_symname_offs(alt.sym, &alt, fp); if (PRINT_FIELD(DSO)) { - printf("("); - map__fprintf_dsoname(alt.map, stdout); - printf(")"); + printed += fprintf(fp, "("); + printed += map__fprintf_dsoname(alt.map, fp); + printed += fprintf(fp, ")"); } - printf("/%c/%c/%c/%d ", + printed += fprintf(fp, "/%c/%c/%c/%d ", mispred_str( br->entries + i), br->entries[i].flags.in_tx? 'X' : '-', br->entries[i].flags.abort? 
'A' : '-', br->entries[i].flags.cycles); } + + return printed; } -static void print_sample_brstackoff(struct perf_sample *sample, - struct thread *thread, - struct perf_event_attr *attr) +static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, + struct thread *thread, + struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; struct addr_location alf, alt; u64 i, from, to; + int printed = 0; if (!(br && br->nr)) - return; + return 0; for (i = 0; i < br->nr; i++) { @@ -706,24 +722,26 @@ static void print_sample_brstackoff(struct perf_sample *sample, if (alt.map && !alt.map->dso->adjust_symbols) to = map__map_ip(alt.map, to); - printf(" 0x%"PRIx64, from); + printed += fprintf(fp, " 0x%"PRIx64, from); if (PRINT_FIELD(DSO)) { - printf("("); - map__fprintf_dsoname(alf.map, stdout); - printf(")"); + printed += fprintf(fp, "("); + printed += map__fprintf_dsoname(alf.map, fp); + printed += fprintf(fp, ")"); } - printf("/0x%"PRIx64, to); + printed += fprintf(fp, "/0x%"PRIx64, to); if (PRINT_FIELD(DSO)) { - printf("("); - map__fprintf_dsoname(alt.map, stdout); - printf(")"); + printed += fprintf(fp, "("); + printed += map__fprintf_dsoname(alt.map, fp); + printed += fprintf(fp, ")"); } - printf("/%c/%c/%c/%d ", + printed += fprintf(fp, "/%c/%c/%c/%d ", mispred_str(br->entries + i), br->entries[i].flags.in_tx ? 'X' : '-', br->entries[i].flags.abort ? 'A' : '-', br->entries[i].flags.cycles); } + + return printed; } #define MAXBB 16384UL @@ -789,31 +807,30 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, return len; } -static void print_jump(uint64_t ip, struct branch_entry *en, - struct perf_insn *x, u8 *inbuf, int len, - int insn) +static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, + struct perf_insn *x, u8 *inbuf, int len, + int insn, FILE *fp) { - printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s", - ip, - dump_insn(x, ip, inbuf, len, NULL), - en->flags.predicted ? " PRED" : "", - en->flags.mispred ? " MISPRED" : "", - en->flags.in_tx ? " INTX" : "", - en->flags.abort ? " ABORT" : ""); + int printed = fprintf(fp, "\t%016" PRIx64 "\t%-30s\t#%s%s%s%s", ip, + dump_insn(x, ip, inbuf, len, NULL), + en->flags.predicted ? " PRED" : "", + en->flags.mispred ? " MISPRED" : "", + en->flags.in_tx ? " INTX" : "", + en->flags.abort ? 
" ABORT" : ""); if (en->flags.cycles) { - printf(" %d cycles", en->flags.cycles); + printed += fprintf(fp, " %d cycles", en->flags.cycles); if (insn) - printf(" %.2f IPC", (float)insn / en->flags.cycles); + printed += fprintf(fp, " %.2f IPC", (float)insn / en->flags.cycles); } - putchar('\n'); + return printed + fprintf(fp, "\n"); } -static void print_ip_sym(struct thread *thread, u8 cpumode, int cpu, - uint64_t addr, struct symbol **lastsym, - struct perf_event_attr *attr) +static int ip__fprintf_sym(uint64_t addr, struct thread *thread, + u8 cpumode, int cpu, struct symbol **lastsym, + struct perf_event_attr *attr, FILE *fp) { struct addr_location al; - int off; + int off, printed = 0; memset(&al, 0, sizeof(al)); @@ -822,7 +839,7 @@ static void print_ip_sym(struct thread *thread, u8 cpumode, int cpu, thread__find_addr_map(thread, cpumode, MAP__VARIABLE, addr, &al); if ((*lastsym) && al.addr >= (*lastsym)->start && al.addr < (*lastsym)->end) - return; + return 0; al.cpu = cpu; al.sym = NULL; @@ -830,37 +847,39 @@ static void print_ip_sym(struct thread *thread, u8 cpumode, int cpu, al.sym = map__find_symbol(al.map, al.addr); if (!al.sym) - return; + return 0; if (al.addr < al.sym->end) off = al.addr - al.sym->start; else off = al.addr - al.map->start - al.sym->start; - printf("\t%s", al.sym->name); + printed += fprintf(fp, "\t%s", al.sym->name); if (off) - printf("%+d", off); - putchar(':'); + printed += fprintf(fp, "%+d", off); + printed += fprintf(fp, ":"); if (PRINT_FIELD(SRCLINE)) - map__fprintf_srcline(al.map, al.addr, "\t", stdout); - putchar('\n'); + printed += map__fprintf_srcline(al.map, al.addr, "\t", fp); + printed += fprintf(fp, "\n"); *lastsym = al.sym; + + return printed; } -static void print_sample_brstackinsn(struct perf_sample *sample, - struct thread *thread, - struct perf_event_attr *attr, - struct machine *machine) +static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, + struct thread *thread, + struct perf_event_attr *attr, + struct machine *machine, FILE *fp) { struct branch_stack *br = sample->branch_stack; u64 start, end; - int i, insn, len, nr, ilen; + int i, insn, len, nr, ilen, printed = 0; struct perf_insn x; u8 buffer[MAXBB]; unsigned off; struct symbol *lastsym = NULL; if (!(br && br->nr)) - return; + return 0; nr = br->nr; if (max_blocks && nr > max_blocks + 1) nr = max_blocks + 1; @@ -868,17 +887,17 @@ static void print_sample_brstackinsn(struct perf_sample *sample, x.thread = thread; x.cpu = sample->cpu; - putchar('\n'); + printed += fprintf(fp, "%c", '\n'); /* Handle first from jump, of which we don't know the entry. 
*/ len = grab_bb(buffer, br->entries[nr-1].from, br->entries[nr-1].from, machine, thread, &x.is64bit, &x.cpumode, false); if (len > 0) { - print_ip_sym(thread, x.cpumode, x.cpu, - br->entries[nr - 1].from, &lastsym, attr); - print_jump(br->entries[nr - 1].from, &br->entries[nr - 1], - &x, buffer, len, 0); + printed += ip__fprintf_sym(br->entries[nr - 1].from, thread, + x.cpumode, x.cpu, &lastsym, attr, fp); + printed += ip__fprintf_jump(br->entries[nr - 1].from, &br->entries[nr - 1], + &x, buffer, len, 0, fp); } /* Print all blocks */ @@ -904,13 +923,13 @@ static void print_sample_brstackinsn(struct perf_sample *sample, for (off = 0;; off += ilen) { uint64_t ip = start + off; - print_ip_sym(thread, x.cpumode, x.cpu, ip, &lastsym, attr); + printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (ip == end) { - print_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn); + printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn, fp); break; } else { - printf("\t%016" PRIx64 "\t%s\n", ip, - dump_insn(&x, ip, buffer + off, len - off, &ilen)); + printed += fprintf(fp, "\t%016" PRIx64 "\t%s\n", ip, + dump_insn(&x, ip, buffer + off, len - off, &ilen)); if (ilen == 0) break; insn++; @@ -923,9 +942,9 @@ static void print_sample_brstackinsn(struct perf_sample *sample, * has not been executed yet. */ if (br->entries[0].from == sample->ip) - return; + goto out; if (br->entries[0].flags.abort) - return; + goto out; /* * Print final block upto sample @@ -933,58 +952,61 @@ static void print_sample_brstackinsn(struct perf_sample *sample, start = br->entries[0].to; end = sample->ip; len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, true); - print_ip_sym(thread, x.cpumode, x.cpu, start, &lastsym, attr); + printed += ip__fprintf_sym(start, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (len <= 0) { /* Print at least last IP if basic block did not work */ len = grab_bb(buffer, sample->ip, sample->ip, machine, thread, &x.is64bit, &x.cpumode, false); if (len <= 0) - return; + goto out; - printf("\t%016" PRIx64 "\t%s\n", sample->ip, + printed += fprintf(fp, "\t%016" PRIx64 "\t%s\n", sample->ip, dump_insn(&x, sample->ip, buffer, len, NULL)); - return; + goto out; } for (off = 0; off <= end - start; off += ilen) { - printf("\t%016" PRIx64 "\t%s\n", start + off, - dump_insn(&x, start + off, buffer + off, len - off, &ilen)); + printed += fprintf(fp, "\t%016" PRIx64 "\t%s\n", start + off, + dump_insn(&x, start + off, buffer + off, len - off, &ilen)); if (ilen == 0) break; } +out: + return printed; } -static void print_sample_addr(struct perf_sample *sample, - struct thread *thread, - struct perf_event_attr *attr) +static int perf_sample__fprintf_addr(struct perf_sample *sample, + struct thread *thread, + struct perf_event_attr *attr, FILE *fp) { struct addr_location al; - - printf("%16" PRIx64, sample->addr); + int printed = fprintf(fp, "%16" PRIx64, sample->addr); if (!sample_addr_correlates_sym(attr)) - return; + goto out; thread__resolve(thread, &al, sample); if (PRINT_FIELD(SYM)) { - printf(" "); + printed += fprintf(fp, " "); if (PRINT_FIELD(SYMOFFSET)) - symbol__fprintf_symname_offs(al.sym, &al, stdout); + printed += symbol__fprintf_symname_offs(al.sym, &al, fp); else - symbol__fprintf_symname(al.sym, stdout); + printed += symbol__fprintf_symname(al.sym, fp); } if (PRINT_FIELD(DSO)) { - printf(" ("); - map__fprintf_dsoname(al.map, stdout); - printf(")"); + printed += fprintf(fp, " ("); + printed += 
map__fprintf_dsoname(al.map, fp); + printed += fprintf(fp, ")"); } +out: + return printed; } -static void print_sample_callindent(struct perf_sample *sample, - struct perf_evsel *evsel, - struct thread *thread, - struct addr_location *al) +static int perf_sample__fprintf_callindent(struct perf_sample *sample, + struct perf_evsel *evsel, + struct thread *thread, + struct addr_location *al, FILE *fp) { struct perf_event_attr *attr = &evsel->attr; size_t depth = thread_stack__depth(thread); @@ -1019,12 +1041,12 @@ static void print_sample_callindent(struct perf_sample *sample, } if (name) - len = printf("%*s%s", (int)depth * 4, "", name); + len = fprintf(fp, "%*s%s", (int)depth * 4, "", name); else if (ip) - len = printf("%*s%16" PRIx64, (int)depth * 4, "", ip); + len = fprintf(fp, "%*s%16" PRIx64, (int)depth * 4, "", ip); if (len < 0) - return; + return len; /* * Try to keep the output length from changing frequently so that the @@ -1034,39 +1056,46 @@ static void print_sample_callindent(struct perf_sample *sample, spacing = round_up(len + 4, 32); if (len < spacing) - printf("%*s", spacing - len, ""); + len += fprintf(fp, "%*s", spacing - len, ""); + + return len; } -static void print_insn(struct perf_sample *sample, - struct perf_event_attr *attr, - struct thread *thread, - struct machine *machine) +static int perf_sample__fprintf_insn(struct perf_sample *sample, + struct perf_event_attr *attr, + struct thread *thread, + struct machine *machine, FILE *fp) { + int printed = 0; + if (PRINT_FIELD(INSNLEN)) - printf(" ilen: %d", sample->insn_len); + printed += fprintf(fp, " ilen: %d", sample->insn_len); if (PRINT_FIELD(INSN)) { int i; - printf(" insn:"); + printed += fprintf(fp, " insn:"); for (i = 0; i < sample->insn_len; i++) - printf(" %02x", (unsigned char)sample->insn[i]); + printed += fprintf(fp, " %02x", (unsigned char)sample->insn[i]); } if (PRINT_FIELD(BRSTACKINSN)) - print_sample_brstackinsn(sample, thread, attr, machine); + printed += perf_sample__fprintf_brstackinsn(sample, thread, attr, machine, fp); + + return printed; } -static void print_sample_bts(struct perf_sample *sample, - struct perf_evsel *evsel, - struct thread *thread, - struct addr_location *al, - struct machine *machine) +static int perf_sample__fprintf_bts(struct perf_sample *sample, + struct perf_evsel *evsel, + struct thread *thread, + struct addr_location *al, + struct machine *machine, FILE *fp) { struct perf_event_attr *attr = &evsel->attr; unsigned int type = output_type(attr->type); bool print_srcline_last = false; + int printed = 0; if (PRINT_FIELD(CALLINDENT)) - print_sample_callindent(sample, evsel, thread, al); + printed += perf_sample__fprintf_callindent(sample, evsel, thread, al, fp); /* print branch_from information */ if (PRINT_FIELD(IP)) { @@ -1079,31 +1108,30 @@ static void print_sample_bts(struct perf_sample *sample, cursor = &callchain_cursor; if (cursor == NULL) { - putchar(' '); + printed += fprintf(fp, " "); if (print_opts & EVSEL__PRINT_SRCLINE) { print_srcline_last = true; print_opts &= ~EVSEL__PRINT_SRCLINE; } } else - putchar('\n'); + printed += fprintf(fp, "\n"); - sample__fprintf_sym(sample, al, 0, print_opts, cursor, stdout); + printed += sample__fprintf_sym(sample, al, 0, print_opts, cursor, fp); } /* print branch_to information */ if (PRINT_FIELD(ADDR) || ((evsel->attr.sample_type & PERF_SAMPLE_ADDR) && !output[type].user_set)) { - printf(" => "); - print_sample_addr(sample, thread, attr); + printed += fprintf(fp, " => "); + printed += perf_sample__fprintf_addr(sample, thread, attr, 
fp); } if (print_srcline_last) - map__fprintf_srcline(al->map, al->addr, "\n ", stdout); + printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); - print_insn(sample, attr, thread, machine); - - printf("\n"); + printed += perf_sample__fprintf_insn(sample, attr, thread, machine, fp); + return printed + fprintf(fp, "\n"); } static struct { @@ -1126,7 +1154,7 @@ static struct { {0, NULL} }; -static void print_sample_flags(u32 flags) +static int perf_sample__fprintf_flags(u32 flags, FILE *fp) { const char *chars = PERF_IP_FLAG_CHARS; const int n = strlen(PERF_IP_FLAG_CHARS); @@ -1153,9 +1181,9 @@ static void print_sample_flags(u32 flags) str[pos] = 0; if (name) - printf(" %-7s%4s ", name, in_tx ? "(x)" : ""); - else - printf(" %-11s ", str); + return fprintf(fp, " %-7s%4s ", name, in_tx ? "(x)" : ""); + + return fprintf(fp, " %-11s ", str); } struct printer_data { @@ -1225,138 +1253,136 @@ static int sample__fprintf_bpf_output(enum binary_printer_ops op, return printed; } -static void print_sample_bpf_output(struct perf_sample *sample) +static int perf_sample__fprintf_bpf_output(struct perf_sample *sample, FILE *fp) { unsigned int nr_bytes = sample->raw_size; struct printer_data printer_data = {0, false, true}; - - print_binary(sample->raw_data, nr_bytes, 8, - sample__fprintf_bpf_output, &printer_data); + int printed = binary__fprintf(sample->raw_data, nr_bytes, 8, + sample__fprintf_bpf_output, &printer_data, fp); if (printer_data.is_printable && printer_data.hit_nul) - printf("%17s \"%s\"\n", "BPF string:", - (char *)(sample->raw_data)); + printed += fprintf(fp, "%17s \"%s\"\n", "BPF string:", (char *)(sample->raw_data)); + + return printed; } -static void print_sample_spacing(int len, int spacing) +static int perf_sample__fprintf_spacing(int len, int spacing, FILE *fp) { if (len > 0 && len < spacing) - printf("%*s", spacing - len, ""); + return fprintf(fp, "%*s", spacing - len, ""); + + return 0; } -static void print_sample_pt_spacing(int len) +static int perf_sample__fprintf_pt_spacing(int len, FILE *fp) { - print_sample_spacing(len, 34); + return perf_sample__fprintf_spacing(len, 34, fp); } -static void print_sample_synth_ptwrite(struct perf_sample *sample) +static int perf_sample__fprintf_synth_ptwrite(struct perf_sample *sample, FILE *fp) { struct perf_synth_intel_ptwrite *data = perf_sample__synth_ptr(sample); int len; if (perf_sample__bad_synth_size(sample, *data)) - return; + return 0; - len = printf(" IP: %u payload: %#" PRIx64 " ", + len = fprintf(fp, " IP: %u payload: %#" PRIx64 " ", data->ip, le64_to_cpu(data->payload)); - print_sample_pt_spacing(len); + return len + perf_sample__fprintf_pt_spacing(len, fp); } -static void print_sample_synth_mwait(struct perf_sample *sample) +static int perf_sample__fprintf_synth_mwait(struct perf_sample *sample, FILE *fp) { struct perf_synth_intel_mwait *data = perf_sample__synth_ptr(sample); int len; if (perf_sample__bad_synth_size(sample, *data)) - return; + return 0; - len = printf(" hints: %#x extensions: %#x ", - data->hints, data->extensions); - print_sample_pt_spacing(len); + len = fprintf(fp, " hints: %#x extensions: %#x ", + data->hints, data->extensions); + return len + perf_sample__fprintf_pt_spacing(len, fp); } -static void print_sample_synth_pwre(struct perf_sample *sample) +static int perf_sample__fprintf_synth_pwre(struct perf_sample *sample, FILE *fp) { struct perf_synth_intel_pwre *data = perf_sample__synth_ptr(sample); int len; if (perf_sample__bad_synth_size(sample, *data)) - return; + return 0; - len = printf(" hw: %u 
cstate: %u sub-cstate: %u ", - data->hw, data->cstate, data->subcstate); - print_sample_pt_spacing(len); + len = fprintf(fp, " hw: %u cstate: %u sub-cstate: %u ", + data->hw, data->cstate, data->subcstate); + return len + perf_sample__fprintf_pt_spacing(len, fp); } -static void print_sample_synth_exstop(struct perf_sample *sample) +static int perf_sample__fprintf_synth_exstop(struct perf_sample *sample, FILE *fp) { struct perf_synth_intel_exstop *data = perf_sample__synth_ptr(sample); int len; if (perf_sample__bad_synth_size(sample, *data)) - return; + return 0; - len = printf(" IP: %u ", data->ip); - print_sample_pt_spacing(len); + len = fprintf(fp, " IP: %u ", data->ip); + return len + perf_sample__fprintf_pt_spacing(len, fp); } -static void print_sample_synth_pwrx(struct perf_sample *sample) +static int perf_sample__fprintf_synth_pwrx(struct perf_sample *sample, FILE *fp) { struct perf_synth_intel_pwrx *data = perf_sample__synth_ptr(sample); int len; if (perf_sample__bad_synth_size(sample, *data)) - return; + return 0; - len = printf(" deepest cstate: %u last cstate: %u wake reason: %#x ", + len = fprintf(fp, " deepest cstate: %u last cstate: %u wake reason: %#x ", data->deepest_cstate, data->last_cstate, data->wake_reason); - print_sample_pt_spacing(len); + return len + perf_sample__fprintf_pt_spacing(len, fp); } -static void print_sample_synth_cbr(struct perf_sample *sample) +static int perf_sample__fprintf_synth_cbr(struct perf_sample *sample, FILE *fp) { struct perf_synth_intel_cbr *data = perf_sample__synth_ptr(sample); unsigned int percent, freq; int len; if (perf_sample__bad_synth_size(sample, *data)) - return; + return 0; freq = (le32_to_cpu(data->freq) + 500) / 1000; - len = printf(" cbr: %2u freq: %4u MHz ", data->cbr, freq); + len = fprintf(fp, " cbr: %2u freq: %4u MHz ", data->cbr, freq); if (data->max_nonturbo) { percent = (5 + (1000 * data->cbr) / data->max_nonturbo) / 10; - len += printf("(%3u%%) ", percent); + len += fprintf(fp, "(%3u%%) ", percent); } - print_sample_pt_spacing(len); + return len + perf_sample__fprintf_pt_spacing(len, fp); } -static void print_sample_synth(struct perf_sample *sample, - struct perf_evsel *evsel) +static int perf_sample__fprintf_synth(struct perf_sample *sample, + struct perf_evsel *evsel, FILE *fp) { switch (evsel->attr.config) { case PERF_SYNTH_INTEL_PTWRITE: - print_sample_synth_ptwrite(sample); - break; + return perf_sample__fprintf_synth_ptwrite(sample, fp); case PERF_SYNTH_INTEL_MWAIT: - print_sample_synth_mwait(sample); - break; + return perf_sample__fprintf_synth_mwait(sample, fp); case PERF_SYNTH_INTEL_PWRE: - print_sample_synth_pwre(sample); - break; + return perf_sample__fprintf_synth_pwre(sample, fp); case PERF_SYNTH_INTEL_EXSTOP: - print_sample_synth_exstop(sample); - break; + return perf_sample__fprintf_synth_exstop(sample, fp); case PERF_SYNTH_INTEL_PWRX: - print_sample_synth_pwrx(sample); - break; + return perf_sample__fprintf_synth_pwrx(sample, fp); case PERF_SYNTH_INTEL_CBR: - print_sample_synth_cbr(sample); - break; + return perf_sample__fprintf_synth_cbr(sample, fp); default: break; } + + return 0; } struct perf_script { @@ -1388,7 +1414,7 @@ static int perf_evlist__max_name_len(struct perf_evlist *evlist) return max; } -static size_t data_src__printf(u64 data_src) +static int data_src__fprintf(u64 data_src, FILE *fp) { struct mem_info mi = { .data_src.val = data_src }; char decode[100]; @@ -1402,7 +1428,7 @@ static size_t data_src__printf(u64 data_src) if (maxlen < len) maxlen = len; - return printf("%-*s", maxlen, 
out); + return fprintf(fp, "%-*s", maxlen, out); } static void process_event(struct perf_script *script, @@ -1413,11 +1439,12 @@ static void process_event(struct perf_script *script, struct thread *thread = al->thread; struct perf_event_attr *attr = &evsel->attr; unsigned int type = output_type(attr->type); + FILE *fp = stdout; if (output[type].fields == 0) return; - print_sample_start(sample, thread, evsel); + perf_sample__fprintf_start(sample, thread, evsel, fp); if (PRINT_FIELD(PERIOD)) printf("%10" PRIu64 " ", sample->period); @@ -1433,10 +1460,10 @@ static void process_event(struct perf_script *script, } if (print_flags) - print_sample_flags(sample->flags); + perf_sample__fprintf_flags(sample->flags, fp); if (is_bts_event(attr)) { - print_sample_bts(sample, evsel, thread, al, machine); + perf_sample__fprintf_bts(sample, evsel, thread, al, machine, fp); return; } @@ -1445,16 +1472,16 @@ static void process_event(struct perf_script *script, sample->raw_data, sample->raw_size); if (attr->type == PERF_TYPE_SYNTH && PRINT_FIELD(SYNTH)) - print_sample_synth(sample, evsel); + perf_sample__fprintf_synth(sample, evsel, fp); if (PRINT_FIELD(ADDR)) - print_sample_addr(sample, thread, attr); + perf_sample__fprintf_addr(sample, thread, attr, fp); if (PRINT_FIELD(DATA_SRC)) - data_src__printf(sample->data_src); + data_src__fprintf(sample->data_src, fp); if (PRINT_FIELD(WEIGHT)) - printf("%16" PRIu64, sample->weight); + fprintf(fp, "%16" PRIu64, sample->weight); if (PRINT_FIELD(IP)) { struct callchain_cursor *cursor = NULL; @@ -1464,26 +1491,26 @@ static void process_event(struct perf_script *script, sample, NULL, NULL, scripting_max_stack) == 0) cursor = &callchain_cursor; - putchar(cursor ? '\n' : ' '); - sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor, stdout); + fputc(cursor ? 
'\n' : ' ', fp); + sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor, fp); } if (PRINT_FIELD(IREGS)) - print_sample_iregs(sample, attr); + perf_sample__fprintf_iregs(sample, attr, fp); if (PRINT_FIELD(UREGS)) - print_sample_uregs(sample, attr); + perf_sample__fprintf_uregs(sample, attr, fp); if (PRINT_FIELD(BRSTACK)) - print_sample_brstack(sample, thread, attr); + perf_sample__fprintf_brstack(sample, thread, attr, fp); else if (PRINT_FIELD(BRSTACKSYM)) - print_sample_brstacksym(sample, thread, attr); + perf_sample__fprintf_brstacksym(sample, thread, attr, fp); else if (PRINT_FIELD(BRSTACKOFF)) - print_sample_brstackoff(sample, thread, attr); + perf_sample__fprintf_brstackoff(sample, thread, attr, fp); if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) - print_sample_bpf_output(sample); - print_insn(sample, attr, thread, machine); + perf_sample__fprintf_bpf_output(sample, fp); + perf_sample__fprintf_insn(sample, attr, thread, machine, fp); if (PRINT_FIELD(PHYS_ADDR)) printf("%16" PRIx64, sample->phys_addr); @@ -1661,7 +1688,7 @@ static int process_comm_event(struct perf_tool *tool, sample->tid = event->comm.tid; sample->pid = event->comm.pid; } - print_sample_start(sample, thread, evsel); + perf_sample__fprintf_start(sample, thread, evsel, stdout); perf_event__fprintf(event, stdout); ret = 0; out: @@ -1696,7 +1723,7 @@ static int process_namespaces_event(struct perf_tool *tool, sample->tid = event->namespaces.tid; sample->pid = event->namespaces.pid; } - print_sample_start(sample, thread, evsel); + perf_sample__fprintf_start(sample, thread, evsel, stdout); perf_event__fprintf(event, stdout); ret = 0; out: @@ -1729,7 +1756,7 @@ static int process_fork_event(struct perf_tool *tool, sample->tid = event->fork.tid; sample->pid = event->fork.pid; } - print_sample_start(sample, thread, evsel); + perf_sample__fprintf_start(sample, thread, evsel, stdout); perf_event__fprintf(event, stdout); thread__put(thread); @@ -1758,7 +1785,7 @@ static int process_exit_event(struct perf_tool *tool, sample->tid = event->fork.tid; sample->pid = event->fork.pid; } - print_sample_start(sample, thread, evsel); + perf_sample__fprintf_start(sample, thread, evsel, stdout); perf_event__fprintf(event, stdout); if (perf_event__process_exit(tool, event, sample, machine) < 0) @@ -1793,7 +1820,7 @@ static int process_mmap_event(struct perf_tool *tool, sample->tid = event->mmap.tid; sample->pid = event->mmap.pid; } - print_sample_start(sample, thread, evsel); + perf_sample__fprintf_start(sample, thread, evsel, stdout); perf_event__fprintf(event, stdout); thread__put(thread); return 0; @@ -1824,7 +1851,7 @@ static int process_mmap2_event(struct perf_tool *tool, sample->tid = event->mmap2.tid; sample->pid = event->mmap2.pid; } - print_sample_start(sample, thread, evsel); + perf_sample__fprintf_start(sample, thread, evsel, stdout); perf_event__fprintf(event, stdout); thread__put(thread); return 0; @@ -1850,7 +1877,7 @@ static int process_switch_event(struct perf_tool *tool, return -1; } - print_sample_start(sample, thread, evsel); + perf_sample__fprintf_start(sample, thread, evsel, stdout); perf_event__fprintf(event, stdout); thread__put(thread); return 0; From db49bc155ad9f04ea3c4e1c9ae87850610feb1ce Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 16 Sep 2017 08:25:37 +0200 Subject: [PATCH 0541/1085] perf script: Fix error handling path If the string passed in '--time' is invalid, or if failed to set libtraceevent function resolver, we must do some cleanup before leaving. 
This matches the other error handling paths of this function. Signed-off-by: Christophe JAILLET Cc: Alexander Shishkin Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20170916062537.28921-1-christophe.jaillet@wanadoo.fr Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 3e83f4735b21..a3add2cd7856 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3074,7 +3074,8 @@ int cmd_script(int argc, const char **argv) machine__resolve_kernel_addr, &session->machines.host) < 0) { pr_err("%s: failed to set libtraceevent function resolver\n", __func__); - return -1; + err = -1; + goto out_delete; } if (generate_script_lang) { @@ -3134,7 +3135,8 @@ int cmd_script(int argc, const char **argv) /* needs to be parsed after looking up reference time */ if (perf_time__parse_str(&script.ptime, script.time_str) != 0) { pr_err("Invalid time string\n"); - err = -EINVAL; + err = -EINVAL; + goto out_delete; } err = __cmd_script(&script); From 79f56ebe2ae33aa54c69bbc9854a9a31f622913e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 16 Sep 2017 08:09:36 +0200 Subject: [PATCH 0542/1085] perf kmem: Perform some cleanup if '--time' is given an invalid value If the string passed in '--time' is invalid, we must do some cleanup before leaving, as is done in the other error handling paths of this function. Signed-off-by: Christophe JAILLET Cc: Alexander Shishkin Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Fixes: 2a865bd8dddd ("perf kmem: Add option to specify time window of interest") Link: http://lkml.kernel.org/r/20170916060936.28199-1-christophe.jaillet@wanadoo.fr Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 24ee68ecdd42..d8f25ef8157b 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -1983,7 +1983,8 @@ int cmd_kmem(int argc, const char **argv) if (perf_time__parse_str(&ptime, time_str) != 0) { pr_err("Invalid time string\n"); - return -EINVAL; + ret = -EINVAL; + goto out_delete; } if (!strcmp(argv[0], "stat")) { From b1f03ca4ee784390457dfb2009601cd7dcac025e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 19 Oct 2017 15:11:17 -0300 Subject: [PATCH 0543/1085] perf namespaces: Add more appropriate set of headers We don't need perf.h, which is a kitchen sink; all we need is perf_event.h for perf_ns_link_info, sys/types.h for pid_t, and linux/types.h for u64 and list_head. 
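A minimal sketch of the include block this describes, assuming the three headers named above; the exact <...> names did not survive in the hunk below, so this is an inferred reconstruction, not the literal patch:

  /* Sketch only: header names assumed from the commit message above,
   * not quoted from the patch itself. */
  #ifndef __PERF_NAMESPACES_H
  #define __PERF_NAMESPACES_H

  #include <linux/perf_event.h>	/* assumed: struct perf_ns_link_info */
  #include <linux/types.h>	/* assumed: u64, struct list_head */
  #include <sys/types.h>	/* assumed: pid_t */

  #endif /* __PERF_NAMESPACES_H */

Depending only on these three relatively stable headers, rather than the kitchen-sink perf.h, means users of namespaces.h get rebuilt far less often.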
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Li Zhijian Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-f2uxyaj4s2hmntkrezpa6dqz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/namespaces.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h index 05d82601c9a6..760558dcfd18 100644 --- a/tools/perf/util/namespaces.h +++ b/tools/perf/util/namespaces.h @@ -9,9 +9,10 @@ #ifndef __PERF_NAMESPACES_H #define __PERF_NAMESPACES_H -#include "../perf.h" -#include +#include +#include #include +#include struct namespaces_event; From 65db92e0965ab56e8031d5c804f26d5be0e47047 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 18 Oct 2017 06:05:07 -0700 Subject: [PATCH 0544/1085] perf vendor events: Add Goldmont Plus V1 event file Add an Intel event file for perf. Signed-off-by: Kan Liang Acked-by: Andi Kleen Cc: Jiri Olsa Cc: Kan Liang Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/1508331907-395162-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/goldmontplus/cache.json | 1453 +++++++++++++++++ .../arch/x86/goldmontplus/frontend.json | 62 + .../arch/x86/goldmontplus/memory.json | 38 + .../arch/x86/goldmontplus/other.json | 98 ++ .../arch/x86/goldmontplus/pipeline.json | 544 ++++++ .../arch/x86/goldmontplus/virtual-memory.json | 218 +++ tools/perf/pmu-events/arch/x86/mapfile.csv | 1 + 7 files changed, 2414 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/goldmontplus/cache.json create mode 100644 tools/perf/pmu-events/arch/x86/goldmontplus/frontend.json create mode 100644 tools/perf/pmu-events/arch/x86/goldmontplus/memory.json create mode 100644 tools/perf/pmu-events/arch/x86/goldmontplus/other.json create mode 100644 tools/perf/pmu-events/arch/x86/goldmontplus/pipeline.json create mode 100644 tools/perf/pmu-events/arch/x86/goldmontplus/virtual-memory.json diff --git a/tools/perf/pmu-events/arch/x86/goldmontplus/cache.json b/tools/perf/pmu-events/arch/x86/goldmontplus/cache.json new file mode 100644 index 000000000000..b4791b443a66 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/goldmontplus/cache.json @@ -0,0 +1,1453 @@ +[ + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts memory requests originating from the core that miss in the L2 cache.", + "EventCode": "0x2E", + "Counter": "0,1,2,3", + "UMask": "0x41", + "PEBScounters": "0,1,2,3", + "EventName": "LONGEST_LAT_CACHE.MISS", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "L2 cache request misses" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts memory requests originating from the core that reference a cache line in the L2 cache.", + "EventCode": "0x2E", + "Counter": "0,1,2,3", + "UMask": "0x4f", + "PEBScounters": "0,1,2,3", + "EventName": "LONGEST_LAT_CACHE.REFERENCE", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "L2 cache requests" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of demand and prefetch transactions that the L2 XQ rejects due to a full or near full condition which likely indicates back pressure from the intra-die interconnect (IDI) fabric. 
The XQ may reject transactions from the L2Q (non-cacheable requests), L2 misses and L2 write-back victims.", + "EventCode": "0x30", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "L2_REJECT_XQ.ALL", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Requests rejected by the XQ" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of demand and L1 prefetcher requests rejected by the L2Q due to a full or nearly full condition which likely indicates back pressure from L2Q. It also counts requests that would have gone directly to the XQ, but are rejected due to a full or nearly full condition, indicating back pressure from the IDI link. The L2Q may also reject transactions from a core to insure fairness between cores, or to delay a core's dirty eviction when the address conflicts with incoming external snoops.", + "EventCode": "0x31", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "CORE_REJECT_L2Q.ALL", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Requests rejected by the L2Q" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts when a modified (dirty) cache line is evicted from the data L1 cache and needs to be written back to memory. No count will occur if the evicted line is clean, and hence does not require a writeback.", + "EventCode": "0x51", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "DL1.REPLACEMENT", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "L1 Cache evictions for dirty data" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts cycles that fetch is stalled due to an outstanding ICache miss. That is, the decoder queue is able to accept bytes, but the fetch unit is unable to provide bytes due to an ICache miss. Note: this event is not the same as the total number of cycles spent retrieving instruction cache lines from the memory hierarchy.", + "EventCode": "0x86", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "FETCH_STALL.ICACHE_FILL_PENDING_CYCLES", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Cycles code-fetch stalled due to an outstanding ICache miss." + }, + { + "CollectPEBSRecord": "1", + "EventCode": "0xB7", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE", + "PDIR_COUNTER": "na", + "SampleAfterValue": "100007", + "BriefDescription": "Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts locked memory uops retired. This includes regular locks and bus locks. (To specifically count bus locks only, see the Offcore response event.) A locked access is one with a lock prefix, or an exchange to memory. 
See the SDM for a complete description of which memory load accesses are locks.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x21", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS", + "SampleAfterValue": "200003", + "BriefDescription": "Locked load uops retired (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts load uops retired where the data requested spans a 64 byte cache line boundary.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x41", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS", + "SampleAfterValue": "200003", + "BriefDescription": "Load uops retired that split a cache-line (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts store uops retired where the data requested spans a 64 byte cache line boundary.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x42", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES", + "SampleAfterValue": "200003", + "BriefDescription": "Stores uops retired that split a cache-line (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts memory uops retired where the data requested spans a 64 byte cache line boundary.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x43", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.SPLIT", + "SampleAfterValue": "200003", + "BriefDescription": "Memory uops retired that split a cache-line (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts the number of load uops retired.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x81", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.ALL_LOADS", + "SampleAfterValue": "200003", + "BriefDescription": "Load uops retired (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts the number of store uops retired.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x82", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.ALL_STORES", + "SampleAfterValue": "200003", + "BriefDescription": "Store uops retired (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts the number of memory uops retired that is either a loads or a store or both.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x83", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.ALL", + "SampleAfterValue": "200003", + "BriefDescription": "Memory uops retired (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts load uops retired that hit the L1 data cache.", + "EventCode": "0xD1", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT", + "SampleAfterValue": "200003", + "BriefDescription": "Load uops retired that hit L1 data cache (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts load uops retired that hit in the L2 cache.", + "EventCode": "0xD1", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT", + "SampleAfterValue": "200003", + "BriefDescription": "Load uops retired that hit L2 (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + 
"PublicDescription": "Counts load uops retired that miss the L1 data cache.", + "EventCode": "0xD1", + "Counter": "0,1,2,3", + "UMask": "0x8", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS", + "SampleAfterValue": "200003", + "BriefDescription": "Load uops retired that missed L1 data cache (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts load uops retired that miss in the L2 cache.", + "EventCode": "0xD1", + "Counter": "0,1,2,3", + "UMask": "0x10", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS", + "SampleAfterValue": "200003", + "BriefDescription": "Load uops retired that missed L2 (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts load uops retired where the cache line containing the data was in the modified state of another core or modules cache (HITM). More specifically, this means that when the load address was checked by other caching agents (typically another processor) in the system, one of those caching agents indicated that they had a dirty copy of the data. Loads that obtain a HITM response incur greater latency than most is typical for a load. In addition, since HITM indicates that some other processor had this data in its cache, it implies that the data was shared between processors, or potentially was a lock or semaphore value. This event is useful for locating sharing, false sharing, and contended locks.", + "EventCode": "0xD1", + "Counter": "0,1,2,3", + "UMask": "0x20", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_LOAD_UOPS_RETIRED.HITM", + "SampleAfterValue": "200003", + "BriefDescription": "Memory uop retired where cross core or cross module HITM occurred (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts memory load uops retired where the data is retrieved from the WCB (or fill buffer), indicating that the load found its data while that data was in the process of being brought into the L1 cache. Typically a load will receive this indication when some other load or prefetch missed the L1 cache and was in the process of retrieving the cache line containing the data, but that process had not yet finished (and written the data back to the cache). For example, consider load X and Y, both referencing the same cache line that is not in the L1 cache. If load X misses cache first, it obtains and WCB (or fill buffer) and begins the process of requesting the data. When load Y requests the data, it will either hit the WCB, or the L1 cache, depending on exactly what time the request to Y occurs.", + "EventCode": "0xD1", + "Counter": "0,1,2,3", + "UMask": "0x40", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_LOAD_UOPS_RETIRED.WCB_HIT", + "SampleAfterValue": "200003", + "BriefDescription": "Loads retired that hit WCB (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts memory load uops retired where the data is retrieved from DRAM. Event is counted at retirement, so the speculative loads are ignored. 
A memory load can hit (or miss) the L1 cache, hit (or miss) the L2 cache, hit DRAM, hit in the WCB or receive a HITM response.", + "EventCode": "0xD1", + "Counter": "0,1,2,3", + "UMask": "0x80", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_LOAD_UOPS_RETIRED.DRAM_HIT", + "SampleAfterValue": "200003", + "BriefDescription": "Loads retired that came from DRAM (Precise event capable)" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand cacheable data reads of full cache lines have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010001", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand cacheable data reads of full cache lines have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand cacheable data reads of full cache lines hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040001", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand cacheable data reads of full cache lines hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand cacheable data reads of full cache lines true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000001", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand cacheable data reads of full cache lines true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand cacheable data reads of full cache lines miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000001", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand cacheable data reads of full cache lines miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand cacheable data reads of full cache lines outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000001", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand cacheable data reads of full cache lines outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010002", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040002", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000002", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000002", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000002", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010004", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040004", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000004", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000004", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000004", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010008", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.COREWB.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040008", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.COREWB.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000008", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.COREWB.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000008", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.COREWB.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000008", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.COREWB.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010020", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040020", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000020", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000020", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000020", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts bus lock and split lock requests have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010400", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts bus lock and split lock requests have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts bus lock and split lock requests hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040400", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts bus lock and split lock requests hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts bus lock and split lock requests true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000400", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts bus lock and split lock requests true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts bus lock and split lock requests miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000400", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts bus lock and split lock requests miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts bus lock and split lock requests outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000400", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts bus lock and split lock requests outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.FULL_STREAMING_STORES.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.FULL_STREAMING_STORES.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.FULL_STREAMING_STORES.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.FULL_STREAMING_STORES.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.FULL_STREAMING_STORES.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache lines requests by software prefetch instructions have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000011000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.SW_PREFETCH.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache lines requests by software prefetch instructions have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache lines requests by software prefetch instructions hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000041000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.SW_PREFETCH.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache lines requests by software prefetch instructions hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache lines requests by software prefetch instructions true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200001000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.SW_PREFETCH.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache lines requests by software prefetch instructions true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache lines requests by software prefetch instructions miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000001000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.SW_PREFETCH.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache lines requests by software prefetch instructions miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache lines requests by software prefetch instructions outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000001000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.SW_PREFETCH.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache lines requests by software prefetch instructions outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000012000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000042000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200002000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000002000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000002000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts any data writes to uncacheable write combining (USWC) memory region have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000014800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.STREAMING_STORES.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts any data writes to uncacheable write combining (USWC) memory region have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts any data writes to uncacheable write combining (USWC) memory region hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000044800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.STREAMING_STORES.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts any data writes to uncacheable write combining (USWC) memory region hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts any data writes to uncacheable write combining (USWC) memory region true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200004800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.STREAMING_STORES.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts any data writes to uncacheable write combining (USWC) memory region true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts any data writes to uncacheable write combining (USWC) memory region miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000004800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.STREAMING_STORES.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts any data writes to uncacheable write combining (USWC) memory region miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts any data writes to uncacheable write combining (USWC) memory region outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000004800", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.STREAMING_STORES.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts any data writes to uncacheable write combining (USWC) memory region outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts requests to the uncore subsystem have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000018000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts requests to the uncore subsystem have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts requests to the uncore subsystem hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000048000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts requests to the uncore subsystem hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts requests to the uncore subsystem true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200008000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts requests to the uncore subsystem true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts requests to the uncore subsystem miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. 
Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000008000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts requests to the uncore subsystem miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts requests to the uncore subsystem outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000008000", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts requests to the uncore subsystem outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads generated by L1 or L2 prefetchers have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000013010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_PF_DATA_RD.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads generated by L1 or L2 prefetchers have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads generated by L1 or L2 prefetchers hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000043010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_PF_DATA_RD.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads generated by L1 or L2 prefetchers hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads generated by L1 or L2 prefetchers true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200003010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_PF_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads generated by L1 or L2 prefetchers true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads generated by L1 or L2 prefetchers miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. 
Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000003010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_PF_DATA_RD.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads generated by L1 or L2 prefetchers miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads generated by L1 or L2 prefetchers outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000003010", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_PF_DATA_RD.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads generated by L1 or L2 prefetchers outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads (demand & prefetch) have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000013091", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads (demand & prefetch) have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads (demand & prefetch) hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000043091", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads (demand & prefetch) hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200003091", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000003091", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data reads (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000003091", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data reads (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000010022", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_RFO.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0000040022", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_RFO.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x0200000022", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_RFO.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. 
Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x1000000022", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_RFO.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x4000000022", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_RFO.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) have any transaction responses from the uncore subsystem. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x00000132b7", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_READ.ANY_RESPONSE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) have any transaction responses from the uncore subsystem.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) hit the L2 cache. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x00000432b7", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_READ.L2_HIT", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) hit the L2 cache.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. 
(duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x02000032b7", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_READ.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. ", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x10000032b7", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_READ.L2_MISS.HITM_OTHER_CORE", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6, 0x1a7", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.", + "Offcore": "1" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)", + "EventCode": "0xB7", + "MSRValue": "0x40000032b7", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "OFFCORE_RESPONSE.ANY_READ.OUTSTANDING", + "PDIR_COUNTER": "na", + "MSRIndex": "0x1a6", + "SampleAfterValue": "100007", + "BriefDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received.", + "Offcore": "1" + } +] \ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/goldmontplus/frontend.json b/tools/perf/pmu-events/arch/x86/goldmontplus/frontend.json new file mode 100644 index 000000000000..a7878965ceab --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/goldmontplus/frontend.json @@ -0,0 +1,62 @@ +[ + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is in the ICache (hit). The event strives to count on a cache line basis, so that multiple accesses which hit in a single cache line count as one ICACHE.HIT. Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture.", + "EventCode": "0x80", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "ICACHE.HIT", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "References per ICache line that are available in the ICache (hit). 
This event counts differently than Intel processors based on Silvermont microarchitecture" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is not in the ICache (miss). The event strives to count on a cache line basis, so that multiple accesses which miss in a single cache line count as one ICACHE.MISS. Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is not in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture.", + "EventCode": "0x80", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "ICACHE.MISSES", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "References per ICache line that are not available in the ICache (miss). This event counts differently than Intel processors based on Silvermont microarchitecture" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line. The event strives to count on a cache line basis, so that multiple fetches to a single cache line count as one ICACHE.ACCESS. Specifically, the event counts when accesses from straight line code cross the cache line boundary, or when a branch target is to a new line. This event counts differently than Intel processors based on Silvermont microarchitecture.", + "EventCode": "0x80", + "Counter": "0,1,2,3", + "UMask": "0x3", + "PEBScounters": "0,1,2,3", + "EventName": "ICACHE.ACCESSES", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "References per ICache line. This event counts differently than Intel processors based on Silvermont microarchitecture" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of times the Microcode Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM. The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine. Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops.
The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear.", + "EventCode": "0xE7", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "MS_DECODED.MS_ENTRY", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "MS decode starts" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of times the prediction (from the predecode cache) for instruction length is incorrect.", + "EventCode": "0xE9", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "DECODE_RESTRICTION.PREDECODE_WRONG", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Decode restrictions due to predicting wrong instruction length" + } +] \ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/goldmontplus/memory.json b/tools/perf/pmu-events/arch/x86/goldmontplus/memory.json new file mode 100644 index 000000000000..91e0815f3ffb --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/goldmontplus/memory.json @@ -0,0 +1,38 @@ +[ + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts when a memory load uop that spans a page boundary (a split) is retired.", + "EventCode": "0x13", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "MISALIGN_MEM_REF.LOAD_PAGE_SPLIT", + "SampleAfterValue": "200003", + "BriefDescription": "Load uops that split a page (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts when a memory store uop that spans a page boundary (a split) is retired.", + "EventCode": "0x13", + "Counter": "0,1,2,3", + "UMask": "0x4", + "PEBScounters": "0,1,2,3", + "EventName": "MISALIGN_MEM_REF.STORE_PAGE_SPLIT", + "SampleAfterValue": "200003", + "BriefDescription": "Store uops that split a page (Precise event capable)" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts machine clears due to memory ordering issues. This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved, as another core is in the process of modifying the data.", + "EventCode": "0xC3", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", + "PDIR_COUNTER": "na", + "SampleAfterValue": "20003", + "BriefDescription": "Machine clears due to memory ordering issue" + } +] \ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/goldmontplus/other.json b/tools/perf/pmu-events/arch/x86/goldmontplus/other.json new file mode 100644 index 000000000000..b860374418ab --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/goldmontplus/other.json @@ -0,0 +1,98 @@ +[ + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts cycles that fetch is stalled due to any reason. That is, the decoder queue is able to accept bytes, but the fetch unit is unable to provide bytes. This will include cycles due to an ITLB miss, ICache miss and other events.", + "EventCode": "0x86", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "FETCH_STALL.ALL", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Cycles code-fetch stalled due to any reason." + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts cycles that fetch is stalled due to an outstanding ITLB miss.
That is, the decoder queue is able to accept bytes, but the fetch unit is unable to provide bytes due to an ITLB miss. Note: this event is not the same as page walk cycles to retrieve an instruction translation.", + "EventCode": "0x86", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "FETCH_STALL.ITLB_FILL_PENDING_CYCLES", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Cycles the code-fetch stalls and an ITLB miss is outstanding." + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of issue slots per core cycle that were not consumed by the backend due to either a full resource in the backend (RESOURCE_FULL) or due to the processor recovering from some event (RECOVERY).", + "EventCode": "0xCA", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "ISSUE_SLOTS_NOT_CONSUMED.ANY", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Unfilled issue slots per cycle" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend. Including but not limited to resources such as the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable. Note that uops must be available for consumption in order for this event to fire. If a uop is not available (Instruction Queue is empty), this event will not count.", + "EventCode": "0xCA", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "ISSUE_SLOTS_NOT_CONSUMED.RESOURCE_FULL", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Unfilled issue slots per cycle because of a full resource in the backend" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of issue slots per core cycle that were not consumed by the backend because allocation is stalled waiting for a mispredicted jump to retire or other branch-like conditions (e.g. the event is relevant during certain microcode flows). Counts all issue slots blocked while within this window including slots where uops were not available in the Instruction Queue.", + "EventCode": "0xCA", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "ISSUE_SLOTS_NOT_CONSUMED.RECOVERY", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Unfilled issue slots per cycle to recover" + }, + { + "CollectPEBSRecord": "2", + "PublicDescription": "Counts hardware interrupts received by the processor.", + "EventCode": "0xCB", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "HW_INTERRUPTS.RECEIVED", + "PDIR_COUNTER": "na", + "SampleAfterValue": "203", + "BriefDescription": "Hardware interrupts received" + }, + { + "CollectPEBSRecord": "2", + "PublicDescription": "Counts the number of core cycles during which interrupts are masked (disabled). 
Increments by 1 each core cycle that EFLAGS.IF is 0, regardless of whether interrupts are pending or not.", + "EventCode": "0xCB", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "HW_INTERRUPTS.MASKED", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Cycles hardware interrupts are masked" + }, + { + "CollectPEBSRecord": "2", + "PublicDescription": "Counts core cycles during which there are pending interrupts, but interrupts are masked (EFLAGS.IF = 0).", + "EventCode": "0xCB", + "Counter": "0,1,2,3", + "UMask": "0x4", + "PEBScounters": "0,1,2,3", + "EventName": "HW_INTERRUPTS.PENDING_AND_MASKED", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Cycles pending interrupts are masked" + } +] \ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/goldmontplus/pipeline.json b/tools/perf/pmu-events/arch/x86/goldmontplus/pipeline.json new file mode 100644 index 000000000000..ccf1aed69197 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/goldmontplus/pipeline.json @@ -0,0 +1,544 @@ +[ + { + "PEBS": "2", + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. You cannot collect a PEBS record for this event.", + "EventCode": "0x00", + "Counter": "Fixed counter 0", + "UMask": "0x1", + "PEBScounters": "32", + "EventName": "INST_RETIRED.ANY", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Instructions retired (Fixed event)" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1. You cannot collect a PEBS record for this event.", + "EventCode": "0x00", + "Counter": "Fixed counter 1", + "UMask": "0x2", + "PEBScounters": "33", + "EventName": "CPU_CLK_UNHALTED.CORE", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Core cycles when core is not halted (Fixed event)" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time. This event uses fixed counter 2. You cannot collect a PEBS record for this event.", + "EventCode": "0x00", + "Counter": "Fixed counter 2", + "UMask": "0x3", + "PEBScounters": "34", + "EventName": "CPU_CLK_UNHALTED.REF_TSC", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Reference cycles when core is not halted (Fixed event)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts a load blocked from using a store forward because the store data was not available at the right time.
The forward might occur subsequently when the data is available.", + "EventCode": "0x03", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "LD_BLOCKS.DATA_UNKNOWN", + "SampleAfterValue": "200003", + "BriefDescription": "Loads blocked due to store data not ready (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts a load blocked from using a store forward because of an address/size mismatch; only one of the loads blocked from each store will be counted.", + "EventCode": "0x03", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "LD_BLOCKS.STORE_FORWARD", + "SampleAfterValue": "200003", + "BriefDescription": "Loads blocked due to store forward restriction (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts loads that block because their address modulo 4K matches a pending store.", + "EventCode": "0x03", + "Counter": "0,1,2,3", + "UMask": "0x4", + "PEBScounters": "0,1,2,3", + "EventName": "LD_BLOCKS.4K_ALIAS", + "SampleAfterValue": "200003", + "BriefDescription": "Loads blocked because address has 4k partial address false dependence (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts loads blocked because they are unable to find their physical address in the micro TLB (UTLB).", + "EventCode": "0x03", + "Counter": "0,1,2,3", + "UMask": "0x8", + "PEBScounters": "0,1,2,3", + "EventName": "LD_BLOCKS.UTLB_MISS", + "SampleAfterValue": "200003", + "BriefDescription": "Loads blocked because address is not in the UTLB (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts anytime a load that retires is blocked for any reason.", + "EventCode": "0x03", + "Counter": "0,1,2,3", + "UMask": "0x10", + "PEBScounters": "0,1,2,3", + "EventName": "LD_BLOCKS.ALL_BLOCK", + "SampleAfterValue": "200003", + "BriefDescription": "Loads blocked (Precise event capable)" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts uops issued by the front end and allocated into the back end of the machine. This event counts uops that retire as well as uops that were speculatively executed but didn't retire. The sort of speculative uops that might be counted includes, but is not limited to, those uops issued in the shadow of a mispredicted branch, those uops that are inserted during an assist (such as for a denormal floating point result), and (previously allocated) uops that might be canceled during a machine clear.", + "EventCode": "0x0E", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "UOPS_ISSUED.ANY", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Uops issued to the back end per cycle" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Core cycles when core is not halted. This event uses a (_P)rogrammable general purpose performance counter.", + "EventCode": "0x3C", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "CPU_CLK_UNHALTED.CORE_P", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Core cycles when core is not halted" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Reference cycles when core is not halted.
This event uses a (_P)rogrammable general purpose performance counter.", + "EventCode": "0x3C", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "CPU_CLK_UNHALTED.REF", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Reference cycles when core is not halted" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "This event is used to measure front-end inefficiencies, i.e. when the front-end of the machine is not delivering uops to the back-end and the back-end is not stalled. This event can be used to identify if the machine is truly front-end bound. When this event occurs, it is an indication that the front-end of the machine is operating at less than its theoretical peak performance. Background: We can think of the processor pipeline as being divided into 2 broader parts: Front-end and Back-end. Front-end is responsible for fetching the instruction, decoding into uops in machine understandable format and putting them into a uop queue to be consumed by the back end. The back-end then takes these uops, allocates the required resources. When all resources are ready, uops are executed. If the back-end is not ready to accept uops from the front-end, then we do not want to count these as front-end bottlenecks. However, whenever we have bottlenecks in the back-end, we will have allocation unit stalls, eventually forcing the front-end to wait until the back-end is ready to receive more uops. This event counts only when the back-end is requesting more uops and the front-end is not able to provide them. When 3 uops are requested and no uops are delivered, the event counts 3. When 3 are requested, and only 1 is delivered, the event counts 2. When only 2 are delivered, the event counts 1. Alternatively stated, the event will not count if 3 uops are delivered, or if the back end is stalled and not requesting any uops at all. Counts indicate missed opportunities for the front-end to deliver a uop to the back end. Some examples of conditions that cause front-end inefficiencies are: ICache misses, ITLB misses, and decoder restrictions that limit the front-end bandwidth. Known Issues: Some uops require multiple allocation slots. These uops will not be charged as a front end 'not delivered' opportunity, and will be regarded as a back end problem. For example, the INC instruction has one uop that requires 2 issue slots. A stream of INC instructions will not count as UOPS_NOT_DELIVERED, even though only one instruction can be issued per clock. The low uop issue rate for a stream of INC instructions is considered to be a back end issue.", + "EventCode": "0x9C", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "UOPS_NOT_DELIVERED.ANY", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Uops requested but not-delivered to the back-end per cycle" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The event continues counting during hardware interrupts, traps, and inside interrupt handlers. This is an architectural performance event. This event uses a (_P)rogrammable general purpose performance counter. *This event is Precise Event capable: The EventingRIP field in the PEBS record is precise to the address of the instruction which caused the event.
Note: Because PEBS records can be collected only on IA32_PMC0, only one event can use the PEBS facility at a time.", + "EventCode": "0xC0", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "INST_RETIRED.ANY_P", + "SampleAfterValue": "2000003", + "BriefDescription": "Instructions retired (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts INST_RETIRED.ANY using the Reduced Skid PEBS feature that reduces the shadow in which events aren't counted allowing for a more unbiased distribution of samples across instructions retired.", + "EventCode": "0xC0", + "Counter": "0,1,2,3", + "UMask": "0x0", + "EventName": "INST_RETIRED.PREC_DIST", + "SampleAfterValue": "2000003", + "BriefDescription": "Instructions retired - using Reduced Skid PEBS feature" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts uops which retired.", + "EventCode": "0xC2", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "UOPS_RETIRED.ANY", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Uops retired (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts uops retired that are from the complex flows issued by the micro-sequencer (MS). Counts both the uops from a micro-coded instruction, and the uops that might be generated from a micro-coded assist.", + "EventCode": "0xC2", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "UOPS_RETIRED.MS", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "MS uops retired (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of floating point divide uops retired.", + "EventCode": "0xC2", + "Counter": "0,1,2,3", + "UMask": "0x8", + "PEBScounters": "0,1,2,3", + "EventName": "UOPS_RETIRED.FPDIV", + "SampleAfterValue": "2000003", + "BriefDescription": "Floating point divide uops retired (Precise Event Capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of integer divide uops retired.", + "EventCode": "0xC2", + "Counter": "0,1,2,3", + "UMask": "0x10", + "PEBScounters": "0,1,2,3", + "EventName": "UOPS_RETIRED.IDIV", + "SampleAfterValue": "2000003", + "BriefDescription": "Integer divide uops retired (Precise Event Capable)" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts machine clears for any reason.", + "EventCode": "0xC3", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "MACHINE_CLEARS.ALL", + "PDIR_COUNTER": "na", + "SampleAfterValue": "20003", + "BriefDescription": "All machine clears" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification. Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors.", + "EventCode": "0xC3", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "MACHINE_CLEARS.SMC", + "PDIR_COUNTER": "na", + "SampleAfterValue": "20003", + "BriefDescription": "Self-Modifying Code detected" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts machine clears due to floating point (FP) operations needing assists. 
For instance, if the result was a floating point denormal, the hardware clears the pipeline and reissues uops to produce the correct IEEE compliant denormal result.", + "EventCode": "0xC3", + "Counter": "0,1,2,3", + "UMask": "0x4", + "PEBScounters": "0,1,2,3", + "EventName": "MACHINE_CLEARS.FP_ASSIST", + "PDIR_COUNTER": "na", + "SampleAfterValue": "20003", + "BriefDescription": "Machine clears due to FP assists" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts machine clears due to memory disambiguation. Memory disambiguation happens when a load which has been issued conflicts with a previous unretired store in the pipeline whose address was not known at issue time, but is later resolved to be the same as the load address.", + "EventCode": "0xC3", + "Counter": "0,1,2,3", + "UMask": "0x8", + "PEBScounters": "0,1,2,3", + "EventName": "MACHINE_CLEARS.DISAMBIGUATION", + "PDIR_COUNTER": "na", + "SampleAfterValue": "20003", + "BriefDescription": "Machine clears due to memory disambiguation" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of times that the machines clears due to a page fault. Covers both I-side and D-side(Loads/Stores) page faults. A page fault occurs when either page is not present, or an access violation", + "EventCode": "0xC3", + "Counter": "0,1,2,3", + "UMask": "0x20", + "PEBScounters": "0,1,2,3", + "EventName": "MACHINE_CLEARS.PAGE_FAULT", + "PDIR_COUNTER": "na", + "SampleAfterValue": "20003", + "BriefDescription": "Machines clear due to a page fault" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts branch instructions retired for all branch types. This is an architectural performance event.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.ALL_BRANCHES", + "SampleAfterValue": "200003", + "BriefDescription": "Retired branch instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was taken and when it was not taken.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0x7e", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.JCC", + "SampleAfterValue": "200003", + "BriefDescription": "Retired conditional branch instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts the number of taken branch instructions retired.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0x80", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.ALL_TAKEN_BRANCHES", + "SampleAfterValue": "200003", + "BriefDescription": "Retired taken branch instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts far branch instructions retired. 
This includes far jump, far call and return, and Interrupt call and return.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0xbf", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.FAR_BRANCH", + "SampleAfterValue": "200003", + "BriefDescription": "Retired far branch instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts near indirect call or near indirect jmp branch instructions retired.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0xeb", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.NON_RETURN_IND", + "SampleAfterValue": "200003", + "BriefDescription": "Retired instructions of near indirect Jmp or call (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts near return branch instructions retired.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0xf7", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.RETURN", + "SampleAfterValue": "200003", + "BriefDescription": "Retired near return instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts near CALL branch instructions retired.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0xf9", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.CALL", + "SampleAfterValue": "200003", + "BriefDescription": "Retired near call instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts near indirect CALL branch instructions retired.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0xfb", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.IND_CALL", + "SampleAfterValue": "200003", + "BriefDescription": "Retired near indirect call instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts near relative CALL branch instructions retired.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0xfd", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.REL_CALL", + "SampleAfterValue": "200003", + "BriefDescription": "Retired near relative call instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were taken and does not count when the Jcc branch instruction were not taken.", + "EventCode": "0xC4", + "Counter": "0,1,2,3", + "UMask": "0xfe", + "PEBScounters": "0,1,2,3", + "EventName": "BR_INST_RETIRED.TAKEN_JCC", + "SampleAfterValue": "200003", + "BriefDescription": "Retired conditional branch instructions that were taken (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts mispredicted branch instructions retired including all branch types.", + "EventCode": "0xC5", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", + "SampleAfterValue": "200003", + "BriefDescription": "Retired mispredicted branch instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired, including both when the branch was supposed to be taken and when it was not supposed to be taken (but the processor predicted the opposite 
condition).", + "EventCode": "0xC5", + "Counter": "0,1,2,3", + "UMask": "0x7e", + "PEBScounters": "0,1,2,3", + "EventName": "BR_MISP_RETIRED.JCC", + "SampleAfterValue": "200003", + "BriefDescription": "Retired mispredicted conditional branch instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts mispredicted branch instructions retired that were near indirect call or near indirect jmp, where the target address taken was not what the processor predicted.", + "EventCode": "0xC5", + "Counter": "0,1,2,3", + "UMask": "0xeb", + "PEBScounters": "0,1,2,3", + "EventName": "BR_MISP_RETIRED.NON_RETURN_IND", + "SampleAfterValue": "200003", + "BriefDescription": "Retired mispredicted instructions of near indirect Jmp or near indirect call (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts mispredicted near RET branch instructions retired, where the return address taken was not what the processor predicted.", + "EventCode": "0xC5", + "Counter": "0,1,2,3", + "UMask": "0xf7", + "PEBScounters": "0,1,2,3", + "EventName": "BR_MISP_RETIRED.RETURN", + "SampleAfterValue": "200003", + "BriefDescription": "Retired mispredicted near return instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts mispredicted near indirect CALL branch instructions retired, where the target address taken was not what the processor predicted.", + "EventCode": "0xC5", + "Counter": "0,1,2,3", + "UMask": "0xfb", + "PEBScounters": "0,1,2,3", + "EventName": "BR_MISP_RETIRED.IND_CALL", + "SampleAfterValue": "200003", + "BriefDescription": "Retired mispredicted near indirect call instructions (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts mispredicted retired Jcc (Jump on Conditional Code/Jump if Condition is Met) branch instructions retired that were supposed to be taken but the processor predicted that it would not be taken.", + "EventCode": "0xC5", + "Counter": "0,1,2,3", + "UMask": "0xfe", + "PEBScounters": "0,1,2,3", + "EventName": "BR_MISP_RETIRED.TAKEN_JCC", + "SampleAfterValue": "200003", + "BriefDescription": "Retired mispredicted conditional branch instructions that were taken (Precise event capable)" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts core cycles if either divide unit is busy.", + "EventCode": "0xCD", + "Counter": "0,1,2,3", + "UMask": "0x0", + "PEBScounters": "0,1,2,3", + "EventName": "CYCLES_DIV_BUSY.ALL", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Cycles a divider is busy" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts core cycles the integer divide unit is busy.", + "EventCode": "0xCD", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "CYCLES_DIV_BUSY.IDIV", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Cycles the integer divide unit is busy" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts core cycles the floating point divide unit is busy.", + "EventCode": "0xCD", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "CYCLES_DIV_BUSY.FPDIV", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Cycles the FP divide unit is busy" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of times a BACLEAR is signaled for any 
reason, including, but not limited to indirect branch/call, Jcc (Jump on Conditional Code/Jump if Condition is Met) branch, unconditional branch/call, and returns.", + "EventCode": "0xE6", + "Counter": "0,1,2,3", + "UMask": "0x1", + "PEBScounters": "0,1,2,3", + "EventName": "BACLEARS.ALL", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "BACLEARs asserted for any branch type" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts BACLEARS on return instructions.", + "EventCode": "0xE6", + "Counter": "0,1,2,3", + "UMask": "0x8", + "PEBScounters": "0,1,2,3", + "EventName": "BACLEARS.RETURN", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "BACLEARs asserted for return branch" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts BACLEARS on Jcc (Jump on Conditional Code/Jump if Condition is Met) branches.", + "EventCode": "0xE6", + "Counter": "0,1,2,3", + "UMask": "0x10", + "PEBScounters": "0,1,2,3", + "EventName": "BACLEARS.COND", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "BACLEARs asserted for conditional branch" + } +] \ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/goldmontplus/virtual-memory.json b/tools/perf/pmu-events/arch/x86/goldmontplus/virtual-memory.json new file mode 100644 index 000000000000..0b53a3b0dfb8 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/goldmontplus/virtual-memory.json @@ -0,0 +1,218 @@ +[ + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to demand data loads (including SW prefetches) whose address translations missed in all TLB levels and were mapped to 4K pages. The page walks can end with or without a page fault.", + "EventCode": "0x08", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Page walk completed due to a demand load to a 4K page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to demand data loads (including SW prefetches) whose address translations missed in all TLB levels and were mapped to 2M or 4M pages. The page walks can end with or without a page fault.", + "EventCode": "0x08", + "Counter": "0,1,2,3", + "UMask": "0x4", + "PEBScounters": "0,1,2,3", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Page walk completed due to a demand load to a 2M or 4M page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to demand data loads (including SW prefetches) whose address translations missed in all TLB levels and were mapped to 1GB pages. The page walks can end with or without a page fault.", + "EventCode": "0x08", + "Counter": "0,1,2,3", + "UMask": "0x8", + "PEBScounters": "0,1,2,3", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1GB", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Page walk completed due to a demand load to a 1GB page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts once per cycle for each page walk occurring due to a load (demand data loads or SW prefetches). Includes cycles spent traversing the Extended Page Table (EPT). 
Average cycles per walk can be calculated by dividing by the number of walks.", + "EventCode": "0x08", + "Counter": "0,1,2,3", + "UMask": "0x10", + "PEBScounters": "0,1,2,3", + "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Page walks outstanding due to a demand load every cycle." + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to demand data stores whose address translations missed in the TLB and were mapped to 4K pages. The page walks can end with or without a page fault.", + "EventCode": "0x49", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Page walk completed due to a demand data store to a 4K page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to demand data stores whose address translations missed in the TLB and were mapped to 2M or 4M pages. The page walks can end with or without a page fault.", + "EventCode": "0x49", + "Counter": "0,1,2,3", + "UMask": "0x4", + "PEBScounters": "0,1,2,3", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Page walk completed due to a demand data store to a 2M or 4M page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to demand data stores whose address translations missed in the TLB and were mapped to 1GB pages. The page walks can end with or without a page fault.", + "EventCode": "0x49", + "Counter": "0,1,2,3", + "UMask": "0x8", + "PEBScounters": "0,1,2,3", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1GB", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Page walk completed due to a demand data store to a 1GB page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts once per cycle for each page walk occurring due to a demand data store. Includes cycles spent traversing the Extended Page Table (EPT). Average cycles per walk can be calculated by dividing by the number of walks.", + "EventCode": "0x49", + "Counter": "0,1,2,3", + "UMask": "0x10", + "PEBScounters": "0,1,2,3", + "EventName": "DTLB_STORE_MISSES.WALK_PENDING", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Page walks outstanding due to a demand data store every cycle." + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts once per cycle for each page walk only while traversing the Extended Page Table (EPT), and does not count during the rest of the translation. The EPT is used for translating Guest-Physical Addresses to Physical Addresses for Virtual Machine Monitors (VMMs). Average cycles per walk can be calculated by dividing the count by number of walks.", + "EventCode": "0x4F", + "Counter": "0,1,2,3", + "UMask": "0x10", + "PEBScounters": "0,1,2,3", + "EventName": "EPT.WALK_PENDING", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Page walks outstanding due to walking the EPT every cycle" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts the number of times the machine was unable to find a translation in the Instruction Translation Lookaside Buffer (ITLB) for a linear address of an instruction fetch. It counts when new translation are filled into the ITLB. 
The event is speculative in nature, but will not count translations (page walks) that are begun and not finished, or translations that are finished but not filled into the ITLB.", + "EventCode": "0x81", + "Counter": "0,1,2,3", + "UMask": "0x4", + "PEBScounters": "0,1,2,3", + "EventName": "ITLB.MISS", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "ITLB misses" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to instruction fetches whose address translations missed in the TLB and were mapped to 4K pages. The page walks can end with or without a page fault.", + "EventCode": "0x85", + "Counter": "0,1,2,3", + "UMask": "0x2", + "PEBScounters": "0,1,2,3", + "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Page walk completed due to an instruction fetch in a 4K page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to instruction fetches whose address translations missed in the TLB and were mapped to 2M or 4M pages. The page walks can end with or without a page fault.", + "EventCode": "0x85", + "Counter": "0,1,2,3", + "UMask": "0x4", + "PEBScounters": "0,1,2,3", + "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Page walk completed due to an instruction fetch in a 2M or 4M page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts page walks completed due to instruction fetches whose address translations missed in the TLB and were mapped to 1GB pages. The page walks can end with or without a page fault.", + "EventCode": "0x85", + "Counter": "0,1,2,3", + "UMask": "0x8", + "PEBScounters": "0,1,2,3", + "EventName": "ITLB_MISSES.WALK_COMPLETED_1GB", + "PDIR_COUNTER": "na", + "SampleAfterValue": "2000003", + "BriefDescription": "Page walk completed due to an instruction fetch in a 1GB page" + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts once per cycle for each page walk occurring due to an instruction fetch. Includes cycles spent traversing the Extended Page Table (EPT). Average cycles per walk can be calculated by dividing by the number of walks.", + "EventCode": "0x85", + "Counter": "0,1,2,3", + "UMask": "0x10", + "PEBScounters": "0,1,2,3", + "EventName": "ITLB_MISSES.WALK_PENDING", + "PDIR_COUNTER": "na", + "SampleAfterValue": "200003", + "BriefDescription": "Page walks outstanding due to an instruction fetch every cycle." + }, + { + "CollectPEBSRecord": "1", + "PublicDescription": "Counts STLB flushes. 
The TLBs are flushed on instructions like INVLPG and MOV to CR3.", + "EventCode": "0xBD", + "Counter": "0,1,2,3", + "UMask": "0x20", + "PEBScounters": "0,1,2,3", + "EventName": "TLB_FLUSHES.STLB_ANY", + "PDIR_COUNTER": "na", + "SampleAfterValue": "20003", + "BriefDescription": "STLB flushes" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts load uops retired that caused a DTLB miss.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x11", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.DTLB_MISS_LOADS", + "SampleAfterValue": "200003", + "BriefDescription": "Load uops retired that missed the DTLB (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts store uops retired that caused a DTLB miss.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x12", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.DTLB_MISS_STORES", + "SampleAfterValue": "200003", + "BriefDescription": "Store uops retired that missed the DTLB (Precise event capable)" + }, + { + "PEBS": "2", + "CollectPEBSRecord": "2", + "PublicDescription": "Counts uops retired that had a DTLB miss on load, store or either. Note that when two distinct memory operations to the same page miss the DTLB, only one of them will be recorded as a DTLB miss.", + "EventCode": "0xD0", + "Counter": "0,1,2,3", + "UMask": "0x13", + "PEBScounters": "0,1,2,3", + "EventName": "MEM_UOPS_RETIRED.DTLB_MISS", + "SampleAfterValue": "200003", + "BriefDescription": "Memory uops retired that missed the DTLB (Precise event capable)" + } +] \ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 4ea068366c3e..fe1a2c47cabf 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -9,6 +9,7 @@ GenuineIntel-6-27,v4,bonnell,core GenuineIntel-6-36,v4,bonnell,core GenuineIntel-6-35,v4,bonnell,core GenuineIntel-6-5C,v8,goldmont,core +GenuineIntel-6-7A,v1,goldmontplus,core GenuineIntel-6-3C,v24,haswell,core GenuineIntel-6-45,v24,haswell,core GenuineIntel-6-46,v24,haswell,core From b3270a5210229ee543339d34b74ba527f978c55b Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Mon, 23 Oct 2017 14:18:42 -0500 Subject: [PATCH 0545/1085] x86/platform/UV: Mark tsc_check_sync as an init function Fix build problem: >> WARNING: vmlinux.o(.text+0x4223a): Section mismatch in reference from the function uv_tsc_check_sync() to the function .init.text:uv_early_read_mmr() The function uv_tsc_check_sync() references the function __init uv_early_read_mmr(). 
This is often because uv_tsc_check_sync lacks a __init annotation or the annotation of uv_early_read_mmr is wrong. Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Dimitri Sivanich Cc: Russ Anderson Cc: Andrew Banman Cc: Peter Zijlstra Cc: Bin Gao Link: https://lkml.kernel.org/r/20171023191841.985614692@stormcage.americas.sgi.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 440825478bbd..5eda48a72617 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -154,7 +154,7 @@ static int __init early_get_pnodeid(void) return pnode; } -static void uv_tsc_check_sync(void) +static void __init uv_tsc_check_sync(void) { u64 mmr; int sync_state; From c6cd924efe941ef62eb805c59e4a09e219ac5c6d Mon Sep 17 00:00:00 2001 From: Yanjiang Jin Date: Tue, 24 Oct 2017 14:23:41 +0800 Subject: [PATCH 0546/1085] cpu/hotplug: Remove obsolete notifier macros commit 530e9b76ae8f ("cpu/hotplug: Remove obsolete cpu hotplug register/unregister functions") removed the below macros: - #define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ - #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ - #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ But "CPU_UP_CANCELED_FROZEN, CPU_DOWN_PREPARE_FROZEN and CPU_DOWN_FAILED_FROZEN" still refer to them, and nobody uses these "FROZEN" macros now, so remove them too. Signed-off-by: Yanjiang Jin Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: bigeasy@linutronix.de Cc: jinyanjiang@gmail.com Link: https://lkml.kernel.org/r/20171024062341.179678-1-yanjiang.jin@windriver.com --- include/linux/cpu.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ca73bc1563f4..cd4771b772c0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -72,9 +72,6 @@ struct notifier_block; #define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN) #define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN) -#define CPU_UP_CANCELED_FROZEN (CPU_UP_CANCELED | CPU_TASKS_FROZEN) -#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN) -#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) #ifdef CONFIG_SMP From e22cdc3fc5991956146b9856d36b4971fe54dcd6 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 23 Oct 2017 19:01:54 +0600 Subject: [PATCH 0547/1085] sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK cpulist_parse() uses nr_cpumask_bits as a limit to parse the passed buffer from the kernel command line. What nr_cpumask_bits represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option: - If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as NR_CPUS, which might not represent the # of CPUs that really exist (default 64). So, there's a chance of a gap between nr_cpu_ids and NR_CPUS, which ultimately leads to an invalid cpulist_parse() operation. For example, if isolcpus=9 is passed on an 8 CPU system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error that it's supposed to. This patch fixes this bug by finding the last CPU of the passed isolcpus= list and checking it against nr_cpu_ids. It also fixes the error message, which should print nr_cpu_ids-1 rather than nr_cpu_ids, since CPU numbering starts from 0.
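To make the failure mode and the new check concrete, here is a minimal user-space sketch of the validation added below (an illustration only: NR_CPUS and NR_CPU_IDS are hard-coded to assume an 8 CPU system built with NR_CPUS=64 and CONFIG_CPUMASK_OFFSTACK=n, and the hand-rolled mask_last() merely stands in for the kernel's cpumask_last()):

	#include <stdio.h>

	#define NR_CPUS    64   /* nr_cpumask_bits when CONFIG_CPUMASK_OFFSTACK=n */
	#define NR_CPU_IDS 8u   /* assumed nr_cpu_ids: CPUs that actually exist */

	/* Stand-in for cpumask_last(): index of the highest set bit. */
	static unsigned int mask_last(unsigned long mask)
	{
		unsigned int bit = NR_CPUS - 1;

		while (bit && !(mask & (1UL << bit)))
			bit--;
		return bit;
	}

	int main(void)
	{
		/* "isolcpus=9" parses without error because bit 9 < NR_CPUS... */
		unsigned long isolated = 1UL << 9;

		/* ...so only the added range check against nr_cpu_ids catches it. */
		if (mask_last(isolated) >= NR_CPU_IDS)
			printf("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n",
			       NR_CPU_IDS - 1);

		return 0;
	}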
Signed-off-by: Rakib Mullick Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adobriyan@gmail.com Cc: akpm@linux-foundation.org Cc: longman@redhat.com Cc: mka@chromium.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20171023130154.9050-1-rakib.mullick@gmail.com [ Enhanced the changelog and the kernel message. ] Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 16 ++++++++++++++++ kernel/sched/topology.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index cd415b733c2a..63661de67ad4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -130,6 +130,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_last(const struct cpumask *srcp) +{ + return 0; +} + /* Valid inputs for n are -1 and 0. */ static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { @@ -178,6 +183,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_last - get the last CPU in a cpumask + * @srcp: - the cpumask pointer + * + * Returns >= nr_cpumask_bits if no CPUs set. + */ +static inline unsigned int cpumask_last(const struct cpumask *srcp) +{ + return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + unsigned int cpumask_next(int n, const struct cpumask *srcp); /** diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e50450c2fed8..e3d31b0880dc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -476,8 +476,8 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); + if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1); return 0; } return 1; From d15155824c5014803d91b829736d249c500bdda6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:46 +0100 Subject: [PATCH 0548/1085] linux/compiler.h: Split into compiler.h and compiler_types.h linux/compiler.h is included indirectly by linux/types.h via uapi/linux/types.h -> uapi/linux/posix_types.h -> linux/stddef.h -> uapi/linux/stddef.h and is needed to provide a proper definition of offsetof. Unfortunately, compiler.h requires a definition of smp_read_barrier_depends() for defining lockless_dereference() and soon for defining READ_ONCE(), which means that all users of READ_ONCE() will need to include asm/barrier.h to avoid splats such as: In file included from include/uapi/linux/stddef.h:1:0, from include/linux/stddef.h:4, from arch/h8300/kernel/asm-offsets.c:11: include/linux/list.h: In function 'list_empty': >> include/linux/compiler.h:343:2: error: implicit declaration of function 'smp_read_barrier_depends' [-Werror=implicit-function-declaration] smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ ^ A better alternative is to include asm/barrier.h in linux/compiler.h, but this requires a type definition for "bool" on some architectures (e.g. x86), which is defined later by linux/types.h. Type "bool" is also used directly in linux/compiler.h, so the whole thing is pretty fragile.
This patch splits compiler.h in two: compiler_types.h contains type annotations, definitions and the compiler-specific parts, whereas compiler.h #includes compiler_types.h and additionally defines macros such as {READ,WRITE,ACCESS}_ONCE(). uapi/linux/stddef.h and linux/linkage.h are then moved over to include linux/compiler_types.h, which fixes the build for h8300 and blackfin. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/arm/include/asm/ptrace.h | 3 +- arch/sparc/include/asm/ptrace.h | 1 + arch/um/include/shared/init.h | 2 +- include/linux/compiler-clang.h | 2 +- include/linux/compiler-gcc.h | 2 +- include/linux/compiler-intel.h | 2 +- include/linux/compiler.h | 265 +----------------------------- include/linux/compiler_types.h | 274 ++++++++++++++++++++++++++++++++ include/linux/linkage.h | 2 +- include/uapi/linux/stddef.h | 2 +- scripts/headers_install.sh | 2 +- 11 files changed, 286 insertions(+), 271 deletions(-) create mode 100644 include/linux/compiler_types.h diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index e9c9a117bd25..c7cdbb43ae7c 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h @@ -126,8 +126,7 @@ extern unsigned long profile_pc(struct pt_regs *regs); /* * kprobe-based event tracer support */ -#include -#include +#include #define MAX_REG_OFFSET (offsetof(struct pt_regs, ARM_ORIG_r0)) extern int regs_query_register_offset(const char *name); diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index d73428e4333c..b383484edcd3 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -6,6 +6,7 @@ #if defined(__sparc__) && defined(__arch64__) #ifndef __ASSEMBLY__ +#include #include #include diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h index 233e2593eee0..094e96ce653b 100644 --- a/arch/um/include/shared/init.h +++ b/arch/um/include/shared/init.h @@ -40,7 +40,7 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); -#include +#include /* These are for everybody (although not all archs will actually discard it in modules) */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index de179993e039..5947a3e6c0e6 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -1,4 +1,4 @@ -#ifndef __LINUX_COMPILER_H +#ifndef __LINUX_COMPILER_TYPES_H #error "Please don't include directly, include instead." #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 16d41de92ee3..ce8e965646ef 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -1,4 +1,4 @@ -#ifndef __LINUX_COMPILER_H +#ifndef __LINUX_COMPILER_TYPES_H #error "Please don't include directly, include instead." #endif diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h index d4c71132d07f..e438ac89c692 100644 --- a/include/linux/compiler-intel.h +++ b/include/linux/compiler-intel.h @@ -1,4 +1,4 @@ -#ifndef __LINUX_COMPILER_H +#ifndef __LINUX_COMPILER_TYPES_H #error "Please don't include directly, include instead."
#endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index e95a2631e545..08083186e54f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -1,111 +1,12 @@ #ifndef __LINUX_COMPILER_H #define __LINUX_COMPILER_H +#include + #ifndef __ASSEMBLY__ -#ifdef __CHECKER__ -# define __user __attribute__((noderef, address_space(1))) -# define __kernel __attribute__((address_space(0))) -# define __safe __attribute__((safe)) -# define __force __attribute__((force)) -# define __nocast __attribute__((nocast)) -# define __iomem __attribute__((noderef, address_space(2))) -# define __must_hold(x) __attribute__((context(x,1,1))) -# define __acquires(x) __attribute__((context(x,0,1))) -# define __releases(x) __attribute__((context(x,1,0))) -# define __acquire(x) __context__(x,1) -# define __release(x) __context__(x,-1) -# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) -# define __percpu __attribute__((noderef, address_space(3))) -# define __rcu __attribute__((noderef, address_space(4))) -# define __private __attribute__((noderef)) -extern void __chk_user_ptr(const volatile void __user *); -extern void __chk_io_ptr(const volatile void __iomem *); -# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) -#else /* __CHECKER__ */ -# ifdef STRUCTLEAK_PLUGIN -# define __user __attribute__((user)) -# else -# define __user -# endif -# define __kernel -# define __safe -# define __force -# define __nocast -# define __iomem -# define __chk_user_ptr(x) (void)0 -# define __chk_io_ptr(x) (void)0 -# define __builtin_warning(x, y...) (1) -# define __must_hold(x) -# define __acquires(x) -# define __releases(x) -# define __acquire(x) (void)0 -# define __release(x) (void)0 -# define __cond_lock(x,c) (c) -# define __percpu -# define __rcu -# define __private -# define ACCESS_PRIVATE(p, member) ((p)->member) -#endif /* __CHECKER__ */ - -/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ -#define ___PASTE(a,b) a##b -#define __PASTE(a,b) ___PASTE(a,b) - #ifdef __KERNEL__ -#ifdef __GNUC__ -#include -#endif - -#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) -#define notrace __attribute__((hotpatch(0,0))) -#else -#define notrace __attribute__((no_instrument_function)) -#endif - -/* Intel compiler defines __GNUC__. So we will overwrite implementations - * coming from above header files here - */ -#ifdef __INTEL_COMPILER -# include -#endif - -/* Clang compiler defines __GNUC__. So we will overwrite implementations - * coming from above header files here - */ -#ifdef __clang__ -#include -#endif - -/* - * Generic compiler-dependent macros required for kernel - * build go below this comment. Actual compiler/compiler version - * specific implementations come from the above header files - */ - -struct ftrace_branch_data { - const char *func; - const char *file; - unsigned line; - union { - struct { - unsigned long correct; - unsigned long incorrect; - }; - struct { - unsigned long miss; - unsigned long hit; - }; - unsigned long miss_hit[2]; - }; -}; - -struct ftrace_likely_data { - struct ftrace_branch_data data; - unsigned long constant; -}; - /* * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code * to disable branch tracing on a per file basis. @@ -332,6 +233,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * with an explicit memory barrier or atomic instruction that provides the * required ordering. 
*/ +#include #define __READ_ONCE(x, check) \ ({ \ @@ -362,167 +264,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #endif /* __ASSEMBLY__ */ -#ifdef __KERNEL__ -/* - * Allow us to mark functions as 'deprecated' and have gcc emit a nice - * warning for each use, in hopes of speeding the functions removal. - * Usage is: - * int __deprecated foo(void) - */ -#ifndef __deprecated -# define __deprecated /* unimplemented */ -#endif - -#ifdef MODULE -#define __deprecated_for_modules __deprecated -#else -#define __deprecated_for_modules -#endif - -#ifndef __must_check -#define __must_check -#endif - -#ifndef CONFIG_ENABLE_MUST_CHECK -#undef __must_check -#define __must_check -#endif -#ifndef CONFIG_ENABLE_WARN_DEPRECATED -#undef __deprecated -#undef __deprecated_for_modules -#define __deprecated -#define __deprecated_for_modules -#endif - -#ifndef __malloc -#define __malloc -#endif - -/* - * Allow us to avoid 'defined but not used' warnings on functions and data, - * as well as force them to be emitted to the assembly file. - * - * As of gcc 3.4, static functions that are not marked with attribute((used)) - * may be elided from the assembly file. As of gcc 3.4, static data not so - * marked will not be elided, but this may change in a future gcc version. - * - * NOTE: Because distributions shipped with a backported unit-at-a-time - * compiler in gcc 3.3, we must define __used to be __attribute__((used)) - * for gcc >=3.3 instead of 3.4. - * - * In prior versions of gcc, such functions and data would be emitted, but - * would be warned about except with attribute((unused)). - * - * Mark functions that are referenced only in inline assembly as __used so - * the code is emitted even though it appears to be unreferenced. - */ -#ifndef __used -# define __used /* unimplemented */ -#endif - -#ifndef __maybe_unused -# define __maybe_unused /* unimplemented */ -#endif - -#ifndef __always_unused -# define __always_unused /* unimplemented */ -#endif - -#ifndef noinline -#define noinline -#endif - -/* - * Rather then using noinline to prevent stack consumption, use - * noinline_for_stack instead. For documentation reasons. - */ -#define noinline_for_stack noinline - -#ifndef __always_inline -#define __always_inline inline -#endif - -#endif /* __KERNEL__ */ - -/* - * From the GCC manual: - * - * Many functions do not examine any values except their arguments, - * and have no effects except the return value. Basically this is - * just slightly more strict class than the `pure' attribute above, - * since function is not allowed to read global memory. - * - * Note that a function that has pointer arguments and examines the - * data pointed to must _not_ be declared `const'. Likewise, a - * function that calls a non-`const' function usually must not be - * `const'. It does not make sense for a `const' function to return - * `void'. - */ -#ifndef __attribute_const__ -# define __attribute_const__ /* unimplemented */ -#endif - -#ifndef __designated_init -# define __designated_init -#endif - -#ifndef __latent_entropy -# define __latent_entropy -#endif - -#ifndef __randomize_layout -# define __randomize_layout __designated_init -#endif - -#ifndef __no_randomize_layout -# define __no_randomize_layout -#endif - -#ifndef randomized_struct_fields_start -# define randomized_struct_fields_start -# define randomized_struct_fields_end -#endif - -/* - * Tell gcc if a function is cold. The compiler will assume any path - * directly leading to the call is unlikely. 
- */ - -#ifndef __cold -#define __cold -#endif - -/* Simple shorthand for a section definition */ -#ifndef __section -# define __section(S) __attribute__ ((__section__(#S))) -#endif - -#ifndef __visible -#define __visible -#endif - -#ifndef __nostackprotector -# define __nostackprotector -#endif - -/* - * Assume alignment of return value. - */ -#ifndef __assume_aligned -#define __assume_aligned(a, ...) -#endif - - -/* Are two types/vars the same type (ignoring qualifiers)? */ -#ifndef __same_type -# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -#endif - -/* Is this type a native word size -- useful for atomic operations */ -#ifndef __native_word -# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) -#endif - /* Compile time object size, -1 for unknown */ #ifndef __compiletime_object_size # define __compiletime_object_size(obj) -1 diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h new file mode 100644 index 000000000000..6b79a9bba9a7 --- /dev/null +++ b/include/linux/compiler_types.h @@ -0,0 +1,274 @@ +#ifndef __LINUX_COMPILER_TYPES_H +#define __LINUX_COMPILER_TYPES_H + +#ifndef __ASSEMBLY__ + +#ifdef __CHECKER__ +# define __user __attribute__((noderef, address_space(1))) +# define __kernel __attribute__((address_space(0))) +# define __safe __attribute__((safe)) +# define __force __attribute__((force)) +# define __nocast __attribute__((nocast)) +# define __iomem __attribute__((noderef, address_space(2))) +# define __must_hold(x) __attribute__((context(x,1,1))) +# define __acquires(x) __attribute__((context(x,0,1))) +# define __releases(x) __attribute__((context(x,1,0))) +# define __acquire(x) __context__(x,1) +# define __release(x) __context__(x,-1) +# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) +# define __percpu __attribute__((noderef, address_space(3))) +# define __rcu __attribute__((noderef, address_space(4))) +# define __private __attribute__((noderef)) +extern void __chk_user_ptr(const volatile void __user *); +extern void __chk_io_ptr(const volatile void __iomem *); +# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) +#else /* __CHECKER__ */ +# ifdef STRUCTLEAK_PLUGIN +# define __user __attribute__((user)) +# else +# define __user +# endif +# define __kernel +# define __safe +# define __force +# define __nocast +# define __iomem +# define __chk_user_ptr(x) (void)0 +# define __chk_io_ptr(x) (void)0 +# define __builtin_warning(x, y...) (1) +# define __must_hold(x) +# define __acquires(x) +# define __releases(x) +# define __acquire(x) (void)0 +# define __release(x) (void)0 +# define __cond_lock(x,c) (c) +# define __percpu +# define __rcu +# define __private +# define ACCESS_PRIVATE(p, member) ((p)->member) +#endif /* __CHECKER__ */ + +/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +#define ___PASTE(a,b) a##b +#define __PASTE(a,b) ___PASTE(a,b) + +#ifdef __KERNEL__ + +#ifdef __GNUC__ +#include +#endif + +#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) +#define notrace __attribute__((hotpatch(0,0))) +#else +#define notrace __attribute__((no_instrument_function)) +#endif + +/* Intel compiler defines __GNUC__. So we will overwrite implementations + * coming from above header files here + */ +#ifdef __INTEL_COMPILER +# include +#endif + +/* Clang compiler defines __GNUC__. 
So we will overwrite implementations + * coming from above header files here + */ +#ifdef __clang__ +#include +#endif + +/* + * Generic compiler-dependent macros required for kernel + * build go below this comment. Actual compiler/compiler version + * specific implementations come from the above header files + */ + +struct ftrace_branch_data { + const char *func; + const char *file; + unsigned line; + union { + struct { + unsigned long correct; + unsigned long incorrect; + }; + struct { + unsigned long miss; + unsigned long hit; + }; + unsigned long miss_hit[2]; + }; +}; + +struct ftrace_likely_data { + struct ftrace_branch_data data; + unsigned long constant; +}; + +#endif /* __KERNEL__ */ + +#endif /* __ASSEMBLY__ */ + +#ifdef __KERNEL__ +/* + * Allow us to mark functions as 'deprecated' and have gcc emit a nice + * warning for each use, in hopes of speeding the functions removal. + * Usage is: + * int __deprecated foo(void) + */ +#ifndef __deprecated +# define __deprecated /* unimplemented */ +#endif + +#ifdef MODULE +#define __deprecated_for_modules __deprecated +#else +#define __deprecated_for_modules +#endif + +#ifndef __must_check +#define __must_check +#endif + +#ifndef CONFIG_ENABLE_MUST_CHECK +#undef __must_check +#define __must_check +#endif +#ifndef CONFIG_ENABLE_WARN_DEPRECATED +#undef __deprecated +#undef __deprecated_for_modules +#define __deprecated +#define __deprecated_for_modules +#endif + +#ifndef __malloc +#define __malloc +#endif + +/* + * Allow us to avoid 'defined but not used' warnings on functions and data, + * as well as force them to be emitted to the assembly file. + * + * As of gcc 3.4, static functions that are not marked with attribute((used)) + * may be elided from the assembly file. As of gcc 3.4, static data not so + * marked will not be elided, but this may change in a future gcc version. + * + * NOTE: Because distributions shipped with a backported unit-at-a-time + * compiler in gcc 3.3, we must define __used to be __attribute__((used)) + * for gcc >=3.3 instead of 3.4. + * + * In prior versions of gcc, such functions and data would be emitted, but + * would be warned about except with attribute((unused)). + * + * Mark functions that are referenced only in inline assembly as __used so + * the code is emitted even though it appears to be unreferenced. + */ +#ifndef __used +# define __used /* unimplemented */ +#endif + +#ifndef __maybe_unused +# define __maybe_unused /* unimplemented */ +#endif + +#ifndef __always_unused +# define __always_unused /* unimplemented */ +#endif + +#ifndef noinline +#define noinline +#endif + +/* + * Rather then using noinline to prevent stack consumption, use + * noinline_for_stack instead. For documentation reasons. + */ +#define noinline_for_stack noinline + +#ifndef __always_inline +#define __always_inline inline +#endif + +#endif /* __KERNEL__ */ + +/* + * From the GCC manual: + * + * Many functions do not examine any values except their arguments, + * and have no effects except the return value. Basically this is + * just slightly more strict class than the `pure' attribute above, + * since function is not allowed to read global memory. + * + * Note that a function that has pointer arguments and examines the + * data pointed to must _not_ be declared `const'. Likewise, a + * function that calls a non-`const' function usually must not be + * `const'. It does not make sense for a `const' function to return + * `void'. 
+ */ +#ifndef __attribute_const__ +# define __attribute_const__ /* unimplemented */ +#endif + +#ifndef __designated_init +# define __designated_init +#endif + +#ifndef __latent_entropy +# define __latent_entropy +#endif + +#ifndef __randomize_layout +# define __randomize_layout __designated_init +#endif + +#ifndef __no_randomize_layout +# define __no_randomize_layout +#endif + +#ifndef randomized_struct_fields_start +# define randomized_struct_fields_start +# define randomized_struct_fields_end +#endif + +/* + * Tell gcc if a function is cold. The compiler will assume any path + * directly leading to the call is unlikely. + */ + +#ifndef __cold +#define __cold +#endif + +/* Simple shorthand for a section definition */ +#ifndef __section +# define __section(S) __attribute__ ((__section__(#S))) +#endif + +#ifndef __visible +#define __visible +#endif + +#ifndef __nostackprotector +# define __nostackprotector +#endif + +/* + * Assume alignment of return value. + */ +#ifndef __assume_aligned +#define __assume_aligned(a, ...) +#endif + + +/* Are two types/vars the same type (ignoring qualifiers)? */ +#ifndef __same_type +# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +#endif + +/* Is this type a native word size -- useful for atomic operations */ +#ifndef __native_word +# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +#endif + +#endif /* __LINUX_COMPILER_TYPES_H */ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index a6a42dd02466..ebd61b80fed4 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -1,7 +1,7 @@ #ifndef _LINUX_LINKAGE_H #define _LINUX_LINKAGE_H -#include +#include #include #include #include diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 621fa8ac4425..d1f7cb732dfc 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -1,4 +1,4 @@ -#include +#include #ifndef __always_inline #define __always_inline inline diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index fdebd66f8fc1..63b8cc26456a 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -33,7 +33,7 @@ do sed -r \ -e 's/([ \t(])(__user|__force|__iomem)[ \t]/\1/g' \ -e 's/__attribute_const__([ \t]|$)/\1/g' \ - -e 's@^#include @@' \ + -e 's@^#include @@' \ -e 's/(^|[^a-zA-Z0-9])__packed([^a-zA-Z0-9_]|$)/\1__attribute__((packed))\2/g' \ -e 's/(^|[ \t(])(inline|asm|volatile)([ \t(]|$)/\1__\2__\3/g' \ -e 's@#(ifndef|define|endif[ \t]*/[*])[ \t]*_UAPI@#\1 @' \ From 76ebbe78f7390aee075a7f3768af197ded1bdfbb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:47 +0100 Subject: [PATCH 0549/1085] locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE() In preparation for the removal of lockless_dereference(), which is the same as READ_ONCE() on all architectures other than Alpha, add an implicit smp_read_barrier_depends() to READ_ONCE() so that it can be used to head dependency chains on all architectures. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 08083186e54f..7d7b77da9716 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -242,6 +242,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __read_once_size(&(x), __u.__c, sizeof(x)); \ else \ __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ + smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ __u.__val; \ }) #define READ_ONCE(x) __READ_ONCE(x, 1) From 506458efaf153c1ea480591c5602a5a3ba5a3b76 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: [PATCH 0550/1085] locking/barriers: Convert users of lockless_dereference() to READ_ONCE() READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/mmu_context.h | 4 ++-- arch/x86/kernel/ldt.c | 2 +- drivers/md/dm-mpath.c | 20 ++++++++++---------- fs/dcache.c | 4 ++-- fs/overlayfs/ovl_entry.h | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/rculist.h | 4 ++-- include/linux/rcupdate.h | 4 ++-- kernel/events/core.c | 4 ++-- kernel/seccomp.c | 2 +- kernel/task_work.c | 2 +- mm/slab.h | 2 +- 13 files changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 80534d3c2480..589af1eec7c1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(unsigned int segment) struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); + ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 3c856a15b98e..efc530642f7d 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -72,8 +72,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - /* lockless_dereference synchronizes with smp_store_release */ - ldt = lockless_dereference(mm->context.ldt); + /* READ_ONCE synchronizes with smp_store_release */ + ldt = READ_ONCE(mm->context.ldt); /* * Any change to mm->context.ldt is followed by an IPI to all diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index f0e64db18ac8..0a21390642c4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -101,7 +101,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) static void install_ldt(struct mm_struct *current_mm, struct ldt_struct *ldt) { - /* Synchronizes with lockless_dereference in load_mm_ldt. */ + /* Synchronizes with READ_ONCE in load_mm_ldt. */ smp_store_release(¤t_mm->context.ldt, ldt); /* Activate the LDT for all CPUs using current_mm. 
*/ diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 11f273d2f018..3f88c9d32f7e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, pgpath = path_to_pgpath(path); - if (unlikely(lockless_dereference(m->current_pg) != pg)) { + if (unlikely(READ_ONCE(m->current_pg) != pg)) { /* Only update current_pgpath if pg changed */ spin_lock_irqsave(&m->lock, flags); m->current_pgpath = pgpath; @@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) } /* Were we instructed to switch PG? */ - if (lockless_dereference(m->next_pg)) { + if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { @@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) /* Don't change PG until it has no remaining paths */ check_current_pg: - pg = lockless_dereference(m->current_pg); + pg = READ_ONCE(m->current_pg); if (pg) { pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) @@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request *clone; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, nr_bytes); @@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bool queue_io; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) pgpath = choose_pgpath(m, nr_bytes); @@ -1804,7 +1804,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct pgpath *current_pgpath; int r; - current_pgpath = lockless_dereference(m->current_pgpath); + current_pgpath = READ_ONCE(m->current_pgpath); if (!current_pgpath) current_pgpath = choose_pgpath(m, 0); @@ -1826,7 +1826,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } if (r == -ENOTCONN) { - if (!lockless_dereference(m->current_pg)) { + if (!READ_ONCE(m->current_pg)) { /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } @@ -1895,9 +1895,9 @@ static int multipath_busy(struct dm_target *ti) return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ - pg = lockless_dereference(m->current_pg); - next_pg = lockless_dereference(m->next_pg); - if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = READ_ONCE(m->current_pg); + next_pg = READ_ONCE(m->next_pg); + if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) pg = next_pg; if (!pg) { diff --git a/fs/dcache.c b/fs/dcache.c index f90141387f01..34c852af215c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c { /* * Be careful about RCU walk racing with rename: - * use 'lockless_dereference' to fetch the name pointer. + * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. 
The @@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - const unsigned char *cs = lockless_dereference(dentry->d_name.name); + const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 25d9b5adcd42..36b49bd09264 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) { - return lockless_dereference(oi->__upperdentry); + return READ_ONCE(oi->__upperdentry); } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0f85ee9c3268..c67a7703296b 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); - realfile = lockless_dereference(od->upperfile); + realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2bea1d5e9930..5ed091c064b2 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -274,7 +274,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(lockless_dereference(ptr), type, member) + container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? @@ -367,7 +367,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * example is when items are added to the list, but never deleted. */ #define list_entry_lockless(ptr, type, member) \ - container_of((typeof(ptr))lockless_dereference(ptr), type, member) + container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a9f70d44af9..a6ddc42f87a5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ + typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ @@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(p) ________p1 = lockless_dereference(p); \ + typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d93db81fa36..824a583079a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4231,7 +4231,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. 
*/ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4328,7 +4328,7 @@ again: * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ae832e13b97..8ac79355915b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -189,7 +189,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = - lockless_dereference(current->seccomp.filter); + READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) diff --git a/kernel/task_work.c b/kernel/task_work.c index 836a72a66fba..9a9f262fc53d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -67,7 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = lockless_dereference(*pprev))) { + while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) diff --git a/mm/slab.h b/mm/slab.h index 073362816acc..8894f811a89d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -258,7 +258,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) * memcg_caches issues a write barrier to match this (see * memcg_create_kmem_cache()). */ - cachep = lockless_dereference(arr->entries[idx]); + cachep = READ_ONCE(arr->entries[idx]); rcu_read_unlock(); return cachep; From 59ecbbe7b31cd2d86ff9a9f461a00f7e7533aedc Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:49 +0100 Subject: [PATCH 0551/1085] locking/barriers: Kill lockless_dereference() lockless_dereference() is a nice idea, but it gained little traction in kernel code since its introduction three years ago. This is partly because it's a pain to type, but also because using READ_ONCE() instead has worked correctly on all architectures apart from Alpha, which is a fully supported but somewhat niche architecture these days. Now that READ_ONCE() has been upgraded to contain an implicit smp_read_barrier_depends() and the few callers of lockless_dereference() have been converted, we can remove lockless_dereference() altogether. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-5-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 12 ----------- .../translations/ko_KR/memory-barriers.txt | 12 ----------- include/linux/compiler.h | 20 ------------------- 3 files changed, 44 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index b759a60624fd..470a682f3fa4 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1886,18 +1886,6 @@ There are some more advanced barrier functions: See Documentation/atomic_{t,bitops}.txt for more information. 
- (*) lockless_dereference(); - - This can be thought of as a pointer-fetch wrapper around the - smp_read_barrier_depends() data-dependency barrier. - - This is also similar to rcu_dereference(), but in cases where - object lifetime is handled by some mechanism other than RCU, for - example, when the objects removed only when the system goes down. - In addition, lockless_dereference() is used in some data structures - that can be used both with and without RCU. - - (*) dma_wmb(); (*) dma_rmb(); diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index a7a813258013..ec3b46e27b7a 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -1858,18 +1858,6 @@ Mandatory 배리어들은 SMP 시스템에서도 UP 시스템에서도 SMP 효 참고하세요. - (*) lockless_dereference(); - - 이 함수는 smp_read_barrier_depends() 데이터 의존성 배리어를 사용하는 - 포인터 읽어오기 래퍼(wrapper) 함수로 생각될 수 있습니다. - - 객체의 라이프타임이 RCU 외의 메커니즘으로 관리된다는 점을 제외하면 - rcu_dereference() 와도 유사한데, 예를 들면 객체가 시스템이 꺼질 때에만 - 제거되는 경우 등입니다. 또한, lockless_dereference() 은 RCU 와 함께 - 사용될수도, RCU 없이 사용될 수도 있는 일부 데이터 구조에 사용되고 - 있습니다. - - (*) dma_wmb(); (*) dma_rmb(); diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7d7b77da9716..5a1cab48442c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -346,24 +346,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s (volatile typeof(x) *)&(x); }) #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) -/** - * lockless_dereference() - safely load a pointer for later dereference - * @p: The pointer to load - * - * Similar to rcu_dereference(), but for situations where the pointed-to - * object's lifetime is managed by something other than RCU. That - * "something other" might be reference counting or simple immortality. - * - * The seemingly unused variable ___typecheck_p validates that @p is - * indeed a pointer type by using a pointer to typeof(*p) as the type. - * Taking a pointer to typeof(*p) again is needed in case p is void *. - */ -#define lockless_dereference(p) \ -({ \ - typeof(p) _________p1 = READ_ONCE(p); \ - typeof(*(p)) *___typecheck_p __maybe_unused; \ - smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ - (_________p1); \ -}) - #endif /* __LINUX_COMPILER_H */ From 5a8897cc7631fa544d079c443800f4420d1b173f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:50 +0100 Subject: [PATCH 0552/1085] locking/atomics/alpha: Add smp_read_barrier_depends() to _release()/_relaxed() atomics As part of the fight against smp_read_barrier_depends(), we require dependency ordering to be preserved when a dependency is headed by a load performed using an atomic operation. This patch adds smp_read_barrier_depends() to the _release() and _relaxed() atomics on alpha, which otherwise lack anything to enforce dependency ordering. Signed-off-by: Will Deacon Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-6-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/atomic.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 498933a7df97..16961a3f45ba 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -13,6 +13,15 @@ * than regular operations. 
*/ +/* + * To ensure dependency ordering is preserved for the _relaxed and + * _release atomics, an smp_read_barrier_depends() is unconditionally + * inserted into the _relaxed variants, which are used to build the + * barriered versions. To avoid redundant back-to-back fences, we can + * define the _acquire and _fence versions explicitly. + */ +#define __atomic_op_acquire(op, args...) op##_relaxed(args) +#define __atomic_op_fence __atomic_op_release #define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } @@ -60,6 +69,7 @@ static inline int atomic_##op##_return_relaxed(int i, atomic_t *v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ + smp_read_barrier_depends(); \ return result; \ } @@ -77,6 +87,7 @@ static inline int atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ + smp_read_barrier_depends(); \ return result; \ } @@ -111,6 +122,7 @@ static __inline__ long atomic64_##op##_return_relaxed(long i, atomic64_t * v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ + smp_read_barrier_depends(); \ return result; \ } @@ -128,6 +140,7 @@ static __inline__ long atomic64_fetch_##op##_relaxed(long i, atomic64_t * v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ + smp_read_barrier_depends(); \ return result; \ } From 2a704fc8db7b0080a67d9f4f4cb2a7bcaf79949d Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:32:55 +0200 Subject: [PATCH 0553/1085] perf report: Remove code to handle inline frames from browsers The follow-up commits will make inline frames first-class citizens in the callchain, thereby obsoleting all of this special code. Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-2-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 180 +++----------------------------- tools/perf/ui/stdio/hist.c | 77 +------------- tools/perf/util/evsel_fprintf.c | 32 ------ tools/perf/util/hist.c | 5 - tools/perf/util/sort.h | 1 - 5 files changed, 13 insertions(+), 282 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 13dfb0a0bdeb..3a433f370e7f 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -154,57 +154,9 @@ static void callchain_list__set_folding(struct callchain_list *cl, bool unfold) cl->unfolded = unfold ? 
cl->has_children : false; } -static struct inline_node *inline_node__create(struct map *map, u64 ip) -{ - struct dso *dso; - struct inline_node *node; - - if (map == NULL) - return NULL; - - dso = map->dso; - if (dso == NULL) - return NULL; - - node = dso__parse_addr_inlines(dso, - map__rip_2objdump(map, ip)); - - return node; -} - -static int inline__count_rows(struct inline_node *node) -{ - struct inline_list *ilist; - int i = 0; - - if (node == NULL) - return 0; - - list_for_each_entry(ilist, &node->val, list) { - if ((ilist->filename != NULL) || (ilist->funcname != NULL)) - i++; - } - - return i; -} - -static int callchain_list__inline_rows(struct callchain_list *chain) -{ - struct inline_node *node; - int rows; - - node = inline_node__create(chain->ms.map, chain->ip); - if (node == NULL) - return 0; - - rows = inline__count_rows(node); - inline_node__delete(node); - return rows; -} - static int callchain_node__count_rows_rb_tree(struct callchain_node *node) { - int n = 0, inline_rows; + int n = 0; struct rb_node *nd; for (nd = rb_first(&node->rb_root); nd; nd = rb_next(nd)) { @@ -215,12 +167,6 @@ static int callchain_node__count_rows_rb_tree(struct callchain_node *node) list_for_each_entry(chain, &child->val, list) { ++n; - if (symbol_conf.inline_name) { - inline_rows = - callchain_list__inline_rows(chain); - n += inline_rows; - } - /* We need this because we may not have children */ folded_sign = callchain_list__folded(chain); if (folded_sign == '+') @@ -272,7 +218,7 @@ static int callchain_node__count_rows(struct callchain_node *node) { struct callchain_list *chain; bool unfolded = false; - int n = 0, inline_rows; + int n = 0; if (callchain_param.mode == CHAIN_FLAT) return callchain_node__count_flat_rows(node); @@ -281,10 +227,6 @@ static int callchain_node__count_rows(struct callchain_node *node) list_for_each_entry(chain, &node->val, list) { ++n; - if (symbol_conf.inline_name) { - inline_rows = callchain_list__inline_rows(chain); - n += inline_rows; - } unfolded = chain->unfolded; } @@ -432,19 +374,6 @@ static void hist_entry__init_have_children(struct hist_entry *he) he->init_have_children = true; } -static void hist_entry_init_inline_node(struct hist_entry *he) -{ - if (he->inline_node) - return; - - he->inline_node = inline_node__create(he->ms.map, he->ip); - - if (he->inline_node == NULL) - return; - - he->has_children = true; -} - static bool hist_browser__toggle_fold(struct hist_browser *browser) { struct hist_entry *he = browser->he_selection; @@ -476,12 +405,8 @@ static bool hist_browser__toggle_fold(struct hist_browser *browser) if (he->unfolded) { if (he->leaf) - if (he->inline_node) - he->nr_rows = inline__count_rows( - he->inline_node); - else - he->nr_rows = callchain__count_rows( - &he->sorted_chain); + he->nr_rows = callchain__count_rows( + &he->sorted_chain); else he->nr_rows = hierarchy_count_rows(browser, he, false); @@ -841,71 +766,6 @@ static bool hist_browser__check_dump_full(struct hist_browser *browser __maybe_u #define LEVEL_OFFSET_STEP 3 -static int hist_browser__show_inline(struct hist_browser *browser, - struct inline_node *node, - unsigned short row, - int offset) -{ - struct inline_list *ilist; - char buf[1024]; - int color, width, first_row; - - first_row = row; - width = browser->b.width - (LEVEL_OFFSET_STEP + 2); - list_for_each_entry(ilist, &node->val, list) { - if ((ilist->filename != NULL) || (ilist->funcname != NULL)) { - color = HE_COLORSET_NORMAL; - if (ui_browser__is_current_entry(&browser->b, row)) - color = HE_COLORSET_SELECTED; - - if 
(callchain_param.key == CCKEY_ADDRESS || - callchain_param.key == CCKEY_SRCLINE) { - if (ilist->filename != NULL) - scnprintf(buf, sizeof(buf), - "%s:%d (inline)", - ilist->filename, - ilist->line_nr); - else - scnprintf(buf, sizeof(buf), "??"); - } else if (ilist->funcname != NULL) - scnprintf(buf, sizeof(buf), "%s (inline)", - ilist->funcname); - else if (ilist->filename != NULL) - scnprintf(buf, sizeof(buf), - "%s:%d (inline)", - ilist->filename, - ilist->line_nr); - else - scnprintf(buf, sizeof(buf), "??"); - - ui_browser__set_color(&browser->b, color); - hist_browser__gotorc(browser, row, 0); - ui_browser__write_nstring(&browser->b, " ", - LEVEL_OFFSET_STEP + offset); - ui_browser__write_nstring(&browser->b, buf, width); - row++; - } - } - - return row - first_row; -} - -static size_t show_inline_list(struct hist_browser *browser, struct map *map, - u64 ip, int row, int offset) -{ - struct inline_node *node; - int ret; - - node = inline_node__create(map, ip); - if (node == NULL) - return 0; - - ret = hist_browser__show_inline(browser, node, row, offset); - - inline_node__delete(node); - return ret; -} - static int hist_browser__show_callchain_list(struct hist_browser *browser, struct callchain_node *node, struct callchain_list *chain, @@ -917,7 +777,7 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser, char bf[1024], *alloc_str; char buf[64], *alloc_str2; const char *str; - int inline_rows = 0, ret = 1; + int ret = 1; if (arg->row_offset != 0) { arg->row_offset--; @@ -954,12 +814,7 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser, free(alloc_str); free(alloc_str2); - if (symbol_conf.inline_name) { - inline_rows = show_inline_list(browser, chain->ms.map, - chain->ip, row + 1, offset); - } - - return ret + inline_rows; + return ret; } static bool check_percent_display(struct rb_node *node, u64 parent_total) @@ -1383,12 +1238,6 @@ static int hist_browser__show_entry(struct hist_browser *browser, folded_sign = hist_entry__folded(entry); } - if (symbol_conf.inline_name && - (!entry->has_children)) { - hist_entry_init_inline_node(entry); - folded_sign = hist_entry__folded(entry); - } - if (row_offset == 0) { struct hpp_arg arg = { .b = &browser->b, @@ -1420,8 +1269,7 @@ static int hist_browser__show_entry(struct hist_browser *browser, } if (first) { - if (symbol_conf.use_callchain || - symbol_conf.inline_name) { + if (symbol_conf.use_callchain) { ui_browser__printf(&browser->b, "%c ", folded_sign); width -= 2; } @@ -1463,15 +1311,11 @@ static int hist_browser__show_entry(struct hist_browser *browser, .is_current_entry = current_entry, }; - if (entry->inline_node) - printed += hist_browser__show_inline(browser, - entry->inline_node, row, 0); - else - printed += hist_browser__show_callchain(browser, - entry, 1, row, - hist_browser__show_callchain_entry, - &arg, - hist_browser__check_output_full); + printed += hist_browser__show_callchain(browser, + entry, 1, row, + hist_browser__show_callchain_entry, + &arg, + hist_browser__check_output_full); } return printed; diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 8bdb7a500181..b6b9baac0e3b 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -21,64 +21,6 @@ static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin) return ret; } -static size_t inline__fprintf(struct map *map, u64 ip, int left_margin, - int depth, int depth_mask, FILE *fp) -{ - struct dso *dso; - struct inline_node *node; - struct inline_list *ilist; - int ret = 0, 
i; - - if (map == NULL) - return 0; - - dso = map->dso; - if (dso == NULL) - return 0; - - node = dso__parse_addr_inlines(dso, - map__rip_2objdump(map, ip)); - if (node == NULL) - return 0; - - list_for_each_entry(ilist, &node->val, list) { - if ((ilist->filename != NULL) || (ilist->funcname != NULL)) { - ret += callchain__fprintf_left_margin(fp, left_margin); - - for (i = 0; i < depth; i++) { - if (depth_mask & (1 << i)) - ret += fprintf(fp, "|"); - else - ret += fprintf(fp, " "); - ret += fprintf(fp, " "); - } - - if (callchain_param.key == CCKEY_ADDRESS || - callchain_param.key == CCKEY_SRCLINE) { - if (ilist->filename != NULL) - ret += fprintf(fp, "%s:%d (inline)", - ilist->filename, - ilist->line_nr); - else - ret += fprintf(fp, "??"); - } else if (ilist->funcname != NULL) - ret += fprintf(fp, "%s (inline)", - ilist->funcname); - else if (ilist->filename != NULL) - ret += fprintf(fp, "%s:%d (inline)", - ilist->filename, - ilist->line_nr); - else - ret += fprintf(fp, "??"); - - ret += fprintf(fp, "\n"); - } - } - - inline_node__delete(node); - return ret; -} - static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask, int left_margin) { @@ -137,9 +79,6 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_node *node, fputc('\n', fp); free(alloc_str); - if (symbol_conf.inline_name) - ret += inline__fprintf(chain->ms.map, chain->ip, - left_margin, depth, depth_mask, fp); return ret; } @@ -314,13 +253,6 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root, if (++entries_printed == callchain_param.print_limit) break; - - if (symbol_conf.inline_name) - ret += inline__fprintf(chain->ms.map, - chain->ip, - left_margin, - 0, 0, - fp); } root = &cnode->rb_root; } @@ -600,7 +532,6 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, { int ret; int callchain_ret = 0; - int inline_ret = 0; struct perf_hpp hpp = { .buf = bf, .size = size, @@ -622,13 +553,7 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, callchain_ret = hist_entry_callchain__fprintf(he, total_period, 0, fp); - if (callchain_ret == 0 && symbol_conf.inline_name) { - inline_ret = inline__fprintf(he->ms.map, he->ip, 0, 0, 0, fp); - ret += inline_ret; - if (inline_ret > 0) - ret += fprintf(fp, "\n"); - } else - ret += callchain_ret; + ret += callchain_ret; return ret; } diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index 583f3a602506..f2c6c5ee11e8 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -169,38 +169,6 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, if (!print_oneline) printed += fprintf(fp, "\n"); - if (symbol_conf.inline_name && node->map) { - struct inline_node *inode; - - addr = map__rip_2objdump(node->map, node->ip), - inode = dso__parse_addr_inlines(node->map->dso, addr); - - if (inode) { - struct inline_list *ilist; - - list_for_each_entry(ilist, &inode->val, list) { - if (print_arrow) - printed += fprintf(fp, " <-"); - - /* IP is same, just skip it */ - if (print_ip) - printed += fprintf(fp, "%c%16s", - s, ""); - if (print_sym) - printed += fprintf(fp, " %s", - ilist->funcname); - if (print_srcline) - printed += fprintf(fp, "\n %s:%d", - ilist->filename, - ilist->line_nr); - if (!print_oneline) - printed += fprintf(fp, "\n"); - } - - inline_node__delete(inode); - } - } - if (symbol_conf.bt_stop_list && node->sym && strlist__has_entry(symbol_conf.bt_stop_list, diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 
e60d8d8ea4c2..b0fa9c217e1c 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1141,11 +1141,6 @@ void hist_entry__delete(struct hist_entry *he) zfree(&he->mem_info); } - if (he->inline_node) { - inline_node__delete(he->inline_node); - he->inline_node = NULL; - } - zfree(&he->stat_acc); free_srcline(he->srcline); if (he->srcfile && he->srcfile[0]) diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index f36dc4980a6c..507d096aee4e 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -129,7 +129,6 @@ struct hist_entry { }; char *srcline; char *srcfile; - struct inline_node *inline_node; struct symbol *parent; struct branch_info *branch_info; struct hists *hists; From 40a342cda2cd9bc8f7bf81c5ce1a141584760757 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:32:56 +0200 Subject: [PATCH 0554/1085] perf callchain: Store srcline in callchain_cursor_node This is mostly a preparation to enable the creation of full callchain nodes for inline frames. Such frames will reference the IP of the non-inlined frame, but hold the symbol and srcline for an inlined location. As such, we won't be able to query the srcline on-demand based on the IP alone. Instead, we will leverage the functionality provided by this patch here, and store the srcline for the inlined nodes in the new srcline member of callchain_cursor_node. Note that this patch on its own leaks the srcline, as there is no free_callchain_cursor_node or similar. A future patch will add caching of the srcline and handle deletion properly. Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-3-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 31 +++++++++---------------------- tools/perf/util/callchain.h | 6 ++++-- tools/perf/util/machine.c | 18 ++++++++++++++++-- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index a971caf3759d..e7ee794d1e5b 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -566,6 +566,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) call->ip = cursor_node->ip; call->ms.sym = cursor_node->sym; call->ms.map = map__get(cursor_node->map); + call->srcline = cursor_node->srcline; if (cursor_node->branch) { call->branch_count = 1; @@ -647,20 +648,11 @@ enum match_result { static enum match_result match_chain_srcline(struct callchain_cursor_node *node, struct callchain_list *cnode) { - char *left = NULL; - char *right = NULL; + const char *left = cnode->srcline; + const char *right = node->srcline; enum match_result ret = MATCH_EQ; int cmp; - if (cnode->ms.map) - left = get_srcline(cnode->ms.map->dso, - map__rip_2objdump(cnode->ms.map, cnode->ip), - cnode->ms.sym, true, false); - if (node->map) - right = get_srcline(node->map->dso, - map__rip_2objdump(node->map, node->ip), - node->sym, true, false); - if (left && right) cmp = strcmp(left, right); else if (!left && right) @@ -675,8 +667,6 @@ static enum match_result match_chain_srcline(struct callchain_cursor_node *node, if (cmp != 0) ret = cmp < 0 ? 
MATCH_LT : MATCH_GT; - free_srcline(left); - free_srcline(right); return ret; } @@ -969,7 +959,7 @@ merge_chain_branch(struct callchain_cursor *cursor, list_for_each_entry_safe(list, next_list, &src->val, list) { callchain_cursor_append(cursor, list->ip, list->ms.map, list->ms.sym, - false, NULL, 0, 0, 0); + false, NULL, 0, 0, 0, list->srcline); list_del(&list->list); map__zput(list->ms.map); free(list); @@ -1009,7 +999,8 @@ int callchain_merge(struct callchain_cursor *cursor, int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, struct map *map, struct symbol *sym, bool branch, struct branch_flags *flags, - int nr_loop_iter, u64 iter_cycles, u64 branch_from) + int nr_loop_iter, u64 iter_cycles, u64 branch_from, + const char *srcline) { struct callchain_cursor_node *node = *cursor->last; @@ -1028,6 +1019,7 @@ int callchain_cursor_append(struct callchain_cursor *cursor, node->branch = branch; node->nr_loop_iter = nr_loop_iter; node->iter_cycles = iter_cycles; + node->srcline = srcline; if (flags) memcpy(&node->branch_flags, flags, @@ -1115,12 +1107,7 @@ char *callchain_list__sym_name(struct callchain_list *cl, int printed; if (cl->ms.sym) { - if (show_srcline && cl->ms.map && !cl->srcline) - cl->srcline = get_srcline(cl->ms.map->dso, - map__rip_2objdump(cl->ms.map, - cl->ip), - cl->ms.sym, false, show_addr); - if (cl->srcline) + if (show_srcline && cl->srcline) printed = scnprintf(bf, bfsize, "%s %s", cl->ms.sym->name, cl->srcline); else @@ -1532,7 +1519,7 @@ int callchain_cursor__copy(struct callchain_cursor *dst, node->branch, &node->branch_flags, node->nr_loop_iter, node->iter_cycles, - node->branch_from); + node->branch_from, node->srcline); if (rc) break; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 1ed6fc61d0a5..8f67b681cde9 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -121,7 +121,7 @@ struct callchain_list { u64 iter_count; u64 iter_cycles; struct branch_type_stat brtype_stat; - char *srcline; + const char *srcline; struct list_head list; }; @@ -135,6 +135,7 @@ struct callchain_cursor_node { u64 ip; struct map *map; struct symbol *sym; + const char *srcline; bool branch; struct branch_flags branch_flags; u64 branch_from; @@ -201,7 +202,8 @@ static inline void callchain_cursor_reset(struct callchain_cursor *cursor) int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, struct map *map, struct symbol *sym, bool branch, struct branch_flags *flags, - int nr_loop_iter, u64 iter_cycles, u64 branch_from); + int nr_loop_iter, u64 iter_cycles, u64 branch_from, + const char *srcline); /* Close a cursor writing session. 
Initialize for the reader */ static inline void callchain_cursor_commit(struct callchain_cursor *cursor) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 7c3aa479201a..a37e1c056415 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1709,6 +1709,15 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, return mi; } +static char *callchain_srcline(struct map *map, struct symbol *sym, u64 ip) +{ + if (!map || callchain_param.key == CCKEY_FUNCTION) + return NULL; + + return get_srcline(map->dso, map__rip_2objdump(map, ip), + sym, false, callchain_param.key == CCKEY_ADDRESS); +} + struct iterations { int nr_loop_iter; u64 cycles; @@ -1728,6 +1737,7 @@ static int add_callchain_ip(struct thread *thread, struct addr_location al; int nr_loop_iter = 0; u64 iter_cycles = 0; + const char *srcline = NULL; al.filtered = 0; al.sym = NULL; @@ -1783,9 +1793,10 @@ static int add_callchain_ip(struct thread *thread, iter_cycles = iter->cycles; } + srcline = callchain_srcline(al.map, al.sym, al.addr); return callchain_cursor_append(cursor, al.addr, al.map, al.sym, branch, flags, nr_loop_iter, - iter_cycles, branch_from); + iter_cycles, branch_from, srcline); } struct branch_info *sample__resolve_bstack(struct perf_sample *sample, @@ -2101,12 +2112,15 @@ check_calls: static int unwind_entry(struct unwind_entry *entry, void *arg) { struct callchain_cursor *cursor = arg; + const char *srcline = NULL; if (symbol_conf.hide_unresolved && entry->sym == NULL) return 0; + + srcline = callchain_srcline(entry->map, entry->sym, entry->ip); return callchain_cursor_append(cursor, entry->ip, entry->map, entry->sym, - false, NULL, 0, 0, 0, srcline); } static int thread__resolve_callchain_unwind(struct thread *thread, From fea0cf842c7aa08950063264ab1cfbce4ba38c1b Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:32:57 +0200 Subject: [PATCH 0555/1085] perf callchain: Refactor inline_list to operate on symbols This is a requirement to create real callchain entries for inlined frames. Since the list of inlines usually contains the target symbol too, i.e. the location where the frames get inlined to, we alias that symbol and reuse it as-is. This ensures that other dependent functionality keeps working, most notably annotation of the target frames. For all other entries in the inline_list, a fake symbol is created. These are marked by the new 'inlined' member which is set to true. Only those symbols are managed by the inline_list and get freed when the inline_list is deleted from within inline_node__delete.
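To make the ownership rule above concrete, here is a condensed, illustrative sketch (not part of the patch; the helper name inline_list__put_symbol is made up, and the real logic lives in new_inline_sym() and inline_node__delete() in the diff below):

~~~~~
/* Illustrative sketch only: symbol ownership for one inline_list
 * entry. A fake symbol (sym->inlined == 1) is owned by the list and
 * freed with it; an aliased base symbol stays owned by its DSO.
 */
static void inline_list__put_symbol(struct inline_list *ilist)
{
	if (ilist->symbol && ilist->symbol->inlined)
		symbol__delete(ilist->symbol);	/* fake: list-owned */
	/* else: aliased real symbol, freed later with the DSO */
}
~~~~~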
Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-4-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/srcline.c | 93 ++++++++++++++++++++++++++------------- tools/perf/util/srcline.h | 7 ++- tools/perf/util/symbol.h | 1 + 3 files changed, 69 insertions(+), 32 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index ed8e8d2de942..c0af61b355ed 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -33,29 +33,19 @@ static const char *dso__name(struct dso *dso) return dso_name; } -static int inline_list__append(char *filename, char *funcname, int line_nr, - struct inline_node *node, struct dso *dso) +static int inline_list__append(struct symbol *symbol, char *filename, + int line_nr, struct inline_node *node) { struct inline_list *ilist; - char *demangled; ilist = zalloc(sizeof(*ilist)); if (ilist == NULL) return -1; + ilist->symbol = symbol; ilist->filename = filename; ilist->line_nr = line_nr; - if (dso != NULL) { - demangled = dso__demangle_sym(dso, 0, funcname); - if (demangled == NULL) { - ilist->funcname = funcname; - } else { - ilist->funcname = demangled; - free(funcname); - } - } - if (callchain_param.order == ORDER_CALLEE) list_add_tail(&ilist->list, &node->val); else @@ -206,19 +196,56 @@ static void addr2line_cleanup(struct a2l_data *a2l) #define MAX_INLINE_NEST 1024 +static struct symbol *new_inline_sym(struct dso *dso, + struct symbol *base_sym, + const char *funcname) +{ + struct symbol *inline_sym; + char *demangled = NULL; + + if (dso) { + demangled = dso__demangle_sym(dso, 0, funcname); + if (demangled) + funcname = demangled; + } + + if (base_sym && strcmp(funcname, base_sym->name) == 0) { + /* reuse the real, existing symbol */ + inline_sym = base_sym; + /* ensure that we don't alias an inlined symbol, which could + * lead to double frees in inline_node__delete + */ + assert(!base_sym->inlined); + } else { + /* create a fake symbol for the inline frame */ + inline_sym = symbol__new(base_sym ? base_sym->start : 0, + base_sym ? base_sym->end : 0, + base_sym ? base_sym->binding : 0, + funcname); + if (inline_sym) + inline_sym->inlined = 1; + } + + free(demangled); + + return inline_sym; +} + static int inline_list__append_dso_a2l(struct dso *dso, - struct inline_node *node) + struct inline_node *node, + struct symbol *sym) { struct a2l_data *a2l = dso->a2l; - char *funcname = a2l->funcname ? strdup(a2l->funcname) : NULL; - char *filename = a2l->filename ? 
strdup(a2l->filename) : NULL; + struct symbol *inline_sym = new_inline_sym(dso, sym, a2l->funcname); - return inline_list__append(filename, funcname, a2l->line, node, dso); + return inline_list__append(inline_sym, strdup(a2l->filename), + a2l->line, node); } static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *line, struct dso *dso, - bool unwind_inlines, struct inline_node *node) + bool unwind_inlines, struct inline_node *node, + struct symbol *sym) { int ret = 0; struct a2l_data *a2l = dso->a2l; @@ -244,7 +271,7 @@ static int addr2line(const char *dso_name, u64 addr, if (unwind_inlines) { int cnt = 0; - if (node && inline_list__append_dso_a2l(dso, node)) + if (node && inline_list__append_dso_a2l(dso, node, sym)) return 0; while (bfd_find_inliner_info(a2l->abfd, &a2l->filename, @@ -255,7 +282,7 @@ static int addr2line(const char *dso_name, u64 addr, a2l->filename = NULL; if (node != NULL) { - if (inline_list__append_dso_a2l(dso, node)) + if (inline_list__append_dso_a2l(dso, node, sym)) return 0; // found at least one inline frame ret = 1; @@ -287,7 +314,7 @@ void dso__free_a2l(struct dso *dso) } static struct inline_node *addr2inlines(const char *dso_name, u64 addr, - struct dso *dso) + struct dso *dso, struct symbol *sym) { struct inline_node *node; @@ -300,7 +327,7 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, INIT_LIST_HEAD(&node->val); node->addr = addr; - if (!addr2line(dso_name, addr, NULL, NULL, dso, TRUE, node)) + if (!addr2line(dso_name, addr, NULL, NULL, dso, TRUE, node, sym)) goto out_free_inline_node; if (list_empty(&node->val)) @@ -340,7 +367,8 @@ static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *line_nr, struct dso *dso __maybe_unused, bool unwind_inlines __maybe_unused, - struct inline_node *node __maybe_unused) + struct inline_node *node __maybe_unused, + struct symbol *sym __maybe_unused) { FILE *fp; char cmd[PATH_MAX]; @@ -380,7 +408,8 @@ void dso__free_a2l(struct dso *dso __maybe_unused) } static struct inline_node *addr2inlines(const char *dso_name, u64 addr, - struct dso *dso __maybe_unused) + struct dso *dso __maybe_unused, + struct symbol *sym) { FILE *fp; char cmd[PATH_MAX]; @@ -408,13 +437,13 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, node->addr = addr; while (getline(&filename, &len, fp) != -1) { + if (filename_split(filename, &line_nr) != 1) { free(filename); goto out; } - if (inline_list__append(filename, NULL, line_nr, node, - NULL) != 0) + if (inline_list__append(sym, filename, line_nr, node) != 0) goto out; filename = NULL; @@ -454,7 +483,8 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym, if (dso_name == NULL) goto out; - if (!addr2line(dso_name, addr, &file, &line, dso, unwind_inlines, NULL)) + if (!addr2line(dso_name, addr, &file, &line, dso, + unwind_inlines, NULL, sym)) goto out; if (asprintf(&srcline, "%s:%u", @@ -500,7 +530,8 @@ char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym, return __get_srcline(dso, addr, sym, show_sym, show_addr, false); } -struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr) +struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr, + struct symbol *sym) { const char *dso_name; @@ -508,7 +539,7 @@ struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr) if (dso_name == NULL) return NULL; - return addr2inlines(dso_name, addr, dso); + return addr2inlines(dso_name, addr, dso, sym); } void inline_node__delete(struct inline_node *node) @@ 
-518,7 +549,9 @@ void inline_node__delete(struct inline_node *node) list_for_each_entry_safe(ilist, tmp, &node->val, list) { list_del_init(&ilist->list); zfree(&ilist->filename); - zfree(&ilist->funcname); + /* only the inlined symbols are owned by the list */ + if (ilist->symbol && ilist->symbol->inlined) + symbol__delete(ilist->symbol); free(ilist); } diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index 7b52ba88676e..730bfd6926d7 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -17,8 +17,8 @@ void free_srcline(char *srcline); #define SRCLINE_UNKNOWN ((char *) "??:0") struct inline_list { + struct symbol *symbol; char *filename; - char *funcname; unsigned int line_nr; struct list_head list; }; @@ -28,7 +28,10 @@ struct inline_node { struct list_head val; }; -struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr); +/* parse inlined frames for the given address */ +struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr, + struct symbol *sym); +/* free resources associated to the inline node list */ void inline_node__delete(struct inline_node *node); #endif /* PERF_SRCLINE_H */ diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index aad99e7e179b..d880a059babb 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -59,6 +59,7 @@ struct symbol { u8 binding; u8 idle:1; u8 ignore:1; + u8 inlined:1; u8 arch_sym; char name[0]; }; From 2be8832f3c51cf9e36a3e80ff57f4137505c2ba4 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:32:58 +0200 Subject: [PATCH 0556/1085] perf callchain: Refactor inline_list to store srcline string directly This is a preparation for the creation of real callchain entries for inlined frames. The rest of the perf code uses the srcline string. As such, using that also for the srcline API allows us to simplify some of the upcoming code. Most notably, it will allow us to cache the srcline for a given inline node and reuse it for different callchain entries. Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-5-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/srcline.c | 54 +++++++++++++++++++++++++++++---------- tools/perf/util/srcline.h | 3 +-- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index c0af61b355ed..f202fc7827df 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -33,8 +33,8 @@ static const char *dso__name(struct dso *dso) return dso_name; } -static int inline_list__append(struct symbol *symbol, char *filename, - int line_nr, struct inline_node *node) +static int inline_list__append(struct symbol *symbol, char *srcline, + struct inline_node *node) { struct inline_list *ilist; @@ -43,8 +43,7 @@ static int inline_list__append(struct symbol *symbol, char *filename, return -1; ilist->symbol = symbol; - ilist->filename = filename; - ilist->line_nr = line_nr; + ilist->srcline = srcline; if (callchain_param.order == ORDER_CALLEE) list_add_tail(&ilist->list, &node->val); @@ -54,6 +53,30 @@ static int inline_list__append(struct symbol *symbol, char *filename, return 0; } +/* basename version that takes a const input string */ +static const char *gnu_basename(const char *path) +{ + const char *base = strrchr(path, '/'); + + return base ? 
base + 1 : path; } +static char *srcline_from_fileline(const char *file, unsigned int line) +{ + char *srcline; + + if (!file) + return NULL; + + if (!srcline_full_filename) + file = gnu_basename(file); + + if (asprintf(&srcline, "%s:%u", file, line) < 0) + return NULL; + + return srcline; +} + #ifdef HAVE_LIBBFD_SUPPORT /* @@ -237,9 +260,12 @@ static int inline_list__append_dso_a2l(struct dso *dso, { struct a2l_data *a2l = dso->a2l; struct symbol *inline_sym = new_inline_sym(dso, sym, a2l->funcname); + char *srcline = NULL; - return inline_list__append(inline_sym, strdup(a2l->filename), - a2l->line, node); + if (a2l->filename) + srcline = srcline_from_fileline(a2l->filename, a2l->line); + + return inline_list__append(inline_sym, srcline, node); } static int addr2line(const char *dso_name, u64 addr, @@ -437,13 +463,15 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, node->addr = addr; while (getline(&filename, &len, fp) != -1) { + char *srcline; if (filename_split(filename, &line_nr) != 1) { free(filename); goto out; } - if (inline_list__append(sym, filename, line_nr, node) != 0) + srcline = srcline_from_fileline(filename, line_nr); + if (inline_list__append(sym, srcline, node) != 0) goto out; filename = NULL; @@ -487,16 +515,14 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym, unwind_inlines, NULL, sym)) goto out; - if (asprintf(&srcline, "%s:%u", - srcline_full_filename ? file : basename(file), - line) < 0) { - free(file); + srcline = srcline_from_fileline(file, line); + free(file); + + if (!srcline) goto out; - } dso->a2l_fails = 0; - free(file); return srcline; out: @@ -548,7 +574,7 @@ void inline_node__delete(struct inline_node *node) list_for_each_entry_safe(ilist, tmp, &node->val, list) { list_del_init(&ilist->list); - zfree(&ilist->filename); + free_srcline(ilist->srcline); /* only the inlined symbols are owned by the list */ if (ilist->symbol && ilist->symbol->inlined) symbol__delete(ilist->symbol); diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index 730bfd6926d7..0201ed2c0b9c 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -18,8 +18,7 @@ void free_srcline(char *srcline); struct inline_list { struct symbol *symbol; - char *filename; - unsigned int line_nr; + char *srcline; struct list_head list; }; From 11ea2515f32e783b9a7984c148e742c377383915 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:32:59 +0200 Subject: [PATCH 0557/1085] perf callchain: Create real callchain entries for inlined frames The inline_node structs are maintained by the new dso->inlined_nodes tree. This in turn keeps ownership of the fake symbols and srcline strings representing inline frames. This tree is sorted by address to allow quick lookups. All other fields of the symbol besides the function name are unused for inline frames. The advantage of this approach is that all existing users of the callchain API can now transparently display inlined frames without having to patch their code.
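For illustration, the tree enables a simple lookup-or-parse caching pattern; a minimal sketch under this patch's API (the helper name dso__inlines_lookup is hypothetical; the real caller is append_inlines() in the machine.c hunk below):

~~~~~
/* Hypothetical helper, illustration only: return the cached
 * inline_node for @addr, parsing and caching it on first use.
 * The tree takes ownership of newly inserted nodes.
 */
static struct inline_node *dso__inlines_lookup(struct dso *dso, u64 addr,
					       struct symbol *sym)
{
	struct inline_node *node;

	node = inlines__tree_find(&dso->inlined_nodes, addr);
	if (node)
		return node;	/* hit: reuse previously parsed frames */

	node = dso__parse_addr_inlines(dso, addr, sym);
	if (node)
		inlines__tree_insert(&dso->inlined_nodes, node);

	return node;
}
~~~~~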
Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-6-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 5 ++++ tools/perf/util/dso.h | 1 + tools/perf/util/machine.c | 37 ++++++++++++++++++++++++++++ tools/perf/util/srcline.c | 51 +++++++++++++++++++++++++++++++++++++++ tools/perf/util/srcline.h | 9 +++++++ 5 files changed, 103 insertions(+) diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 339e52971380..75c8250b3b8a 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -10,6 +10,7 @@ #include "compress.h" #include "path.h" #include "symbol.h" +#include "srcline.h" #include "dso.h" #include "machine.h" #include "auxtrace.h" @@ -1201,6 +1202,7 @@ struct dso *dso__new(const char *name) for (i = 0; i < MAP__NR_TYPES; ++i) dso->symbols[i] = dso->symbol_names[i] = RB_ROOT; dso->data.cache = RB_ROOT; + dso->inlined_nodes = RB_ROOT; dso->data.fd = -1; dso->data.status = DSO_DATA_STATUS_UNKNOWN; dso->symtab_type = DSO_BINARY_TYPE__NOT_FOUND; @@ -1232,6 +1234,9 @@ void dso__delete(struct dso *dso) if (!RB_EMPTY_NODE(&dso->rb_node)) pr_err("DSO %s is still in rbtree when being deleted!\n", dso->long_name); + + /* free inlines first, as they reference symbols */ + inlines__tree_delete(&dso->inlined_nodes); for (i = 0; i < MAP__NR_TYPES; ++i) symbols__delete(&dso->symbols[i]); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index a2bbb21f301c..122eca0d242d 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -141,6 +141,7 @@ struct dso { struct rb_root *root; /* root of rbtree that rb_node is in */ struct rb_root symbols[MAP__NR_TYPES]; struct rb_root symbol_names[MAP__NR_TYPES]; + struct rb_root inlined_nodes; struct { u64 addr; struct symbol *symbol; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index a37e1c056415..3d049cb313ac 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2109,6 +2109,40 @@ check_calls: return 0; } +static int append_inlines(struct callchain_cursor *cursor, + struct map *map, struct symbol *sym, u64 ip) +{ + struct inline_node *inline_node; + struct inline_list *ilist; + u64 addr; + + if (!symbol_conf.inline_name || !map || !sym) + return 1; + + addr = map__rip_2objdump(map, ip); + + inline_node = inlines__tree_find(&map->dso->inlined_nodes, addr); + if (!inline_node) { + inline_node = dso__parse_addr_inlines(map->dso, addr, sym); + if (!inline_node) + return 1; + + inlines__tree_insert(&map->dso->inlined_nodes, inline_node); + } + + list_for_each_entry(ilist, &inline_node->val, list) { + int ret = callchain_cursor_append(cursor, ip, map, + ilist->symbol, false, + NULL, 0, 0, 0, + ilist->srcline); + + if (ret != 0) + return ret; + } + + return 0; +} + static int unwind_entry(struct unwind_entry *entry, void *arg) { struct callchain_cursor *cursor = arg; @@ -2117,6 +2151,9 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) if (symbol_conf.hide_unresolved && entry->sym == NULL) return 0; + if (append_inlines(cursor, entry->map, entry->sym, entry->ip) == 0) + return 0; + srcline = callchain_srcline(entry->map, entry->sym, entry->ip); return callchain_cursor_append(cursor, entry->ip, entry->map, entry->sym, diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index f202fc7827df..8bea6621d657 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -583,3 +583,54 @@ void 
inline_node__delete(struct inline_node *node) free(node); } + +void inlines__tree_insert(struct rb_root *tree, struct inline_node *inlines) +{ + struct rb_node **p = &tree->rb_node; + struct rb_node *parent = NULL; + const u64 addr = inlines->addr; + struct inline_node *i; + + while (*p != NULL) { + parent = *p; + i = rb_entry(parent, struct inline_node, rb_node); + if (addr < i->addr) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&inlines->rb_node, parent, p); + rb_insert_color(&inlines->rb_node, tree); +} + +struct inline_node *inlines__tree_find(struct rb_root *tree, u64 addr) +{ + struct rb_node *n = tree->rb_node; + + while (n) { + struct inline_node *i = rb_entry(n, struct inline_node, + rb_node); + + if (addr < i->addr) + n = n->rb_left; + else if (addr > i->addr) + n = n->rb_right; + else + return i; + } + + return NULL; +} + +void inlines__tree_delete(struct rb_root *tree) +{ + struct inline_node *pos; + struct rb_node *next = rb_first(tree); + + while (next) { + pos = rb_entry(next, struct inline_node, rb_node); + next = rb_next(&pos->rb_node); + rb_erase(&pos->rb_node, tree); + inline_node__delete(pos); + } +} diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index 0201ed2c0b9c..ebe38cd22294 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -2,6 +2,7 @@ #define PERF_SRCLINE_H #include +#include #include struct dso; @@ -25,6 +26,7 @@ struct inline_list { struct inline_node { u64 addr; struct list_head val; + struct rb_node rb_node; }; /* parse inlined frames for the given address */ @@ -33,4 +35,11 @@ struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr, /* free resources associated to the inline node list */ void inline_node__delete(struct inline_node *node); +/* insert the inline node list into the DSO, which will take ownership */ +void inlines__tree_insert(struct rb_root *tree, struct inline_node *inlines); +/* find previously inserted inline node list */ +struct inline_node *inlines__tree_find(struct rb_root *tree, u64 addr); +/* delete all nodes within the tree of inline_node s */ +void inlines__tree_delete(struct rb_root *tree); + #endif /* PERF_SRCLINE_H */ From cbe50f61727f538f05e63186c2e0022182a3a28f Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:33:00 +0200 Subject: [PATCH 0558/1085] perf report: Fall-back to function name comparison for -g srcline When a callchain entry has no srcline available, we ended up comparing the instruction pointer. I consider this to be not too useful. Rather, I think we should group the entries by function name, which this patch adds. For people who want to split the data on the IP boundary, using `-g address` is the correct choice. Before: ~~~~~ 100.00% 38.86% [.] main | |--61.14%--main inlining.cpp:14 | std::norm complex:664 | std::_Norm_helper::_S_do_it complex:654 | std::abs complex:597 | std::__complex_abs complex:589 | | | |--56.03%--hypot | | | | | |--8.45%--__hypot_finite | | | | | |--7.62%--__hypot_finite | | | | | |--2.29%--__hypot_finite | | | | | |--2.24%--__hypot_finite | | | | | |--2.06%--__hypot_finite | | | | | |--1.81%--__hypot_finite ... ~~~~~ After: ~~~~~ 100.00% 38.86% [.] 
main | |--61.14%--main inlining.cpp:14 | std::norm complex:664 | std::_Norm_helper::_S_do_it complex:654 | std::abs complex:597 | std::__complex_abs complex:589 | | | |--60.29%--hypot | | | | | --56.03%--__hypot_finite | | | --0.85%--cabs ~~~~~ Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-7-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index e7ee794d1e5b..0f2ba493a7a3 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -645,11 +645,9 @@ enum match_result { MATCH_GT, }; -static enum match_result match_chain_srcline(struct callchain_cursor_node *node, - struct callchain_list *cnode) +static enum match_result match_chain_strings(const char *left, + const char *right) { - const char *left = cnode->srcline; - const char *right = node->srcline; enum match_result ret = MATCH_EQ; int cmp; @@ -659,10 +657,8 @@ static enum match_result match_chain_srcline(struct callchain_cursor_node *node, cmp = 1; else if (left && !right) cmp = -1; - else if (cnode->ip == node->ip) - cmp = 0; else - cmp = (cnode->ip < node->ip) ? -1 : 1; + return MATCH_ERROR; if (cmp != 0) ret = cmp < 0 ? MATCH_LT : MATCH_GT; @@ -679,10 +675,18 @@ static enum match_result match_chain(struct callchain_cursor_node *node, struct dso *right_dso = NULL; if (callchain_param.key == CCKEY_SRCLINE) { - enum match_result match = match_chain_srcline(node, cnode); + enum match_result match = match_chain_strings(cnode->srcline, + node->srcline); + + /* if no srcline is available, fallback to symbol name */ + if (match == MATCH_ERROR && cnode->ms.sym && node->sym) + match = match_chain_strings(cnode->ms.sym->name, + node->sym->name); if (match != MATCH_ERROR) return match; + + /* otherwise fall-back to IP-based comparison below */ } if (cnode->ms.sym && sym && callchain_param.key == CCKEY_FUNCTION) { From 8932f8071cae8a12dfd5f492247777ee176b0da4 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:33:01 +0200 Subject: [PATCH 0559/1085] perf callchain: Mark inlined frames in output by " (inlined)" suffix The original patch that introduced inline frame output in the various browsers used this suffix already. The new centralized approach that uses fake symbols for inlined frames was missing this approach so far. Instead of changing the symbol name itself, we only print the suffix where needed. This allows us to efficiently lookup the symbol for a given name without first having to append the suffix before the lookup. Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-8-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 10 +++++++--- tools/perf/util/sort.c | 3 +++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 0f2ba493a7a3..77031efdca5c 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1111,11 +1111,15 @@ char *callchain_list__sym_name(struct callchain_list *cl, int printed; if (cl->ms.sym) { + const char *inlined = cl->ms.sym->inlined ? 
" (inlined)" : ""; + if (show_srcline && cl->srcline) - printed = scnprintf(bf, bfsize, "%s %s", - cl->ms.sym->name, cl->srcline); + printed = scnprintf(bf, bfsize, "%s %s%s", + cl->ms.sym->name, cl->srcline, + inlined); else - printed = scnprintf(bf, bfsize, "%s", cl->ms.sym->name); + printed = scnprintf(bf, bfsize, "%s%s", + cl->ms.sym->name, inlined); } else printed = scnprintf(bf, bfsize, "%#" PRIx64, cl->ip); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index eb3ab902a1c0..acb9210fd18a 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -283,6 +283,9 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym, ret += repsep_snprintf(bf + ret, size - ret, "%.*s", width - ret, sym->name); + if (sym->inlined) + ret += repsep_snprintf(bf + ret, size - ret, + " (inlined)"); } } else { size_t len = BITS_PER_LONG / 4; From 9628b56dc1240ce0faa3bd9b7a3390fa4451c59f Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:33:02 +0200 Subject: [PATCH 0560/1085] perf script: Mark inlined frames and do not print DSO for them Instead of showing the (repeated) DSO name of the non-inlined frame, we now show the "(inlined)" suffix instead. Before: 214f7 __hypot_finite (/usr/lib/libm-2.25.so) ace3 hypot (/usr/lib/libm-2.25.so) a4a std::__complex_abs (/home/milian/projects/src/perf-tests/inlining) a4a std::abs (/home/milian/projects/src/perf-tests/inlining) a4a std::_Norm_helper::_S_do_it (/home/milian/projects/src/perf-tests/inlining) a4a std::norm (/home/milian/projects/src/perf-tests/inlining) a4a main (/home/milian/projects/src/perf-tests/inlining) 20510 __libc_start_main (/usr/lib/libc-2.25.so) bd9 _start (/home/milian/projects/src/perf-tests/inlining) After: 214f7 __hypot_finite (/usr/lib/libm-2.25.so) ace3 hypot (/usr/lib/libm-2.25.so) a4a std::__complex_abs (inlined) a4a std::abs (inlined) a4a std::_Norm_helper::_S_do_it (inlined) a4a std::norm (inlined) a4a main (/home/milian/projects/src/perf-tests/inlining) 20510 __libc_start_main (/usr/lib/libc-2.25.so) bd9 _start (/home/milian/projects/src/perf-tests/inlining) Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-9-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel_fprintf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index f2c6c5ee11e8..5b9e89257aa7 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -157,7 +157,7 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, } } - if (print_dso) { + if (print_dso && (!node->sym || !node->sym->inlined)) { printed += fprintf(fp, " ("); printed += map__fprintf_dsoname(node->map, fp); printed += fprintf(fp, ")"); @@ -166,6 +166,9 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, if (print_srcline) printed += map__fprintf_srcline(node->map, addr, "\n ", fp); + if (node->sym && node->sym->inlined) + printed += fprintf(fp, " (inlined)"); + if (!print_oneline) printed += fprintf(fp, "\n"); From 9856240ad3269f2fdab0b2fa4400ef8aab792061 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:33:03 +0200 Subject: [PATCH 0561/1085] perf callchain: Compare symbol name for inlined frames when matching The fake symbols we create for inlined frames will represent different functions but can use the symbol 
start address. This leads to issues when different inline branches all lead to the same function. Before: ~~~~~ $ perf report -s sym -i perf.inlining.data --inline --stdio -g function ... --38.86%--_start __libc_start_main main | --37.57%--std::norm (inlined) std::_Norm_helper::_S_do_it (inlined) | --36.36%--std::abs (inlined) std::__complex_abs (inlined) | --12.24%--std::linear_congruential_engine::operator() (inlined) std::__detail::__mod (inlined) std::__detail::_Mod::__calc (inlined) ~~~~~ Note that this backtrace representation is completely bogus. Complex abs does not call the linear congruential engine! It is just a side-effect of a longer inlined stack being appended to a shorter, different inlined stack, both of which originate in the same function (main). This patch fixes the issue: ~~~~~ $ perf report -s sym -i perf.inlining.data --inline --stdio -g function ... --38.86%--_start __libc_start_main main | |--35.59%--std::uniform_real_distribution::operator() > (inlined) | std::uniform_real_distribution::operator() > (inlined) | | | --34.37%--std::__detail::_Adaptor, double>::operator() (inlined) | std::generate_canonical > (inlined) | | | --12.24%--std::linear_congruential_engine::operator() (inlined) | std::__detail::__mod (inlined) | std::__detail::_Mod::__calc (inlined) | --1.99%--std::norm (inlined) std::_Norm_helper::_S_do_it (inlined) std::abs (inlined) std::__complex_abs (inlined) ~~~~~ Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-10-milian.wolff@kdab.com Cc: Arnaldo Carvalho de Melo [ Fix up conflict with c1fbc0cf81f1 ("perf callchain: Compare dsos (as well) for CCKEY_FUNCTION"), remove unneeded hunk ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 77031efdca5c..35a920f09503 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -690,6 +690,14 @@ static enum match_result match_chain(struct callchain_cursor_node *node, } if (cnode->ms.sym && sym && callchain_param.key == CCKEY_FUNCTION) { + /* + * Compare inlined frames based on their symbol name because + * different inlined frames will have the same symbol start + */ + if (cnode->ms.sym->inlined || node->sym->inlined) + return match_chain_strings(cnode->ms.sym->name, + node->sym->name); + left = cnode->ms.sym->start; right = sym->start; left_dso = cnode->ms.map->dso; From aa441895f7b4ff5394d4964a8e6749f3866e44d0 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 9 Oct 2017 22:33:04 +0200 Subject: [PATCH 0562/1085] perf report: Compare symbol name for inlined frames when sorting Similar to the callstack frame matching, we also have to compare the symbol name when sorting hist entries. The reason is twofold: On one hand, multiple inlined functions will use the same symbol start/end values of the parent, non-inlined symbol. As such, all of these symbols often end up missing from top-level report, as they get merged with the non-inlined frame. On the other hand, multiple different functions may end up inlining the same function, and we need to aggregate these values properly. Before: ~~~~~ perf report --stdio --inline -g none # Children Self Command Shared Object Symbol # ........ ........ ............ ............. ................................... # 100.00% 39.69% cpp-inlining cpp-inlining [.] 
main 100.00% 0.00% cpp-inlining cpp-inlining [.] _start 100.00% 0.00% cpp-inlining libc-2.25.so [.] __libc_start_main 97.03% 0.00% cpp-inlining cpp-inlining [.] std::norm (inlined) 59.53% 4.26% cpp-inlining libm-2.25.so [.] hypot 55.21% 55.08% cpp-inlining libm-2.25.so [.] __hypot_finite 0.52% 0.52% cpp-inlining libm-2.25.so [.] cabs ~~~~~ After: ~~~~~ perf report --stdio --inline -g none # Children Self Command Shared Object Symbol # ........ ........ ............ ............. ................................................................................................................................... # 100.00% 39.69% cpp-inlining cpp-inlining [.] main 100.00% 0.00% cpp-inlining cpp-inlining [.] _start 100.00% 0.00% cpp-inlining libc-2.25.so [.] __libc_start_main 62.57% 0.00% cpp-inlining cpp-inlining [.] std::_Norm_helper::_S_do_it (inlined) 62.57% 0.00% cpp-inlining cpp-inlining [.] std::__complex_abs (inlined) 62.57% 0.00% cpp-inlining cpp-inlining [.] std::abs (inlined) 62.57% 0.00% cpp-inlining cpp-inlining [.] std::norm (inlined) 59.53% 4.26% cpp-inlining libm-2.25.so [.] hypot 55.21% 55.08% cpp-inlining libm-2.25.so [.] __hypot_finite 34.46% 0.00% cpp-inlining cpp-inlining [.] std::uniform_real_distribution::operator() > (inlined) 32.39% 0.00% cpp-inlining cpp-inlining [.] std::__detail::_Adaptor, double>::operator() (inlined) 32.39% 0.00% cpp-inlining cpp-inlining [.] std::generate_canonical > (inlined) 12.29% 0.00% cpp-inlining cpp-inlining [.] std::__detail::_Mod::__calc (inlined) 12.29% 0.00% cpp-inlining cpp-inlining [.] std::__detail::__mod (inlined) 12.29% 0.00% cpp-inlining cpp-inlining [.] std::linear_congruential_engine::operator() (inlined) 0.52% 0.52% cpp-inlining libm-2.25.so [.] cabs ~~~~~ Signed-off-by: Milian Wolff Reviewed-by: Jiri Olsa Reviewed-by: Namhyung Kim Cc: David Ahern Cc: Peter Zijlstra Cc: Yao Jin Link: http://lkml.kernel.org/r/20171009203310.17362-11-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index acb9210fd18a..006d10a0dc96 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -225,6 +225,9 @@ static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r) if (sym_l == sym_r) return 0; + if (sym_l->inlined || sym_r->inlined) + return strcmp(sym_l->name, sym_r->name); + if (sym_l->start != sym_r->start) return (int64_t)(sym_r->start - sym_l->start); From e0d02285f16e8d5810f3d5d5e8a5886ca0015d3b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:47 +0100 Subject: [PATCH 0563/1085] locking/qrwlock: Use 'struct qrwlock' instead of 'struct __qrwlock' There's no good reason to keep the internal structure of struct qrwlock hidden from qrwlock.h, particularly as it's actually needed for unlock and ends up being abstracted independently behind the __qrwlock_write_byte() function. Stop pretending we can hide this stuff, and move the __qrwlock definition into qrwlock, removing the __qrwlock_write_byte() nastiness and using the same struct definition everywhere instead. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Boqun Feng Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/asm-generic/qrwlock.h | 12 +----------- include/asm-generic/qrwlock_types.h | 15 +++++++++++++-- kernel/locking/qrwlock.c | 26 ++------------------------ 3 files changed, 16 insertions(+), 37 deletions(-) diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 50925327b0a8..02c0a768e6b0 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -128,23 +128,13 @@ static inline void queued_read_unlock(struct qrwlock *lock) (void)atomic_sub_return_release(_QR_BIAS, &lock->cnts); } -/** - * __qrwlock_write_byte - retrieve the write byte address of a queue rwlock - * @lock : Pointer to queue rwlock structure - * Return: the write byte address of a queue rwlock - */ -static inline u8 *__qrwlock_write_byte(struct qrwlock *lock) -{ - return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN); -} - /** * queued_write_unlock - release write lock of a queue rwlock * @lock : Pointer to queue rwlock structure */ static inline void queued_write_unlock(struct qrwlock *lock) { - smp_store_release(__qrwlock_write_byte(lock), 0); + smp_store_release(&lock->wmode, 0); } /* diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h index 0abc6b6062fb..507f2dc51bba 100644 --- a/include/asm-generic/qrwlock_types.h +++ b/include/asm-generic/qrwlock_types.h @@ -9,12 +9,23 @@ */ typedef struct qrwlock { - atomic_t cnts; + union { + atomic_t cnts; + struct { +#ifdef __LITTLE_ENDIAN + u8 wmode; /* Writer mode */ + u8 rcnts[3]; /* Reader counts */ +#else + u8 rcnts[3]; /* Reader counts */ + u8 wmode; /* Writer mode */ +#endif + }; + }; arch_spinlock_t wait_lock; } arch_rwlock_t; #define __ARCH_RW_LOCK_UNLOCKED { \ - .cnts = ATOMIC_INIT(0), \ + { .cnts = ATOMIC_INIT(0), }, \ .wait_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ } diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 2655f26ec882..1af791e37348 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -23,26 +23,6 @@ #include #include -/* - * This internal data structure is used for optimizing access to some of - * the subfields within the atomic_t cnts. - */ -struct __qrwlock { - union { - atomic_t cnts; - struct { -#ifdef __LITTLE_ENDIAN - u8 wmode; /* Writer mode */ - u8 rcnts[3]; /* Reader counts */ -#else - u8 rcnts[3]; /* Reader counts */ - u8 wmode; /* Writer mode */ -#endif - }; - }; - arch_spinlock_t lock; -}; - /** * rspin_until_writer_unlock - inc reader count & spin until writer is gone * @lock : Pointer to queue rwlock structure @@ -125,10 +105,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock) * or wait for a previous writer to go away. */ for (;;) { - struct __qrwlock *l = (struct __qrwlock *)lock; - - if (!READ_ONCE(l->wmode) && - (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) + if (!READ_ONCE(lock->wmode) && + (cmpxchg_relaxed(&lock->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax(); From 4df714be4dcf40bfb0d4af0f851a6e1977afa02e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:48 +0100 Subject: [PATCH 0564/1085] locking/atomic: Add atomic_cond_read_acquire() smp_cond_load_acquire() provides a way to spin on a variable with acquire semantics until some conditional expression involving the variable is satisfied. 
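As an illustration, once atomic_cond_read_acquire() exists, a caller that needs to wait for a busy bit to clear could look like the sketch below. This is only a hedged sketch: the structure, the FLAG_BUSY bit and the function name are hypothetical and not part of this series.

~~~~~
#include <linux/atomic.h>

/* Sketch only: 'struct my_lock' and FLAG_BUSY are made up. */
struct my_lock {
	atomic_t state;
};

#define FLAG_BUSY	0x1

static void my_lock_wait_idle(struct my_lock *l)
{
	/*
	 * VAL names the value loaded on each iteration of the implied
	 * loop; the final load carries ACQUIRE semantics, so accesses
	 * in the caller's critical section cannot leak upwards.
	 */
	atomic_cond_read_acquire(&l->state, !(VAL & FLAG_BUSY));
}
~~~~~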
Architectures such as arm64 can potentially enter a low-power state, waking up only when the value of the variable changes, which reduces the system impact of tight polling loops. This patch makes the same interface available to users of atomic_t, atomic64_t and atomic_long_t, rather than require messy accesses to the structure internals. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Boqun Feng Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/asm-generic/atomic-long.h | 3 +++ include/linux/atomic.h | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h index 288cc9e96395..f2d97b782031 100644 --- a/include/asm-generic/atomic-long.h +++ b/include/asm-generic/atomic-long.h @@ -243,4 +243,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) #define atomic_long_inc_not_zero(l) \ ATOMIC_LONG_PFX(_inc_not_zero)((ATOMIC_LONG_PFX(_t) *)(l)) +#define atomic_long_cond_read_acquire(v, c) \ + ATOMIC_LONG_PFX(_cond_read_acquire)((ATOMIC_LONG_PFX(_t) *)(v), (c)) + #endif /* _ASM_GENERIC_ATOMIC_LONG_H */ diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 40d6bfec0e0d..0aeb2b3f4578 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -653,6 +653,8 @@ static inline int atomic_dec_if_positive(atomic_t *v) } #endif +#define atomic_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c)) + #ifdef CONFIG_GENERIC_ATOMIC64 #include #endif @@ -1072,6 +1074,8 @@ static inline long long atomic64_fetch_andnot_release(long long i, atomic64_t *v } #endif +#define atomic64_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c)) + #include #endif /* _LINUX_ATOMIC_H */ From b519b56e378ee82caf9b079b04f5db87dedc3251 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:49 +0100 Subject: [PATCH 0565/1085] locking/qrwlock: Use atomic_cond_read_acquire() when spinning in qrwlock The qrwlock slowpaths involve spinning when either a prospective reader is waiting for a concurrent writer to drain, or a prospective writer is waiting for concurrent readers to drain. In both of these situations, atomic_cond_read_acquire() can be used to avoid busy-waiting and make use of any backoff functionality provided by the architecture. This patch replaces the open-code loops and rspin_until_writer_unlock() implementation with atomic_cond_read_acquire(). The write mode transition zero to _QW_WAITING is left alone, since (a) this doesn't need acquire semantics and (b) should be fast. Tested-by: Waiman Long Tested-by: Jeremy Linton Tested-by: Adam Wallis Tested-by: Jan Glauber Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Boqun Feng Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/asm-generic/qrwlock.h | 4 +-- kernel/locking/qrwlock.c | 50 +++++++++-------------------------- 2 files changed, 14 insertions(+), 40 deletions(-) diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 02c0a768e6b0..c716b02e8fd7 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -49,7 +49,7 @@ /* * External function declarations */ -extern void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts); +extern void queued_read_lock_slowpath(struct qrwlock *lock); extern void queued_write_lock_slowpath(struct qrwlock *lock); /** @@ -100,7 +100,7 @@ static inline void queued_read_lock(struct qrwlock *lock) return; /* The slowpath will decrement the reader count, if necessary. */ - queued_read_lock_slowpath(lock, cnts); + queued_read_lock_slowpath(lock); } /** diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 1af791e37348..5825e0fc1a8e 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -23,29 +23,11 @@ #include #include -/** - * rspin_until_writer_unlock - inc reader count & spin until writer is gone - * @lock : Pointer to queue rwlock structure - * @writer: Current queue rwlock writer status byte - * - * In interrupt context or at the head of the queue, the reader will just - * increment the reader count & wait until the writer releases the lock. - */ -static __always_inline void -rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) -{ - while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax(); - cnts = atomic_read_acquire(&lock->cnts); - } -} - /** * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure - * @cnts: Current qrwlock lock value */ -void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) +void queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -53,13 +35,12 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) if (unlikely(in_interrupt())) { /* * Readers in interrupt context will get the lock immediately - * if the writer is just waiting (not holding the lock yet). - * The rspin_until_writer_unlock() function returns immediately - * in this case. Otherwise, they will spin (with ACQUIRE - * semantics) until the lock is available without waiting in - * the queue. + * if the writer is just waiting (not holding the lock yet), + * so spin with ACQUIRE semantics until the lock is available + * without waiting in the queue. */ - rspin_until_writer_unlock(lock, cnts); + atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) + != _QW_LOCKED); return; } atomic_sub(_QR_BIAS, &lock->cnts); @@ -68,14 +49,14 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * Put the reader into the wait queue */ arch_spin_lock(&lock->wait_lock); + atomic_add(_QR_BIAS, &lock->cnts); /* * The ACQUIRE semantics of the following spinning code ensure * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. 
*/ - cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); - rspin_until_writer_unlock(lock, cnts); + atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) != _QW_LOCKED); /* * Signal the next one in queue to become queue head @@ -90,8 +71,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); */ void queued_write_lock_slowpath(struct qrwlock *lock) { - u32 cnts; - /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -113,15 +92,10 @@ void queued_write_lock_slowpath(struct qrwlock *lock) } /* When no more readers, set the locked flag */ - for (;;) { - cnts = atomic_read(&lock->cnts); - if ((cnts == _QW_WAITING) && - (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) - break; - - cpu_relax(); - } + do { + atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); + } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, + _QW_LOCKED) != _QW_WAITING); unlock: arch_spin_unlock(&lock->wait_lock); } From 087133ac90763cd339b6b67f2998f87dcc136c52 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:50 +0100 Subject: [PATCH 0566/1085] locking/qrwlock, arm64: Move rwlock implementation over to qrwlocks Now that the qrwlock can make use of WFE, remove our homebrewed rwlock code in favour of the generic queued implementation. Tested-by: Waiman Long Tested-by: Jeremy Linton Tested-by: Adam Wallis Tested-by: Jan Glauber Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: linux-arm-kernel@lists.infradead.org Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507810851-306-5-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/arm64/Kconfig | 17 +++ arch/arm64/include/asm/Kbuild | 1 + arch/arm64/include/asm/spinlock.h | 164 +----------------------- arch/arm64/include/asm/spinlock_types.h | 6 +- 4 files changed, 20 insertions(+), 168 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0df64a6a56d4..df02ad932020 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -22,7 +22,24 @@ config ARM64 select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_NMI_SAFE_CMPXCHG if ACPI_APEI_SEA + select ARCH_INLINE_READ_LOCK if !PREEMPT + select ARCH_INLINE_READ_LOCK_BH if !PREEMPT + select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPT + select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPT + select ARCH_INLINE_READ_UNLOCK if !PREEMPT + select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPT + select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPT + select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_INLINE_WRITE_LOCK if !PREEMPT + select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPT + select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPT + select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPT + select ARCH_INLINE_WRITE_UNLOCK if !PREEMPT + select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPT + select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPT + select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPT select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_QUEUED_RWLOCKS select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 2326e39d5892..e63d0a8312de 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += msi.h generic-y += preempt.h +generic-y += 
qrwlock.h generic-y += rwsem.h generic-y += segment.h generic-y += serial.h diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index aa51a38e46e4..fdb827c7832f 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -137,169 +137,7 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock) } #define arch_spin_is_contended arch_spin_is_contended -/* - * Write lock implementation. - * - * Write locks set bit 31. Unlocking, is done by writing 0 since the lock is - * exclusively held. - * - * The memory barriers are implicit with the load-acquire and store-release - * instructions. - */ - -static inline void arch_write_lock(arch_rwlock_t *rw) -{ - unsigned int tmp; - - asm volatile(ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - " sevl\n" - "1: wfe\n" - "2: ldaxr %w0, %1\n" - " cbnz %w0, 1b\n" - " stxr %w0, %w2, %1\n" - " cbnz %w0, 2b\n" - __nops(1), - /* LSE atomics */ - "1: mov %w0, wzr\n" - "2: casa %w0, %w2, %1\n" - " cbz %w0, 3f\n" - " ldxr %w0, %1\n" - " cbz %w0, 2b\n" - " wfe\n" - " b 1b\n" - "3:") - : "=&r" (tmp), "+Q" (rw->lock) - : "r" (0x80000000) - : "memory"); -} - -static inline int arch_write_trylock(arch_rwlock_t *rw) -{ - unsigned int tmp; - - asm volatile(ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - "1: ldaxr %w0, %1\n" - " cbnz %w0, 2f\n" - " stxr %w0, %w2, %1\n" - " cbnz %w0, 1b\n" - "2:", - /* LSE atomics */ - " mov %w0, wzr\n" - " casa %w0, %w2, %1\n" - __nops(2)) - : "=&r" (tmp), "+Q" (rw->lock) - : "r" (0x80000000) - : "memory"); - - return !tmp; -} - -static inline void arch_write_unlock(arch_rwlock_t *rw) -{ - asm volatile(ARM64_LSE_ATOMIC_INSN( - " stlr wzr, %0", - " swpl wzr, wzr, %0") - : "=Q" (rw->lock) :: "memory"); -} - -/* write_can_lock - would write_trylock() succeed? */ -#define arch_write_can_lock(x) ((x)->lock == 0) - -/* - * Read lock implementation. - * - * It exclusively loads the lock value, increments it and stores the new value - * back if positive and the CPU still exclusively owns the location. If the - * value is negative, the lock is already held. - * - * During unlocking there may be multiple active read locks but no write lock. - * - * The memory barriers are implicit with the load-acquire and store-release - * instructions. - * - * Note that in UNDEFINED cases, such as unlocking a lock twice, the LL/SC - * and LSE implementations may exhibit different behaviour (although this - * will have no effect on lockdep). 
- */ -static inline void arch_read_lock(arch_rwlock_t *rw) -{ - unsigned int tmp, tmp2; - - asm volatile( - " sevl\n" - ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - "1: wfe\n" - "2: ldaxr %w0, %2\n" - " add %w0, %w0, #1\n" - " tbnz %w0, #31, 1b\n" - " stxr %w1, %w0, %2\n" - " cbnz %w1, 2b\n" - __nops(1), - /* LSE atomics */ - "1: wfe\n" - "2: ldxr %w0, %2\n" - " adds %w1, %w0, #1\n" - " tbnz %w1, #31, 1b\n" - " casa %w0, %w1, %2\n" - " sbc %w0, %w1, %w0\n" - " cbnz %w0, 2b") - : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock) - : - : "cc", "memory"); -} - -static inline void arch_read_unlock(arch_rwlock_t *rw) -{ - unsigned int tmp, tmp2; - - asm volatile(ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - "1: ldxr %w0, %2\n" - " sub %w0, %w0, #1\n" - " stlxr %w1, %w0, %2\n" - " cbnz %w1, 1b", - /* LSE atomics */ - " movn %w0, #0\n" - " staddl %w0, %2\n" - __nops(2)) - : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock) - : - : "memory"); -} - -static inline int arch_read_trylock(arch_rwlock_t *rw) -{ - unsigned int tmp, tmp2; - - asm volatile(ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - " mov %w1, #1\n" - "1: ldaxr %w0, %2\n" - " add %w0, %w0, #1\n" - " tbnz %w0, #31, 2f\n" - " stxr %w1, %w0, %2\n" - " cbnz %w1, 1b\n" - "2:", - /* LSE atomics */ - " ldr %w0, %2\n" - " adds %w1, %w0, #1\n" - " tbnz %w1, #31, 1f\n" - " casa %w0, %w1, %2\n" - " sbc %w1, %w1, %w0\n" - __nops(1) - "1:") - : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock) - : - : "cc", "memory"); - - return !tmp2; -} - -/* read_can_lock - would read_trylock() succeed? */ -#define arch_read_can_lock(x) ((x)->lock < 0x80000000) +#include /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h index 55be59a35e3f..6b856012c51b 100644 --- a/arch/arm64/include/asm/spinlock_types.h +++ b/arch/arm64/include/asm/spinlock_types.h @@ -36,10 +36,6 @@ typedef struct { #define __ARCH_SPIN_LOCK_UNLOCKED { 0 , 0 } -typedef struct { - volatile unsigned int lock; -} arch_rwlock_t; - -#define __ARCH_RW_LOCK_UNLOCKED { 0 } +#include #endif From d133166146333e1f13fc81c0e6c43c8d99290a8a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:51 +0100 Subject: [PATCH 0567/1085] locking/qrwlock: Prevent slowpath writers getting held up by fastpath When a prospective writer takes the qrwlock locking slowpath due to the lock being held, it attempts to cmpxchg the wmode field from 0 to _QW_WAITING so that concurrent lockers also take the slowpath and queue on the spinlock accordingly, allowing the lockers to drain. Unfortunately, this isn't fair, because a fastpath writer that comes in after the lock is made available but before the _QW_WAITING flag is set can effectively jump the queue. If there is a steady stream of prospective writers, then the waiter will be held off indefinitely. This patch restores fairness by separating _QW_WAITING and _QW_LOCKED into two distinct fields: _QW_LOCKED continues to occupy the bottom byte of the lockword so that it can be cleared unconditionally when unlocking, but _QW_WAITING now occupies what used to be the bottom bit of the reader count. This then forces the slow-path for concurrent lockers. Tested-by: Waiman Long Tested-by: Jeremy Linton Tested-by: Adam Wallis Tested-by: Jan Glauber Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Boqun Feng Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-6-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/asm-generic/qrwlock.h | 23 +++++------------------ include/asm-generic/qrwlock_types.h | 8 ++++---- kernel/locking/qrwlock.c | 20 +++++--------------- 3 files changed, 14 insertions(+), 37 deletions(-) diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index c716b02e8fd7..0f7062bd55e5 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -26,24 +26,11 @@ /* * Writer states & reader shift and bias. - * - * | +0 | +1 | +2 | +3 | - * ----+----+----+----+----+ - * LE | 78 | 56 | 34 | 12 | 0x12345678 - * ----+----+----+----+----+ - * | wr | rd | - * +----+----+----+----+ - * - * ----+----+----+----+----+ - * BE | 12 | 34 | 56 | 78 | 0x12345678 - * ----+----+----+----+----+ - * | rd | wr | - * +----+----+----+----+ */ -#define _QW_WAITING 1 /* A writer is waiting */ -#define _QW_LOCKED 0xff /* A writer holds the lock */ -#define _QW_WMASK 0xff /* Writer mask */ -#define _QR_SHIFT 8 /* Reader count shift */ +#define _QW_WAITING 0x100 /* A writer is waiting */ +#define _QW_LOCKED 0x0ff /* A writer holds the lock */ +#define _QW_WMASK 0x1ff /* Writer mask */ +#define _QR_SHIFT 9 /* Reader count shift */ #define _QR_BIAS (1U << _QR_SHIFT) /* @@ -134,7 +121,7 @@ static inline void queued_read_unlock(struct qrwlock *lock) */ static inline void queued_write_unlock(struct qrwlock *lock) { - smp_store_release(&lock->wmode, 0); + smp_store_release(&lock->wlocked, 0); } /* diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h index 507f2dc51bba..8af752acbdc0 100644 --- a/include/asm-generic/qrwlock_types.h +++ b/include/asm-generic/qrwlock_types.h @@ -13,11 +13,11 @@ typedef struct qrwlock { atomic_t cnts; struct { #ifdef __LITTLE_ENDIAN - u8 wmode; /* Writer mode */ - u8 rcnts[3]; /* Reader counts */ + u8 wlocked; /* Locked for write? */ + u8 __lstate[3]; #else - u8 rcnts[3]; /* Reader counts */ - u8 wmode; /* Writer mode */ + u8 __lstate[3]; + u8 wlocked; /* Locked for write? */ #endif }; }; diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 5825e0fc1a8e..c7471c3fb798 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -39,8 +39,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock) * so spin with ACQUIRE semantics until the lock is available * without waiting in the queue. */ - atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) - != _QW_LOCKED); + atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED)); return; } atomic_sub(_QR_BIAS, &lock->cnts); @@ -56,7 +55,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock) * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) != _QW_LOCKED); + atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED)); /* * Signal the next one in queue to become queue head @@ -79,19 +78,10 @@ void queued_write_lock_slowpath(struct qrwlock *lock) (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; - /* - * Set the waiting flag to notify readers that a writer is pending, - * or wait for a previous writer to go away. 
- */ - for (;;) { - if (!READ_ONCE(lock->wmode) && - (cmpxchg_relaxed(&lock->wmode, 0, _QW_WAITING) == 0)) - break; + /* Set the waiting flag to notify readers that a writer is pending */ + atomic_add(_QW_WAITING, &lock->cnts); - cpu_relax(); - } - - /* When no more readers, set the locked flag */ + /* When no more readers or writers, set the locked flag */ do { atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, From d3e632f07b6e637d3643081873867ecb6433b2ce Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:11 -0700 Subject: [PATCH 0568/1085] locking/atomics, dm-integrity: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't pick up some uses, including those in dm-integrity.c. As a preparatory step, this patch converts the driver to use {READ,WRITE}_ONCE() consistently. At the same time, this patch adds the missing include of necessary for the {READ,WRITE}_ONCE() definitions. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Mike Snitzer Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-1-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- drivers/md/dm-integrity.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 096fe9b66c50..8c5756e1df94 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -6,6 +6,7 @@ * This file is released under the GPL. 
*/ +#include #include #include #include @@ -80,13 +81,13 @@ struct journal_entry { #define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block]) #if BITS_PER_LONG == 64 -#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) +#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0) #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #elif defined(CONFIG_LBDAF) -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #else -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0) #define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) #endif #define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) @@ -320,7 +321,7 @@ static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, in static int dm_integrity_failed(struct dm_integrity_c *ic) { - return ACCESS_ONCE(ic->failed); + return READ_ONCE(ic->failed); } static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, @@ -1545,7 +1546,7 @@ retry_kmap: smp_mb(); if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) wake_up(&ic->copy_to_journal_wait); - if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { + if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { queue_work(ic->commit_wq, &ic->commit_work); } else { schedule_autocommit(ic); @@ -1798,7 +1799,7 @@ static void integrity_commit(struct work_struct *w) ic->n_committed_sections += commit_sections; spin_unlock_irq(&ic->endio_wait.lock); - if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) + if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) queue_work(ic->writer_wq, &ic->writer_work); release_flush_bios: @@ -1980,7 +1981,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (ACCESS_ONCE(ic->suspending)) + if (READ_ONCE(ic->suspending)) return; spin_lock_irq(&ic->endio_wait.lock); From 332efa6374de75c622b782569b16bb50fd722cf5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:12 -0700 Subject: [PATCH 0569/1085] locking/atomics, EDAC/altera: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. 
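To make the read/write distinction concrete, the shape of the conversion is roughly the following; this is a hand-written sketch rather than script output, and 'flag' is a hypothetical shared variable:

~~~~~
int flag;	/* hypothetical shared variable */
int v;

/* Before: one macro covers both loads and stores. */
ACCESS_ONCE(flag) = 1;
v = ACCESS_ONCE(flag);

/* After: instrumentation can tell the store from the load. */
WRITE_ONCE(flag, 1);
v = READ_ONCE(flag);
~~~~~

The semantic patch below performs exactly this rewrite mechanically.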
However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the Altera EDAC code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Acked-by: Thor Thayer Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-2-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- drivers/edac/altera_edac.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 346c4987b284..11d6419788c2 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -175,11 +175,11 @@ static ssize_t altr_sdr_mc_err_inject_write(struct file *file, /* * To trigger the error, we need to read the data back * (the data was written with errors above). - * The ACCESS_ONCE macros and printk are used to prevent the + * The READ_ONCE macros and printk are used to prevent the * the compiler optimizing these reads out. */ - reg = ACCESS_ONCE(ptemp[0]); - read_reg = ACCESS_ONCE(ptemp[1]); + reg = READ_ONCE(ptemp[0]); + read_reg = READ_ONCE(ptemp[1]); /* Force Read */ rmb(); @@ -618,7 +618,7 @@ static ssize_t altr_edac_device_trig(struct file *file, for (i = 0; i < (priv->trig_alloc_sz / sizeof(*ptemp)); i++) { /* Read data so we're in the correct state */ rmb(); - if (ACCESS_ONCE(ptemp[i])) + if (READ_ONCE(ptemp[i])) result = -1; /* Toggle Error bit (it is latched), leave ECC enabled */ writel(error_mask, (drvdata->base + priv->set_err_ofst)); @@ -635,7 +635,7 @@ static ssize_t altr_edac_device_trig(struct file *file, /* Read out written data. ECC error caused here */ for (i = 0; i < ALTR_TRIGGER_READ_WRD_CNT; i++) - if (ACCESS_ONCE(ptemp[i]) != i) + if (READ_ONCE(ptemp[i]) != i) edac_printk(KERN_ERR, EDAC_DEVICE, "Read doesn't match written data\n"); From eeafcc5a5925d0819dad462eac1d42fda5fbe36f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:13 -0700 Subject: [PATCH 0570/1085] locking/atomics, firmware/ivc: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() workqueue: kill off ACCESS_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the Tegra IVC code and comments to use {READ,WRITE}_ONCE() consistently. 
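The counter-advance helpers show why a converted read-modify-write needs both macros. As a simplified sketch of the tegra_ivc_advance_tx() change in this patch, with 'c' standing in for ivc->tx.channel->tx.count:

~~~~~
/* Before: both the load and the store hide behind ACCESS_ONCE(). */
ACCESS_ONCE(c) = ACCESS_ONCE(c) + 1;

/* After: the load and the store are annotated separately. */
WRITE_ONCE(c, READ_ONCE(c) + 1);
~~~~~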
---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Jonathan Hunter Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thierry Reding Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-3-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- drivers/firmware/tegra/ivc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/firmware/tegra/ivc.c b/drivers/firmware/tegra/ivc.c index a01461d63f68..00de793e6423 100644 --- a/drivers/firmware/tegra/ivc.c +++ b/drivers/firmware/tegra/ivc.c @@ -99,11 +99,11 @@ static inline bool tegra_ivc_empty(struct tegra_ivc *ivc, { /* * This function performs multiple checks on the same values with - * security implications, so create snapshots with ACCESS_ONCE() to + * security implications, so create snapshots with READ_ONCE() to * ensure that these checks use the same values. */ - u32 tx = ACCESS_ONCE(header->tx.count); - u32 rx = ACCESS_ONCE(header->rx.count); + u32 tx = READ_ONCE(header->tx.count); + u32 rx = READ_ONCE(header->rx.count); /* * Perform an over-full check to prevent denial of service attacks @@ -124,8 +124,8 @@ static inline bool tegra_ivc_empty(struct tegra_ivc *ivc, static inline bool tegra_ivc_full(struct tegra_ivc *ivc, struct tegra_ivc_header *header) { - u32 tx = ACCESS_ONCE(header->tx.count); - u32 rx = ACCESS_ONCE(header->rx.count); + u32 tx = READ_ONCE(header->tx.count); + u32 rx = READ_ONCE(header->rx.count); /* * Invalid cases where the counters indicate that the queue is over @@ -137,8 +137,8 @@ static inline bool tegra_ivc_full(struct tegra_ivc *ivc, static inline u32 tegra_ivc_available(struct tegra_ivc *ivc, struct tegra_ivc_header *header) { - u32 tx = ACCESS_ONCE(header->tx.count); - u32 rx = ACCESS_ONCE(header->rx.count); + u32 tx = READ_ONCE(header->tx.count); + u32 rx = READ_ONCE(header->rx.count); /* * This function isn't expected to be used in scenarios where an @@ -151,8 +151,8 @@ static inline u32 tegra_ivc_available(struct tegra_ivc *ivc, static inline void tegra_ivc_advance_tx(struct tegra_ivc *ivc) { - ACCESS_ONCE(ivc->tx.channel->tx.count) = - ACCESS_ONCE(ivc->tx.channel->tx.count) + 1; + WRITE_ONCE(ivc->tx.channel->tx.count, + READ_ONCE(ivc->tx.channel->tx.count) + 1); if (ivc->tx.position == ivc->num_frames - 1) ivc->tx.position = 0; @@ -162,8 +162,8 @@ static inline void tegra_ivc_advance_tx(struct tegra_ivc *ivc) static inline void tegra_ivc_advance_rx(struct tegra_ivc *ivc) { - ACCESS_ONCE(ivc->rx.channel->rx.count) = - ACCESS_ONCE(ivc->rx.channel->rx.count) + 1; + WRITE_ONCE(ivc->rx.channel->rx.count, + READ_ONCE(ivc->rx.channel->rx.count) + 1); if (ivc->rx.position == ivc->num_frames - 1) ivc->rx.position = 0; @@ -428,7 +428,7 @@ int tegra_ivc_notified(struct tegra_ivc *ivc) /* Copy the receiver's state out of shared memory. 
*/ tegra_ivc_invalidate(ivc, ivc->rx.phys + offset); - state = ACCESS_ONCE(ivc->rx.channel->tx.state); + state = READ_ONCE(ivc->rx.channel->tx.state); if (state == TEGRA_IVC_STATE_SYNC) { offset = offsetof(struct tegra_ivc_header, tx.count); From 66702eb59064f1077e89cce8e7e3cd48ec4b486c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:14 -0700 Subject: [PATCH 0571/1085] locking/atomics, fs/dcache: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the dcache code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Al Viro Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-4-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- fs/dcache.c | 18 +++++++++--------- include/linux/dcache.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 34c852af215c..bcc9f6981569 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -630,7 +630,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry) rcu_read_lock(); spin_unlock(&dentry->d_lock); again: - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); spin_lock(&parent->d_lock); /* * We can't blindly lock dentry until we are sure @@ -721,7 +721,7 @@ static inline bool fast_dput(struct dentry *dentry) * around with a zero refcount. */ smp_rmb(); - d_flags = ACCESS_ONCE(dentry->d_flags); + d_flags = READ_ONCE(dentry->d_flags); d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ @@ -850,11 +850,11 @@ struct dentry *dget_parent(struct dentry *dentry) * locking. */ rcu_read_lock(); - ret = ACCESS_ONCE(dentry->d_parent); + ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { - if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + if (likely(ret == READ_ONCE(dentry->d_parent))) return ret; dput(ret); } @@ -3040,7 +3040,7 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * @buflen: allocated length of the buffer * @name: name string and length qstr structure * - * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to + * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. 
However, there may be mismatch between length and pointer. * The length cannot be trusted, we need to copy it byte-by-byte until @@ -3054,8 +3054,8 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) */ static int prepend_name(char **buffer, int *buflen, const struct qstr *name) { - const char *dname = ACCESS_ONCE(name->name); - u32 dlen = ACCESS_ONCE(name->len); + const char *dname = READ_ONCE(name->name); + u32 dlen = READ_ONCE(name->len); char *p; smp_read_barrier_depends(); @@ -3120,7 +3120,7 @@ restart: struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + struct mount *parent = READ_ONCE(mnt->mnt_parent); /* Escaped? */ if (dentry != vfsmnt->mnt_root) { bptr = *buffer; @@ -3130,7 +3130,7 @@ restart: } /* Global root? */ if (mnt != parent) { - dentry = ACCESS_ONCE(mnt->mnt_mountpoint); + dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = parent; vfsmnt = &mnt->mnt; continue; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ed1a7cf6923a..1d8f5818f647 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -519,7 +519,7 @@ static inline struct inode *d_inode(const struct dentry *dentry) } /** - * d_inode_rcu - Get the actual inode of this dentry with ACCESS_ONCE() + * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes @@ -527,7 +527,7 @@ static inline struct inode *d_inode(const struct dentry *dentry) */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { - return ACCESS_ONCE(dentry->d_inode); + return READ_ONCE(dentry->d_inode); } /** From fd7048adb7a5f3c44154675e0e128fe0f2e16bef Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:15 -0700 Subject: [PATCH 0572/1085] locking/atomics, fs/ncpfs: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() The NCPFS code has some stale comments regarding ACCESS_ONCE() uses which were removed a long time ago. Let's remove the stale comments. Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Alexander Viro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Petr Vandrovec Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-5-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- fs/ncpfs/dir.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 088f52484d6e..72cfaa253a8f 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -119,10 +119,6 @@ static inline int ncp_case_sensitive(const struct inode *i) /* * Note: leave the hash unchanged if the directory * is case-sensitive. - * - * Accessing the parent inode can be racy under RCU pathwalking. - * Use ACCESS_ONCE() to make sure we use _one_ particular inode, - * the callers will handle races. */ static int ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) @@ -147,11 +143,6 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) return 0; } -/* - * Accessing the parent inode can be racy under RCU pathwalking. - * Use ACCESS_ONCE() to make sure we use _one_ particular inode, - * the callers will handle races. 
- */ static int ncp_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) From 24fbd6e04e4e4793ec9dd9c2e007b0b38d2ec3ab Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:16 -0700 Subject: [PATCH 0573/1085] locking/atomics, media/dvb_ringbuffer: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the DVB ringbuffer code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Sakari Ailus Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-6-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- drivers/media/dvb-core/dvb_ringbuffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/media/dvb-core/dvb_ringbuffer.c b/drivers/media/dvb-core/dvb_ringbuffer.c index 2322af1b8742..53011629c9ad 100644 --- a/drivers/media/dvb-core/dvb_ringbuffer.c +++ b/drivers/media/dvb-core/dvb_ringbuffer.c @@ -66,12 +66,12 @@ ssize_t dvb_ringbuffer_free(struct dvb_ringbuffer *rbuf) { ssize_t free; - /* ACCESS_ONCE() to load read pointer on writer side + /* READ_ONCE() to load read pointer on writer side * this pairs with smp_store_release() in dvb_ringbuffer_read(), * dvb_ringbuffer_read_user(), dvb_ringbuffer_flush(), * or dvb_ringbuffer_reset() */ - free = ACCESS_ONCE(rbuf->pread) - rbuf->pwrite; + free = READ_ONCE(rbuf->pread) - rbuf->pwrite; if (free <= 0) free += rbuf->size; return free-1; @@ -143,7 +143,7 @@ ssize_t dvb_ringbuffer_read_user(struct dvb_ringbuffer *rbuf, u8 __user *buf, si todo -= split; /* smp_store_release() for read pointer update to ensure * that buf is not overwritten until read is complete, - * this pairs with ACCESS_ONCE() in dvb_ringbuffer_free() + * this pairs with READ_ONCE() in dvb_ringbuffer_free() */ smp_store_release(&rbuf->pread, 0); } @@ -168,7 +168,7 @@ void dvb_ringbuffer_read(struct dvb_ringbuffer *rbuf, u8 *buf, size_t len) todo -= split; /* smp_store_release() for read pointer update to ensure * that buf is not overwritten until read is complete, - * this pairs with ACCESS_ONCE() in dvb_ringbuffer_free() + * this pairs with READ_ONCE() in dvb_ringbuffer_free() */ smp_store_release(&rbuf->pread, 0); } From 14cd5d4a0125f643350e7fa12f5384f1fc2d3e9d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:17 -0700 Subject: [PATCH 0574/1085] 
locking/atomics, net/netlink/netfilter: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts netlink and netfilter code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: David S. Miller Cc: Florian Westphal Cc: Jozsef Kadlecsik Cc: Linus Torvalds Cc: Pablo Neira Ayuso Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-7-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/genetlink.h | 2 +- include/linux/netfilter/nfnetlink.h | 2 +- include/linux/rtnetlink.h | 2 +- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/ipvs/ip_vs_sync.c | 2 +- net/netfilter/nfnetlink_queue.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h index a4c61cbce777..0e694cf62414 100644 --- a/include/linux/genetlink.h +++ b/include/linux/genetlink.h @@ -30,7 +30,7 @@ extern wait_queue_head_t genl_sk_destructing_waitq; * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because + * both the smp_read_barrier_depends() and the READ_ONCE(), because * caller holds genl mutex. */ #define genl_dereference(p) \ diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 41d04e9d088a..0f47a4aa7fc4 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -66,7 +66,7 @@ static inline bool lockdep_nfnl_is_held(__u8 subsys_id) * @ss: The nfnetlink subsystem ID * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because + * both the smp_read_barrier_depends() and the READ_ONCE(), because * caller holds the NFNL subsystem mutex. */ #define nfnl_dereference(p, ss) \ diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index dea59c8eec54..765f7b915475 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -67,7 +67,7 @@ static inline bool lockdep_rtnl_is_held(void) * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because + * both the smp_read_barrier_depends() and the READ_ONCE(), because * caller holds RTNL. 
*/ #define rtnl_dereference(p) \ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0f5b12a4ad09..5c68e279eaea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1164,8 +1164,8 @@ static inline u8 nft_genmask_next(const struct net *net) static inline u8 nft_genmask_cur(const struct net *net) { - /* Use ACCESS_ONCE() to prevent refetching the value for atomicity */ - return 1 << ACCESS_ONCE(net->nft.gencursor); + /* Use READ_ONCE() to prevent refetching the value for atomicity */ + return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 0e5b64a75da0..1cfffd42d1e2 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -457,7 +457,7 @@ static inline bool in_persistence(struct ip_vs_conn *cp) static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long orig = READ_ONCE(cp->sync_endtime); unsigned long now = jiffies; unsigned long n = (now + cp->timeout) & ~3UL; unsigned int sync_refresh_period; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index c9796629858f..a16356cacec3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -401,7 +401,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, outdev = entry->state.out; - switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { + switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: break; @@ -412,7 +412,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, skb_checksum_help(entskb)) return NULL; - data_len = ACCESS_ONCE(queue->copy_range); + data_len = READ_ONCE(queue->copy_range); if (data_len > entskb->len) data_len = entskb->len; From a9da6f29baf6b61fc94f9768638ee54e9bff173c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:18 -0700 Subject: [PATCH 0575/1085] locking/atomics, net/ipv4/tcp_input.c: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the IPv4 TCP input code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: David S. 
Miller Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-8-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7eec3383702b..74480e039005 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -815,12 +815,12 @@ static void tcp_update_pacing_rate(struct sock *sk) if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); - /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate + /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store * intermediate values in this location. */ - ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, - sk->sk_max_pacing_rate); + WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, + sk->sk_max_pacing_rate)); } /* Calculate rto without backoff. This is the second half of Van Jacobson's From ef4d9af62f47e3070b00c3307a4d8eb5092bb9a2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:19 -0700 Subject: [PATCH 0576/1085] locking/atomics, net/average: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't pick up some uses, including those in . As a preparatory step, this patch converts the file to use {READ,WRITE}_ONCE() consistently. At the same time, this patch addds missing includes necessary for {READ,WRITE}_ONCE(), *BUG_ON*(), and ilog2(). ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Reviewed-by: Johannes Berg Cc: David S. 
Miller Cc: Johannes Berg Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-9-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/average.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/average.h b/include/linux/average.h index 7ddaf340d2ac..3f462292269c 100644 --- a/include/linux/average.h +++ b/include/linux/average.h @@ -1,6 +1,10 @@ #ifndef _LINUX_AVERAGE_H #define _LINUX_AVERAGE_H +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/log2.h> + /* * Exponentially weighted moving average (EWMA) * @@ -48,7 +52,7 @@ static inline void ewma_##name##_add(struct ewma_##name *e, \ unsigned long val) \ { \ - unsigned long internal = ACCESS_ONCE(e->internal); \ + unsigned long internal = READ_ONCE(e->internal); \ unsigned long weight_rcp = ilog2(_weight_rcp); \ unsigned long precision = _precision; \ \ @@ -57,10 +61,10 @@ BUILD_BUG_ON((_precision) > 30); \ BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ \ - ACCESS_ONCE(e->internal) = internal ? \ + WRITE_ONCE(e->internal, internal ? \ (((internal << weight_rcp) - internal) + \ (val << precision)) >> weight_rcp : \ - (val << precision); \ + (val << precision)); \ } #endif /* _LINUX_AVERAGE_H */ From 94bbc9c1a0b0967509f8ad91323245aad30572f7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:20 -0700 Subject: [PATCH 0577/1085] locking/atomics, samples/mic/mpssd/mpssd.c: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. The bulk of the kernel code can be transformed via Coccinelle to use {READ,WRITE}_ONCE(), though this only modifies users of ACCESS_ONCE(), and not the implementation itself. As such, it has the potential to break homebrew ACCESS_ONCE() macros seen in some user code in the kernel tree (e.g. the virtio code, as fixed in commit ea9156fb3b71d9f7). To avoid fragility if/when that transformation occurs, and to align with the preferred usage of {READ,WRITE}_ONCE(), this patch updates the MPSSD sample code to use READ_ONCE() rather than ACCESS_ONCE(). There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Paul E. 
McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-10-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- samples/mic/mpssd/mpssd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c index 49db1def1721..f42ce551bb48 100644 --- a/samples/mic/mpssd/mpssd.c +++ b/samples/mic/mpssd/mpssd.c @@ -65,7 +65,7 @@ static struct mic_info mic_list; /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) #define GSO_ENABLED 1 #define MAX_GSO_SIZE (64 * 1024) @@ -382,7 +382,7 @@ disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy, static inline __u16 read_avail_idx(struct mic_vring *vr) { - return ACCESS_ONCE(vr->info->avail_idx); + return READ_ONCE(vr->info->avail_idx); } static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr, @@ -523,7 +523,7 @@ spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr) { __u16 avail_idx = read_avail_idx(vr); - while (avail_idx == le16toh(ACCESS_ONCE(vr->vr.avail->idx))) { + while (avail_idx == le16toh(READ_ONCE(vr->vr.avail->idx))) { #ifdef DEBUG mpsslog("%s %s waiting for desc avail %d info_avail %d\n", mic->name, __func__, From 564cbc87937d5de1b5b666b0ed68122d6c74fa07 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:21 -0700 Subject: [PATCH 0578/1085] locking/atomics, selftests/powerpc: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. The bulk of the kernel code can be transformed via Coccinelle to use {READ,WRITE}_ONCE(), though this only modifies users of ACCESS_ONCE(), and not the implementation itself. As such, it has the potential to break homebrew ACCESS_ONCE() macros seen in some user code in the kernel tree (e.g. the virtio code, as fixed in commit ea9156fb3b71d9f7). To avoid fragility if/when that transformation occurs, and to align with the preferred usage of {READ,WRITE}_ONCE(), this patch updates the DSCR selftest code to use READ_ONCE() rather than ACCESS_ONCE(). There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Paul E. 
McKenney Acked-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-11-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- tools/testing/selftests/powerpc/dscr/dscr.h | 2 +- tools/testing/selftests/powerpc/dscr/dscr_default_test.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/powerpc/dscr/dscr.h b/tools/testing/selftests/powerpc/dscr/dscr.h index 18ea223bd398..cdb840bc54f2 100644 --- a/tools/testing/selftests/powerpc/dscr/dscr.h +++ b/tools/testing/selftests/powerpc/dscr/dscr.h @@ -39,7 +39,7 @@ #define rmb() asm volatile("lwsync":::"memory") #define wmb() asm volatile("lwsync":::"memory") -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) /* Prilvilege state DSCR access */ inline unsigned long get_dscr(void) diff --git a/tools/testing/selftests/powerpc/dscr/dscr_default_test.c b/tools/testing/selftests/powerpc/dscr/dscr_default_test.c index df17c3bab0a7..9e1a37e93b63 100644 --- a/tools/testing/selftests/powerpc/dscr/dscr_default_test.c +++ b/tools/testing/selftests/powerpc/dscr/dscr_default_test.c @@ -27,7 +27,7 @@ static void *do_test(void *in) unsigned long d, cur_dscr, cur_dscr_usr; unsigned long s1, s2; - s1 = ACCESS_ONCE(sequence); + s1 = READ_ONCE(sequence); if (s1 & 1) continue; rmb(); From c95491ed6d6a958743d82bea1d053819988da418 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:22 -0700 Subject: [PATCH 0579/1085] locking/atomics, workqueue: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the workqueue code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. 
McKenney Acked-by: Tejun Heo Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-12-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..39831b2f3c5f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4647,7 +4647,7 @@ static void rebind_workers(struct worker_pool *pool) * concurrency management. Note that when or whether * @worker clears REBOUND doesn't affect correctness. * - * ACCESS_ONCE() is necessary because @worker->flags may be + * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in * wq_worker_waking_up(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency @@ -4656,7 +4656,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); worker_flags |= WORKER_REBOUND; worker_flags &= ~WORKER_UNBOUND; - ACCESS_ONCE(worker->flags) = worker_flags; + WRITE_ONCE(worker->flags, worker_flags); } spin_unlock_irq(&pool->lock); From 5cd38016d0c88e3f67528c1a7380a6d5c90859e9 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:23 -0700 Subject: [PATCH 0580/1085] locking/atomics, rcutorture/formal: Prepare for ACCESS_ONCE() removal For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. The bulk of the kernel code can be transformed via Coccinelle to use {READ,WRITE}_ONCE(), though this only modifies users of ACCESS_ONCE(), and not the implementation itself. As such, it has the potential to break homebrew ACCESS_ONCE() macros seen in some user code in the kernel tree (e.g. the virtio code, as fixed in commit ea9156fb3b71d9f7). To avoid fragility if/when that transformation occurs, this patch reworks the definitions of {READ,WRITE}_ONCE() in the rcutorture formal tests, and removes the unused ACCESS_ONCE() helper. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Paul E. 
McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-13-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- .../selftests/rcutorture/formal/srcu-cbmc/src/barriers.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h index 6687acc08e6d..cc27b9ebcf20 100644 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h @@ -34,8 +34,7 @@ #define rs_smp_mb() do {} while (0) #endif -#define ACCESS_ONCE(x) (*(volatile typeof(x) *) &(x)) -#define READ_ONCE(x) ACCESS_ONCE(x) -#define WRITE_ONCE(x, val) (ACCESS_ONCE(x) = (val)) +#define READ_ONCE(x) (*(volatile typeof(x) *) &(x)) +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) #endif From 3587679d93d0b0e4c31e5a2ad1dffdfcb77e8526 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Oct 2017 14:07:24 -0700 Subject: [PATCH 0581/1085] locking/atomics, doc/filesystems: Convert ACCESS_ONCE() references For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle documentation, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the filesystems documentation to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Paul E. McKenney Acked-by: Will Deacon Acked-by: Mark Rutland Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/1508792849-3115-14-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/filesystems/path-lookup.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/path-lookup.md b/Documentation/filesystems/path-lookup.md index 1b39e084a2b2..1933ef734e63 100644 --- a/Documentation/filesystems/path-lookup.md +++ b/Documentation/filesystems/path-lookup.md @@ -826,9 +826,9 @@ If the filesystem may need to revalidate dcache entries, then *is* passed the dentry but does not have access to the `inode` or the `seq` number from the `nameidata`, so it needs to be extra careful when accessing fields in the dentry. 
This "extra care" typically -involves using `ACCESS_ONCE()` or the newer [`READ_ONCE()`] to access -fields, and verifying the result is not NULL before using it. This -pattern can be see in `nfs_lookup_revalidate()`. +involves using [`READ_ONCE()`] to access fields, and verifying the +result is not NULL before using it. This pattern can be seen in +`nfs_lookup_revalidate()`. A pair of patterns ------------------ From b03a0fe0c5e4b46dcd400d27395b124499554a71 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Oct 2017 14:07:25 -0700 Subject: [PATCH 0582/1085] locking/atomics, mm: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the mm code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Paul E. McKenney Acked-by: Will Deacon Acked-by: Mark Rutland Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/1508792849-3115-15-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- mm/memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a728bed16c20..cae514e7dcfc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3891,9 +3891,9 @@ static int handle_pte_fault(struct vm_fault *vmf) /* * some architectures can have larger ptes than wordsize, * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and - * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee - * atomic accesses. The code below just needs a consistent - * view for the ifs and we later double check anyway with the + * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic + * accesses. The code below just needs a consistent view + * for the ifs and we later double check anyway with the * ptl lock held. So here a barrier will do. */ barrier(); From 6aa7de059173a986114ac43b8f50b297a86f09a8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:29 -0700 Subject: [PATCH 0583/1085] locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. 
However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/arc/kernel/smp.c | 2 +- arch/arm/include/asm/spinlock.h | 2 +- arch/arm/mach-tegra/cpuidle-tegra20.c | 2 +- arch/arm/vdso/vgettimeofday.c | 2 +- arch/ia64/include/asm/spinlock.h | 8 ++--- arch/mips/include/asm/vdso.h | 2 +- arch/mips/kernel/pm-cps.c | 2 +- arch/mn10300/kernel/mn10300-serial.c | 4 +-- arch/parisc/include/asm/atomic.h | 2 +- arch/powerpc/platforms/powernv/opal-msglog.c | 2 +- arch/s390/include/asm/spinlock.h | 6 ++-- arch/s390/lib/spinlock.c | 16 ++++----- arch/sparc/include/asm/atomic_32.h | 2 +- arch/tile/gxio/dma_queue.c | 4 +-- arch/tile/include/gxio/dma_queue.h | 2 +- arch/tile/kernel/ptrace.c | 2 +- arch/x86/entry/common.c | 2 +- arch/x86/entry/vdso/vclock_gettime.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/include/asm/vgtod.h | 2 +- arch/x86/kernel/espfix_64.c | 6 ++-- arch/x86/kernel/nmi.c | 2 +- arch/x86/kvm/mmu.c | 4 +-- arch/x86/kvm/page_track.c | 2 +- arch/x86/xen/p2m.c | 2 +- arch/xtensa/platforms/xtfpga/lcd.c | 14 ++++---- block/blk-wbt.c | 2 +- drivers/base/core.c | 2 +- drivers/base/power/runtime.c | 4 +-- drivers/char/random.c | 4 +-- drivers/clocksource/bcm2835_timer.c | 2 +- drivers/crypto/caam/jr.c | 4 +-- drivers/crypto/nx/nx-842-powernv.c | 2 +- drivers/firewire/ohci.c | 10 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 4 +-- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 2 +- drivers/gpu/drm/radeon/radeon_gem.c | 4 +-- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 2 +- drivers/infiniband/hw/hfi1/file_ops.c | 2 +- drivers/infiniband/hw/hfi1/pio.c | 6 ++-- drivers/infiniband/hw/hfi1/ruc.c | 2 +- drivers/infiniband/hw/hfi1/sdma.c | 8 ++--- drivers/infiniband/hw/hfi1/sdma.h | 2 +- drivers/infiniband/hw/hfi1/uc.c | 4 +-- drivers/infiniband/hw/hfi1/ud.c | 4 +-- drivers/infiniband/hw/hfi1/user_sdma.c | 8 ++--- drivers/infiniband/hw/qib/qib_ruc.c | 2 +- drivers/infiniband/hw/qib/qib_uc.c | 4 +-- drivers/infiniband/hw/qib/qib_ud.c | 4 +-- drivers/infiniband/sw/rdmavt/qp.c | 6 ++-- drivers/input/misc/regulator-haptic.c | 2 +- drivers/md/dm-bufio.c | 10 +++--- drivers/md/dm-kcopyd.c | 4 +-- drivers/md/dm-stats.c | 36 +++++++++---------- drivers/md/dm-switch.c | 2 +- drivers/md/dm-thin.c | 2 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm.c | 4 +-- drivers/md/md.c | 2 +- drivers/md/raid5.c | 2 +- drivers/misc/mic/scif/scif_rb.c | 8 ++--- drivers/misc/mic/scif/scif_rma_list.c | 2 +- drivers/net/bonding/bond_alb.c | 2 +- 
drivers/net/bonding/bond_main.c | 6 ++-- drivers/net/ethernet/chelsio/cxgb4/sge.c | 4 +-- drivers/net/ethernet/emulex/benet/be_main.c | 2 +- drivers/net/ethernet/hisilicon/hip04_eth.c | 4 +-- .../net/ethernet/intel/i40e/i40e_debugfs.c | 4 +-- .../net/ethernet/intel/i40e/i40e_ethtool.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 +-- drivers/net/ethernet/intel/i40e/i40e_ptp.c | 4 +-- drivers/net/ethernet/intel/igb/e1000_regs.h | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- .../net/ethernet/intel/ixgbe/ixgbe_common.h | 4 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++--- drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 4 +-- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/ethernet/intel/ixgbevf/vf.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 12 +++---- .../net/ethernet/neterion/vxge/vxge-main.c | 2 +- drivers/net/ethernet/sfc/ef10.c | 10 +++--- drivers/net/ethernet/sfc/efx.c | 4 +-- drivers/net/ethernet/sfc/falcon/efx.c | 4 +-- drivers/net/ethernet/sfc/falcon/falcon.c | 4 +-- drivers/net/ethernet/sfc/falcon/farch.c | 8 ++--- drivers/net/ethernet/sfc/falcon/nic.h | 6 ++-- drivers/net/ethernet/sfc/falcon/tx.c | 6 ++-- drivers/net/ethernet/sfc/farch.c | 8 ++--- drivers/net/ethernet/sfc/nic.h | 6 ++-- drivers/net/ethernet/sfc/ptp.c | 10 +++--- drivers/net/ethernet/sfc/tx.c | 6 ++-- drivers/net/ethernet/sun/niu.c | 4 +-- drivers/net/tap.c | 2 +- drivers/net/tun.c | 4 +-- drivers/net/wireless/ath/ath5k/desc.c | 8 ++--- .../broadcom/brcm80211/brcmfmac/sdio.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 4 +-- drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 2 +- .../net/wireless/intel/iwlwifi/pcie/trans.c | 10 +++--- drivers/net/wireless/mac80211_hwsim.c | 4 +-- drivers/scsi/qla2xxx/qla_target.c | 2 +- drivers/target/target_core_user.c | 2 +- drivers/usb/class/cdc-wdm.c | 2 +- drivers/usb/core/devio.c | 2 +- drivers/usb/core/sysfs.c | 4 +-- drivers/usb/gadget/udc/gr_udc.c | 4 +-- drivers/usb/host/ohci-hcd.c | 2 +- drivers/usb/host/uhci-hcd.h | 4 +-- drivers/vfio/vfio.c | 2 +- drivers/vhost/scsi.c | 2 +- fs/aio.c | 2 +- fs/buffer.c | 3 +- fs/crypto/keyinfo.c | 2 +- fs/direct-io.c | 2 +- fs/exec.c | 2 +- fs/fcntl.c | 2 +- fs/fs_pin.c | 4 +-- fs/fuse/dev.c | 2 +- fs/inode.c | 2 +- fs/namei.c | 4 +-- fs/namespace.c | 2 +- fs/nfs/dir.c | 8 ++--- fs/proc/array.c | 2 +- fs/proc_namespace.c | 2 +- fs/splice.c | 2 +- fs/userfaultfd.c | 8 ++--- fs/xfs/xfs_log_priv.h | 4 +-- include/linux/bitops.h | 4 +-- include/linux/dynamic_queue_limits.h | 2 +- include/linux/huge_mm.h | 2 +- include/linux/if_team.h | 2 +- include/linux/llist.h | 2 +- include/linux/pm_runtime.h | 2 +- include/net/ip_vs.h | 6 ++-- kernel/acct.c | 4 +-- kernel/events/core.c | 6 ++-- kernel/events/ring_buffer.c | 2 +- kernel/exit.c | 2 +- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_stack.c | 2 +- kernel/user_namespace.c | 2 +- lib/assoc_array.c | 20 +++++------ lib/dynamic_queue_limits.c | 2 +- lib/llist.c | 2 +- lib/vsprintf.c | 4 +-- mm/huge_memory.c | 2 +- net/core/dev.c | 2 +- net/core/pktgen.c | 2 +- net/ipv4/inet_fragment.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/udp.c | 4 +-- net/ipv6/ip6_tunnel.c | 8 ++--- net/ipv6/udp.c | 4 +-- net/llc/llc_input.c | 4 +-- net/mac80211/sta_info.c | 2 +- net/netlabel/netlabel_calipso.c | 2 +- net/wireless/nl80211.c | 2 +- sound/firewire/amdtp-am824.c | 6 ++-- sound/firewire/amdtp-stream.c | 23 ++++++------ 
sound/firewire/amdtp-stream.h | 2 +- sound/firewire/digi00x/amdtp-dot.c | 6 ++-- sound/firewire/fireface/amdtp-ff.c | 4 +-- sound/firewire/fireface/ff-midi.c | 10 +++--- sound/firewire/fireface/ff-transaction.c | 8 ++--- sound/firewire/isight.c | 18 +++++----- sound/firewire/motu/amdtp-motu.c | 4 +-- sound/firewire/oxfw/oxfw-scs1x.c | 12 +++---- sound/firewire/tascam/amdtp-tascam.c | 4 +-- sound/firewire/tascam/tascam-transaction.c | 6 ++-- sound/soc/xtensa/xtfpga-i2s.c | 6 ++-- sound/usb/bcd2000/bcd2000.c | 4 +-- tools/arch/x86/include/asm/atomic.h | 2 +- tools/include/asm-generic/atomic-gcc.h | 2 +- tools/perf/util/auxtrace.h | 4 +-- tools/perf/util/session.h | 2 +- virt/kvm/kvm_main.c | 2 +- 180 files changed, 383 insertions(+), 385 deletions(-) diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index f46267153ec2..94cabe73664b 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -245,7 +245,7 @@ static void ipi_send_msg_one(int cpu, enum ipi_msg_type msg) * and read back old value */ do { - new = old = ACCESS_ONCE(*ipi_data_ptr); + new = old = READ_ONCE(*ipi_data_ptr); new |= 1U << msg; } while (cmpxchg(ipi_data_ptr, old, new) != old); diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index daa87212c9a1..77f50ae0aeb4 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -71,7 +71,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) while (lockval.tickets.next != lockval.tickets.owner) { wfe(); - lockval.tickets.owner = ACCESS_ONCE(lock->tickets.owner); + lockval.tickets.owner = READ_ONCE(lock->tickets.owner); } smp_mb(); diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c index 76e4c83cd5c8..3f24addd7972 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra20.c +++ b/arch/arm/mach-tegra/cpuidle-tegra20.c @@ -179,7 +179,7 @@ static int tegra20_idle_lp2_coupled(struct cpuidle_device *dev, bool entered_lp2 = false; if (tegra_pending_sgi()) - ACCESS_ONCE(abort_flag) = true; + WRITE_ONCE(abort_flag, true); cpuidle_coupled_parallel_barrier(dev, &abort_barrier); diff --git a/arch/arm/vdso/vgettimeofday.c b/arch/arm/vdso/vgettimeofday.c index 79214d5ff097..a9dd619c6c29 100644 --- a/arch/arm/vdso/vgettimeofday.c +++ b/arch/arm/vdso/vgettimeofday.c @@ -35,7 +35,7 @@ static notrace u32 __vdso_read_begin(const struct vdso_data *vdata) { u32 seq; repeat: - seq = ACCESS_ONCE(vdata->seq_count); + seq = READ_ONCE(vdata->seq_count); if (seq & 1) { cpu_relax(); goto repeat; diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 35b31884863b..e98775be112d 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -61,7 +61,7 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { - int tmp = ACCESS_ONCE(lock->lock); + int tmp = READ_ONCE(lock->lock); if (!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK)) return ia64_cmpxchg(acq, &lock->lock, tmp, tmp + 1, sizeof (tmp)) == tmp; @@ -73,19 +73,19 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) unsigned short *p = (unsigned short *)&lock->lock + 1, tmp; asm volatile ("ld2.bias %0=[%1]" : "=r"(tmp) : "r"(p)); - ACCESS_ONCE(*p) = (tmp + 2) & ~1; + WRITE_ONCE(*p, (tmp + 2) & ~1); } static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { - long tmp = ACCESS_ONCE(lock->lock); + long tmp = READ_ONCE(lock->lock); return !!(((tmp >> TICKET_SHIFT) ^ tmp) & 
TICKET_MASK); } static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { - long tmp = ACCESS_ONCE(lock->lock); + long tmp = READ_ONCE(lock->lock); return ((tmp - (tmp >> TICKET_SHIFT)) & TICKET_MASK) > 1; } diff --git a/arch/mips/include/asm/vdso.h b/arch/mips/include/asm/vdso.h index b7cd6cf77b83..91bf0c2c265c 100644 --- a/arch/mips/include/asm/vdso.h +++ b/arch/mips/include/asm/vdso.h @@ -99,7 +99,7 @@ static inline u32 vdso_data_read_begin(const union mips_vdso_data *data) u32 seq; while (true) { - seq = ACCESS_ONCE(data->seq_count); + seq = READ_ONCE(data->seq_count); if (likely(!(seq & 1))) { /* Paired with smp_wmb() in vdso_data_write_*(). */ smp_rmb(); diff --git a/arch/mips/kernel/pm-cps.c b/arch/mips/kernel/pm-cps.c index 4655017f2377..1d2996cd58da 100644 --- a/arch/mips/kernel/pm-cps.c +++ b/arch/mips/kernel/pm-cps.c @@ -166,7 +166,7 @@ int cps_pm_enter_state(enum cps_pm_state state) nc_core_ready_count = nc_addr; /* Ensure ready_count is zero-initialised before the assembly runs */ - ACCESS_ONCE(*nc_core_ready_count) = 0; + WRITE_ONCE(*nc_core_ready_count, 0); coupled_barrier(&per_cpu(pm_barrier, core), online); /* Run the generated entry code */ diff --git a/arch/mn10300/kernel/mn10300-serial.c b/arch/mn10300/kernel/mn10300-serial.c index 7ecf69879e2d..d7ef1232a82a 100644 --- a/arch/mn10300/kernel/mn10300-serial.c +++ b/arch/mn10300/kernel/mn10300-serial.c @@ -543,7 +543,7 @@ static void mn10300_serial_receive_interrupt(struct mn10300_serial_port *port) try_again: /* pull chars out of the hat */ - ix = ACCESS_ONCE(port->rx_outp); + ix = READ_ONCE(port->rx_outp); if (CIRC_CNT(port->rx_inp, ix, MNSC_BUFFER_SIZE) == 0) { if (push && !tport->low_latency) tty_flip_buffer_push(tport); @@ -1724,7 +1724,7 @@ static int mn10300_serial_poll_get_char(struct uart_port *_port) if (mn10300_serial_int_tbl[port->rx_irq].port != NULL) { do { /* pull chars out of the hat */ - ix = ACCESS_ONCE(port->rx_outp); + ix = READ_ONCE(port->rx_outp); if (CIRC_CNT(port->rx_inp, ix, MNSC_BUFFER_SIZE) == 0) return NO_POLL_CHAR; diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 17b98a87e5e2..c57d4e8307f2 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -260,7 +260,7 @@ atomic64_set(atomic64_t *v, s64 i) static __inline__ s64 atomic64_read(const atomic64_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } #define atomic64_inc(v) (atomic64_add( 1,(v))) diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index 7a9cde0cfbd1..acd3206dfae3 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -43,7 +43,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) if (!opal_memcons) return -ENODEV; - out_pos = be32_to_cpu(ACCESS_ONCE(opal_memcons->out_pos)); + out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos)); /* Now we've read out_pos, put a barrier in before reading the new * data it points to in conbuf. 
*/ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 9fa855f91e55..66f4160010ef 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -117,14 +117,14 @@ extern int _raw_write_trylock_retry(arch_rwlock_t *lp); static inline int arch_read_trylock_once(arch_rwlock_t *rw) { - int old = ACCESS_ONCE(rw->lock); + int old = READ_ONCE(rw->lock); return likely(old >= 0 && __atomic_cmpxchg_bool(&rw->lock, old, old + 1)); } static inline int arch_write_trylock_once(arch_rwlock_t *rw) { - int old = ACCESS_ONCE(rw->lock); + int old = READ_ONCE(rw->lock); return likely(old == 0 && __atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000)); } @@ -211,7 +211,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) int old; do { - old = ACCESS_ONCE(rw->lock); + old = READ_ONCE(rw->lock); } while (!__atomic_cmpxchg_bool(&rw->lock, old, old - 1)); } diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index b12663d653d8..34e30b9ea234 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -162,8 +162,8 @@ void _raw_read_lock_wait(arch_rwlock_t *rw) smp_yield_cpu(~owner); count = spin_retry; } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); + old = READ_ONCE(rw->lock); + owner = READ_ONCE(rw->owner); if (old < 0) continue; if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1)) @@ -178,7 +178,7 @@ int _raw_read_trylock_retry(arch_rwlock_t *rw) int old; while (count-- > 0) { - old = ACCESS_ONCE(rw->lock); + old = READ_ONCE(rw->lock); if (old < 0) continue; if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1)) @@ -202,8 +202,8 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, int prev) smp_yield_cpu(~owner); count = spin_retry; } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); + old = READ_ONCE(rw->lock); + owner = READ_ONCE(rw->owner); smp_mb(); if (old >= 0) { prev = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR); @@ -230,8 +230,8 @@ void _raw_write_lock_wait(arch_rwlock_t *rw) smp_yield_cpu(~owner); count = spin_retry; } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); + old = READ_ONCE(rw->lock); + owner = READ_ONCE(rw->owner); if (old >= 0 && __atomic_cmpxchg_bool(&rw->lock, old, old | 0x80000000)) prev = old; @@ -251,7 +251,7 @@ int _raw_write_trylock_retry(arch_rwlock_t *rw) int old; while (count-- > 0) { - old = ACCESS_ONCE(rw->lock); + old = READ_ONCE(rw->lock); if (old) continue; if (__atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000)) diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 7643e979e333..e2f398e9456c 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -31,7 +31,7 @@ void atomic_set(atomic_t *, int); #define atomic_set_release(v, i) atomic_set((v), (i)) -#define atomic_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) #define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) #define atomic_sub(i, v) ((void)atomic_add_return(-(int)(i), (v))) diff --git a/arch/tile/gxio/dma_queue.c b/arch/tile/gxio/dma_queue.c index baa60357f8ba..b7ba577d82ca 100644 --- a/arch/tile/gxio/dma_queue.c +++ b/arch/tile/gxio/dma_queue.c @@ -163,14 +163,14 @@ int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue, int64_t completion_slot, int update) { if (update) { - if (ACCESS_ONCE(dma_queue->hw_complete_count) > + if (READ_ONCE(dma_queue->hw_complete_count) > completion_slot) return 1; __gxio_dma_queue_update_credits(dma_queue); } - return 
ACCESS_ONCE(dma_queue->hw_complete_count) > completion_slot; + return READ_ONCE(dma_queue->hw_complete_count) > completion_slot; } EXPORT_SYMBOL_GPL(__gxio_dma_queue_is_complete); diff --git a/arch/tile/include/gxio/dma_queue.h b/arch/tile/include/gxio/dma_queue.h index b9e45e37649e..c8fd47edba30 100644 --- a/arch/tile/include/gxio/dma_queue.h +++ b/arch/tile/include/gxio/dma_queue.h @@ -121,7 +121,7 @@ static inline int64_t __gxio_dma_queue_reserve(__gxio_dma_queue_t *dma_queue, * if the result is LESS than "hw_complete_count". */ uint64_t complete; - complete = ACCESS_ONCE(dma_queue->hw_complete_count); + complete = READ_ONCE(dma_queue->hw_complete_count); slot |= (complete & 0xffffffffff000000); if (slot < complete) slot += 0x1000000; diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c index e1a078e6828e..d516d61751c2 100644 --- a/arch/tile/kernel/ptrace.c +++ b/arch/tile/kernel/ptrace.c @@ -255,7 +255,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, int do_syscall_trace_enter(struct pt_regs *regs) { - u32 work = ACCESS_ONCE(current_thread_info()->flags); + u32 work = READ_ONCE(current_thread_info()->flags); if ((work & _TIF_SYSCALL_TRACE) && tracehook_report_syscall_entry(regs)) { diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03505ffbe1b6..eaa0ba66cf96 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -75,7 +75,7 @@ static long syscall_trace_enter(struct pt_regs *regs) if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; if (unlikely(work & _TIF_SYSCALL_EMU)) emulated = true; diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index fa8dbfcf7ed3..11b13c4b43d5 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -318,7 +318,7 @@ int gettimeofday(struct timeval *, struct timezone *) notrace time_t __vdso_time(time_t *t) { /* This is atomic on x86 so we don't need any locks. */ - time_t result = ACCESS_ONCE(gtod->wall_time_sec); + time_t result = READ_ONCE(gtod->wall_time_sec); if (t) *t = result; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 589af1eec7c1..140d33288e78 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2118,7 +2118,7 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } - if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + if (READ_ONCE(x86_pmu.attr_rdpmc)) event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; return err; diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 022e59714562..53dd162576a8 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -48,7 +48,7 @@ static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) unsigned ret; repeat: - ret = ACCESS_ONCE(s->seq); + ret = READ_ONCE(s->seq); if (unlikely(ret & 1)) { cpu_relax(); goto repeat; diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 9c4e7ba6870c..7d7715dde901 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -155,14 +155,14 @@ void init_espfix_ap(int cpu) page = cpu/ESPFIX_STACKS_PER_PAGE; /* Did another CPU already set this up? */ - stack_page = ACCESS_ONCE(espfix_pages[page]); + stack_page = READ_ONCE(espfix_pages[page]); if (likely(stack_page)) goto done; mutex_lock(&espfix_init_mutex); /* Did we race on the lock? 
*/ - stack_page = ACCESS_ONCE(espfix_pages[page]); + stack_page = READ_ONCE(espfix_pages[page]); if (stack_page) goto unlock_done; @@ -200,7 +200,7 @@ void init_espfix_ap(int cpu) set_pte(&pte_p[n*PTE_STRIDE], pte); /* Job is done for this CPU and any CPU which shares this page */ - ACCESS_ONCE(espfix_pages[page]) = stack_page; + WRITE_ONCE(espfix_pages[page], stack_page); unlock_done: mutex_unlock(&espfix_init_mutex); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 35aafc95e4b8..18bc9b51ac9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -105,7 +105,7 @@ static void nmi_max_handler(struct irq_work *w) { struct nmiaction *a = container_of(w, struct nmiaction, irq_work); int remainder_ns, decimal_msecs; - u64 whole_msecs = ACCESS_ONCE(a->max_duration); + u64 whole_msecs = READ_ONCE(a->max_duration); remainder_ns = do_div(whole_msecs, (1000 * 1000)); decimal_msecs = remainder_ns / 1000; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7a69cf053711..a119b361b8b7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -443,7 +443,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) static u64 __get_spte_lockless(u64 *sptep) { - return ACCESS_ONCE(*sptep); + return READ_ONCE(*sptep); } #else union split_spte { @@ -4819,7 +4819,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * If we don't have indirect shadow pages, it means no page is * write-protected, so we can exit simply. */ - if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; remote_flush = local_flush = false; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index ea67dc876316..01c1371f39f8 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -157,7 +157,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return false; index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); - return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); + return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } void kvm_page_track_cleanup(struct kvm *kvm) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 6083ba462f35..13b4f19b9131 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -547,7 +547,7 @@ int xen_alloc_p2m_entry(unsigned long pfn) if (p2m_top_mfn && pfn < MAX_P2M_PFN) { topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); + mid_mfn = READ_ONCE(p2m_top_mfn_p[topidx]); BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); diff --git a/arch/xtensa/platforms/xtfpga/lcd.c b/arch/xtensa/platforms/xtfpga/lcd.c index 4dc0c1b43f4b..2f7eb66c23ec 100644 --- a/arch/xtensa/platforms/xtfpga/lcd.c +++ b/arch/xtensa/platforms/xtfpga/lcd.c @@ -34,23 +34,23 @@ static void lcd_put_byte(u8 *addr, u8 data) { #ifdef CONFIG_XTFPGA_LCD_8BIT_ACCESS - ACCESS_ONCE(*addr) = data; + WRITE_ONCE(*addr, data); #else - ACCESS_ONCE(*addr) = data & 0xf0; - ACCESS_ONCE(*addr) = (data << 4) & 0xf0; + WRITE_ONCE(*addr, data & 0xf0); + WRITE_ONCE(*addr, (data << 4) & 0xf0); #endif } static int __init lcd_init(void) { - ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT; + WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT); mdelay(5); - ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT; + WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT); udelay(200); - ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT; + WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT); udelay(50); #ifndef CONFIG_XTFPGA_LCD_8BIT_ACCESS - 
ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE4BIT; + WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE4BIT); udelay(50); lcd_put_byte(LCD_INSTR_ADDR, LCD_DISPLAY_MODE4BIT); udelay(50); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6a9a0f03a67b..d822530e6aea 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -261,7 +261,7 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { - u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + u64 now, issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; diff --git a/drivers/base/core.c b/drivers/base/core.c index 12ebd055724c..4b8ba2a75a4d 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -668,7 +668,7 @@ const char *dev_driver_string(const struct device *dev) * so be careful about accessing it. dev->bus and dev->class should * never change once they are set, so they don't need special care. */ - drv = ACCESS_ONCE(dev->driver); + drv = READ_ONCE(dev->driver); return drv ? drv->name : (dev->bus ? dev->bus->name : (dev->class ? dev->class->name : "")); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 7bcf80fa9ada..41d7c2b99f69 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -134,11 +134,11 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev) if (!dev->power.use_autosuspend) goto out; - autosuspend_delay = ACCESS_ONCE(dev->power.autosuspend_delay); + autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay); if (autosuspend_delay < 0) goto out; - last_busy = ACCESS_ONCE(dev->power.last_busy); + last_busy = READ_ONCE(dev->power.last_busy); elapsed = jiffies - last_busy; if (elapsed < 0) goto out; /* jiffies has wrapped around. */ diff --git a/drivers/char/random.c b/drivers/char/random.c index 8ad92707e45f..6c7ccac2679e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -641,7 +641,7 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) return; retry: - entropy_count = orig = ACCESS_ONCE(r->entropy_count); + entropy_count = orig = READ_ONCE(r->entropy_count); if (nfrac < 0) { /* Debit */ entropy_count += nfrac; @@ -1265,7 +1265,7 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min, /* Can we pull enough? 
*/ retry: - entropy_count = orig = ACCESS_ONCE(r->entropy_count); + entropy_count = orig = READ_ONCE(r->entropy_count); ibytes = nbytes; /* never pull more than available */ have_bytes = entropy_count >> (ENTROPY_SHIFT + 3); diff --git a/drivers/clocksource/bcm2835_timer.c b/drivers/clocksource/bcm2835_timer.c index 39e489a96ad7..60da2537bef9 100644 --- a/drivers/clocksource/bcm2835_timer.c +++ b/drivers/clocksource/bcm2835_timer.c @@ -71,7 +71,7 @@ static irqreturn_t bcm2835_time_interrupt(int irq, void *dev_id) if (readl_relaxed(timer->control) & timer->match_mask) { writel_relaxed(timer->match_mask, timer->control); - event_handler = ACCESS_ONCE(timer->evt.event_handler); + event_handler = READ_ONCE(timer->evt.event_handler); if (event_handler) event_handler(&timer->evt); return IRQ_HANDLED; diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c index d258953ff488..f4f258075b89 100644 --- a/drivers/crypto/caam/jr.c +++ b/drivers/crypto/caam/jr.c @@ -172,7 +172,7 @@ static void caam_jr_dequeue(unsigned long devarg) while (rd_reg32(&jrp->rregs->outring_used)) { - head = ACCESS_ONCE(jrp->head); + head = READ_ONCE(jrp->head); spin_lock(&jrp->outlock); @@ -341,7 +341,7 @@ int caam_jr_enqueue(struct device *dev, u32 *desc, spin_lock_bh(&jrp->inplock); head = jrp->head; - tail = ACCESS_ONCE(jrp->tail); + tail = READ_ONCE(jrp->tail); if (!rd_reg32(&jrp->rregs->inpring_avail) || CIRC_SPACE(head, tail, JOBR_DEPTH) <= 0) { diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c index 874ddf5e9087..0f20f5ec9617 100644 --- a/drivers/crypto/nx/nx-842-powernv.c +++ b/drivers/crypto/nx/nx-842-powernv.c @@ -193,7 +193,7 @@ static int wait_for_csb(struct nx842_workmem *wmem, ktime_t start = wmem->start, now = ktime_get(); ktime_t timeout = ktime_add_ms(start, CSB_WAIT_MAX); - while (!(ACCESS_ONCE(csb->flags) & CSB_V)) { + while (!(READ_ONCE(csb->flags) & CSB_V)) { cpu_relax(); now = ktime_get(); if (ktime_after(now, timeout)) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 8bf89267dc25..ccf52368a073 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -734,7 +734,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx, __le16 res_count, next_res_count; i = ar_first_buffer_index(ctx); - res_count = ACCESS_ONCE(ctx->descriptors[i].res_count); + res_count = READ_ONCE(ctx->descriptors[i].res_count); /* A buffer that is not yet completely filled must be the last one. */ while (i != last && res_count == 0) { @@ -742,8 +742,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx, /* Peek at the next descriptor. */ next_i = ar_next_buffer_index(i); rmb(); /* read descriptors in order */ - next_res_count = ACCESS_ONCE( - ctx->descriptors[next_i].res_count); + next_res_count = READ_ONCE(ctx->descriptors[next_i].res_count); /* * If the next descriptor is still empty, we must stop at this * descriptor. 
@@ -759,8 +758,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx, if (MAX_AR_PACKET_SIZE > PAGE_SIZE && i != last) { next_i = ar_next_buffer_index(next_i); rmb(); - next_res_count = ACCESS_ONCE( - ctx->descriptors[next_i].res_count); + next_res_count = READ_ONCE(ctx->descriptors[next_i].res_count); if (next_res_count != cpu_to_le16(PAGE_SIZE)) goto next_buffer_is_active; } @@ -2812,7 +2810,7 @@ static int handle_ir_buffer_fill(struct context *context, u32 buffer_dma; req_count = le16_to_cpu(last->req_count); - res_count = le16_to_cpu(ACCESS_ONCE(last->res_count)); + res_count = le16_to_cpu(READ_ONCE(last->res_count)); completed = req_count - res_count; buffer_dma = le32_to_cpu(last->data_address); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 333bad749067..303b5e099a98 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -260,7 +260,7 @@ static void amdgpu_fence_fallback(unsigned long arg) */ int amdgpu_fence_wait_empty(struct amdgpu_ring *ring) { - uint64_t seq = ACCESS_ONCE(ring->fence_drv.sync_seq); + uint64_t seq = READ_ONCE(ring->fence_drv.sync_seq); struct dma_fence *fence, **ptr; int r; @@ -300,7 +300,7 @@ unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring) amdgpu_fence_process(ring); emitted = 0x100000000ull; emitted -= atomic_read(&ring->fence_drv.last_seq); - emitted += ACCESS_ONCE(ring->fence_drv.sync_seq); + emitted += READ_ONCE(ring->fence_drv.sync_seq); return lower_32_bits(emitted); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 7171968f261e..6149a47fe63d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -788,11 +788,11 @@ static int amdgpu_debugfs_gem_bo_info(int id, void *ptr, void *data) seq_printf(m, "\t0x%08x: %12ld byte %s", id, amdgpu_bo_size(bo), placement); - offset = ACCESS_ONCE(bo->tbo.mem.start); + offset = READ_ONCE(bo->tbo.mem.start); if (offset != AMDGPU_BO_INVALID_OFFSET) seq_printf(m, " @ 0x%010Lx", offset); - pin_count = ACCESS_ONCE(bo->pin_count); + pin_count = READ_ONCE(bo->pin_count); if (pin_count) seq_printf(m, " pin count %d", pin_count); seq_printf(m, "\n"); diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 38cea6fb25a8..a25f6c72f219 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -187,7 +187,7 @@ static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity) if (kfifo_is_empty(&entity->job_queue)) return false; - if (ACCESS_ONCE(entity->dependency)) + if (READ_ONCE(entity->dependency)) return false; return true; diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 3386452bd2f0..cf3deb283da5 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -451,7 +451,7 @@ int radeon_gem_busy_ioctl(struct drm_device *dev, void *data, else r = 0; - cur_placement = ACCESS_ONCE(robj->tbo.mem.mem_type); + cur_placement = READ_ONCE(robj->tbo.mem.mem_type); args->domain = radeon_mem_type_to_domain(cur_placement); drm_gem_object_put_unlocked(gobj); return r; @@ -481,7 +481,7 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data, r = ret; /* Flush HDP cache via MMIO if necessary */ - cur_placement = ACCESS_ONCE(robj->tbo.mem.mem_type); + cur_placement = READ_ONCE(robj->tbo.mem.mem_type); if 
(rdev->asic->mmio_hdp_flush && radeon_mem_type_to_domain(cur_placement) == RADEON_GEM_DOMAIN_VRAM) robj->rdev->asic->mmio_hdp_flush(rdev); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index a552e4ea5440..6ac094ee8983 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -904,7 +904,7 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv, if (unlikely(drm_is_render_client(file_priv))) require_exist = true; - if (ACCESS_ONCE(vmw_fpriv(file_priv)->locked_master)) { + if (READ_ONCE(vmw_fpriv(file_priv)->locked_master)) { DRM_ERROR("Locked master refused legacy " "surface reference.\n"); return -EACCES; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index d9a1e9893136..97bea2e1aa6a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -380,7 +380,7 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, if (sc->flags & SCF_FROZEN) { wait_event_interruptible_timeout( dd->event_queue, - !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN), + !(READ_ONCE(dd->flags) & HFI1_FROZEN), msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); if (dd->flags & HFI1_FROZEN) return -ENOLCK; diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 7108a4b5e94c..75e740780285 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1423,14 +1423,14 @@ retry: goto done; } /* copy from receiver cache line and recalculate */ - sc->alloc_free = ACCESS_ONCE(sc->free); + sc->alloc_free = READ_ONCE(sc->free); avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free); if (blocks > avail) { /* still no room, actively update */ sc_release_update(sc); - sc->alloc_free = ACCESS_ONCE(sc->free); + sc->alloc_free = READ_ONCE(sc->free); trycount++; goto retry; } @@ -1667,7 +1667,7 @@ void sc_release_update(struct send_context *sc) /* call sent buffer callbacks */ code = -1; /* code not yet set */ - head = ACCESS_ONCE(sc->sr_head); /* snapshot the head */ + head = READ_ONCE(sc->sr_head); /* snapshot the head */ tail = sc->sr_tail; while (head != tail) { pbuf = &sc->sr[tail].pbuf; diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index b3291f0fde9a..a7fc664f0d4e 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -363,7 +363,7 @@ static void ruc_loopback(struct rvt_qp *sqp) again: smp_read_barrier_depends(); /* see post_one_send() */ - if (sqp->s_last == ACCESS_ONCE(sqp->s_head)) + if (sqp->s_last == READ_ONCE(sqp->s_head)) goto clr_busy; wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 6781bcdb10b3..08346d25441c 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1725,7 +1725,7 @@ retry: swhead = sde->descq_head & sde->sdma_mask; /* this code is really bad for cache line trading */ - swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; cnt = sde->descq_cnt; if (swhead < swtail) @@ -1872,7 +1872,7 @@ retry: if ((status & sde->idle_mask) && !idle_check_done) { u16 swtail; - swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; if (swtail != hwhead) { hwhead = (u16)read_sde_csr(sde, SD(HEAD)); idle_check_done = 1; @@ -2222,7 +2222,7 @@ void sdma_seqfile_dump_sde(struct seq_file *s, struct 
sdma_engine *sde) u16 len; head = sde->descq_head & sde->sdma_mask; - tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + tail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; seq_printf(s, SDE_FMT, sde->this_idx, sde->cpu, sdma_state_name(sde->state.current_state), @@ -3305,7 +3305,7 @@ int sdma_ahg_alloc(struct sdma_engine *sde) return -EINVAL; } while (1) { - nr = ffz(ACCESS_ONCE(sde->ahg_bits)); + nr = ffz(READ_ONCE(sde->ahg_bits)); if (nr > 31) { trace_hfi1_ahg_allocate(sde, -ENOSPC); return -ENOSPC; diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 107011d8613b..374c59784950 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -445,7 +445,7 @@ static inline u16 sdma_descq_freecnt(struct sdma_engine *sde) { return sde->descq_cnt - (sde->descq_tail - - ACCESS_ONCE(sde->descq_head)) - 1; + READ_ONCE(sde->descq_head)) - 1; } static inline u16 sdma_descq_inprocess(struct sdma_engine *sde) diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 0b646173ca22..9a31c585427f 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -80,7 +80,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; /* We are in the error state, flush the work request. */ smp_read_barrier_depends(); /* see post_one_send() */ - if (qp->s_last == ACCESS_ONCE(qp->s_head)) + if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ if (iowait_sdma_pending(&priv->s_iowait)) { @@ -121,7 +121,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; /* Check if send work queue is empty. */ smp_read_barrier_depends(); /* see post_one_send() */ - if (qp->s_cur == ACCESS_ONCE(qp->s_head)) { + if (qp->s_cur == READ_ONCE(qp->s_head)) { clear_ahg(qp); goto bail; } diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 2ba74fdd6f15..7fec6b984e3e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -487,7 +487,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; /* We are in the error state, flush the work request. */ smp_read_barrier_depends(); /* see post_one_send */ - if (qp->s_last == ACCESS_ONCE(qp->s_head)) + if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ if (iowait_sdma_pending(&priv->s_iowait)) { @@ -501,7 +501,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* see post_one_send() */ smp_read_barrier_depends(); - if (qp->s_cur == ACCESS_ONCE(qp->s_head)) + if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; wqe = rvt_get_swqe_ptr(qp, qp->s_cur); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index c0c0e0445cbf..8ec6e8a8d6f7 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -276,7 +276,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, /* Wait until all requests have been freed. 
*/ wait_event_interruptible( pq->wait, - (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); + (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); kfree(pq->reqs); kfree(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); @@ -591,7 +591,7 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, if (ret != -EBUSY) { req->status = ret; WRITE_ONCE(req->has_error, 1); - if (ACCESS_ONCE(req->seqcomp) == + if (READ_ONCE(req->seqcomp) == req->seqsubmitted - 1) goto free_req; return ret; @@ -825,7 +825,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) */ if (req->data_len) { iovec = &req->iovs[req->iov_idx]; - if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) { + if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { if (++req->iov_idx == req->data_iovs) { ret = -EFAULT; goto free_txreq; @@ -1390,7 +1390,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) } else { if (status != SDMA_TXREQ_S_OK) req->status = status; - if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) && + if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) && (READ_ONCE(req->done) || READ_ONCE(req->has_error))) { user_sdma_free_request(req, false); diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 53efbb0b40c4..9a37e844d4c8 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -368,7 +368,7 @@ static void qib_ruc_loopback(struct rvt_qp *sqp) again: smp_read_barrier_depends(); /* see post_one_send() */ - if (sqp->s_last == ACCESS_ONCE(sqp->s_head)) + if (sqp->s_last == READ_ONCE(sqp->s_head)) goto clr_busy; wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 498e2202e72c..bddcc37ace44 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -61,7 +61,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; /* We are in the error state, flush the work request. */ smp_read_barrier_depends(); /* see post_one_send() */ - if (qp->s_last == ACCESS_ONCE(qp->s_head)) + if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ if (atomic_read(&priv->s_dma_busy)) { @@ -91,7 +91,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; /* Check if send work queue is empty. */ smp_read_barrier_depends(); /* see post_one_send() */ - if (qp->s_cur == ACCESS_ONCE(qp->s_head)) + if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; /* * Start a new request. diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index be4907453ac4..15962ed193ce 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -253,7 +253,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) goto bail; /* We are in the error state, flush the work request. */ smp_read_barrier_depends(); /* see post_one_send */ - if (qp->s_last == ACCESS_ONCE(qp->s_head)) + if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. 
*/ if (atomic_read(&priv->s_dma_busy)) { @@ -267,7 +267,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) /* see post_one_send() */ smp_read_barrier_depends(); - if (qp->s_cur == ACCESS_ONCE(qp->s_head)) + if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; wqe = rvt_get_swqe_ptr(qp, qp->s_cur); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 22df09ae809e..b670cb9d2006 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1073,7 +1073,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) rdi->driver_f.notify_error_qp(qp); /* Schedule the sending tasklet to drain the send work queue. */ - if (ACCESS_ONCE(qp->s_last) != qp->s_head) + if (READ_ONCE(qp->s_last) != qp->s_head) rdi->driver_f.schedule_send(qp); rvt_clear_mr_refs(qp, 0); @@ -1686,7 +1686,7 @@ static inline int rvt_qp_is_avail( if (likely(qp->s_avail)) return 0; smp_read_barrier_depends(); /* see rc.c */ - slast = ACCESS_ONCE(qp->s_last); + slast = READ_ONCE(qp->s_last); if (qp->s_head >= slast) avail = qp->s_size - (qp->s_head - slast); else @@ -1917,7 +1917,7 @@ int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, * ahead and kick the send engine into gear. Otherwise we will always * just schedule the send to happen later. */ - call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next; + call_send = qp->s_head == READ_ONCE(qp->s_last) && !wr->next; for (; wr; wr = wr->next) { err = rvt_post_one_wr(qp, wr, &call_send); diff --git a/drivers/input/misc/regulator-haptic.c b/drivers/input/misc/regulator-haptic.c index 2e8f801932be..a1db1e5040dc 100644 --- a/drivers/input/misc/regulator-haptic.c +++ b/drivers/input/misc/regulator-haptic.c @@ -233,7 +233,7 @@ static int __maybe_unused regulator_haptic_resume(struct device *dev) haptic->suspended = false; - magnitude = ACCESS_ONCE(haptic->magnitude); + magnitude = READ_ONCE(haptic->magnitude); if (magnitude) regulator_haptic_set_voltage(haptic, magnitude); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index d216a8f7bc22..33bb074d6941 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -347,7 +347,7 @@ static void __cache_size_refresh(void) BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); BUG_ON(dm_bufio_client_count < 0); - dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size); + dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size); /* * Use default if set to 0 and report the actual cache size used. 
@@ -960,7 +960,7 @@ static void __get_memory_limit(struct dm_bufio_client *c, { unsigned long buffers; - if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { + if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { if (mutex_trylock(&dm_bufio_clients_lock)) { __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); @@ -1600,7 +1600,7 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); } @@ -1647,7 +1647,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); - return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]); + return READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]); } /* @@ -1818,7 +1818,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); static unsigned get_max_age_hz(void) { - unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); + unsigned max_age = READ_ONCE(dm_bufio_max_age); if (max_age > UINT_MAX / HZ) max_age = UINT_MAX / HZ; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index cf2c67e35eaf..eb45cc3df31d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -107,7 +107,7 @@ static void io_job_start(struct dm_kcopyd_throttle *t) try_again: spin_lock_irq(&throttle_spinlock); - throttle = ACCESS_ONCE(t->throttle); + throttle = READ_ONCE(t->throttle); if (likely(throttle >= 100)) goto skip_limit; @@ -157,7 +157,7 @@ static void io_job_finish(struct dm_kcopyd_throttle *t) t->num_io_jobs--; - if (likely(ACCESS_ONCE(t->throttle) >= 100)) + if (likely(READ_ONCE(t->throttle) >= 100)) goto skip_limit; if (!t->num_io_jobs) { diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 6028d8247f58..a1a5eec783cc 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -431,7 +431,7 @@ do_sync_free: synchronize_rcu_expedited(); dm_stat_free(&s->rcu_head); } else { - ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; + WRITE_ONCE(dm_stat_need_rcu_barrier, 1); call_rcu(&s->rcu_head, dm_stat_free); } return 0; @@ -639,12 +639,12 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, */ last = raw_cpu_ptr(stats->last); stats_aux->merged = - (bi_sector == (ACCESS_ONCE(last->last_sector) && + (bi_sector == (READ_ONCE(last->last_sector) && ((bi_rw == WRITE) == - (ACCESS_ONCE(last->last_rw) == WRITE)) + (READ_ONCE(last->last_rw) == WRITE)) )); - ACCESS_ONCE(last->last_sector) = end_sector; - ACCESS_ONCE(last->last_rw) = bi_rw; + WRITE_ONCE(last->last_sector, end_sector); + WRITE_ONCE(last->last_rw, bi_rw); } rcu_read_lock(); @@ -693,22 +693,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared for_each_possible_cpu(cpu) { p = &s->stat_percpu[cpu][x]; - shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); - shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); - shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); - shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); - shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); - shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); - shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); - shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); - 
shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); - shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); - shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); - shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); + shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]); + shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]); + shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]); + shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]); + shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]); + shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]); + shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]); + shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]); + shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]); + shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]); + shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total); + shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue); if (s->n_histogram_entries) { unsigned i; for (i = 0; i < s->n_histogram_entries + 1; i++) - shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); + shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]); } } } diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 4c8de1ff78ca..8d0ba879777e 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -144,7 +144,7 @@ static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long switch_get_position(sctx, region_nr, ®ion_index, &bit); - return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + return (READ_ONCE(sctx->region_table[region_index]) >> bit) & ((1 << sctx->region_table_entry_bits) - 1); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 1e25705209c2..89e5dff9b4cf 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2431,7 +2431,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) struct pool_c *pt = pool->ti->private; bool needs_check = dm_pool_metadata_needs_check(pool->pmd); enum pool_mode old_mode = get_pool_mode(pool); - unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ; + unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ; /* * Never allow the pool to transition to PM_WRITE mode if user diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index bda3caca23ca..fba93237a780 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -589,7 +589,7 @@ static void verity_prefetch_io(struct work_struct *work) verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { - unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); + unsigned cluster = READ_ONCE(dm_verity_prefetch_cluster); cluster >>= v->data_dev_block_bits; if (unlikely(!cluster)) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4be85324f44d..8aaffa19b29a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -114,7 +114,7 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; static int __dm_get_module_param_int(int *module_param, int min, int max) { - int param = ACCESS_ONCE(*module_param); + int param = READ_ONCE(*module_param); int modified_param = 0; bool modified = true; @@ -136,7 +136,7 @@ static int __dm_get_module_param_int(int *module_param, int min, int max) unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { - unsigned param = ACCESS_ONCE(*module_param); + unsigned param = 
READ_ONCE(*module_param); unsigned modified_param = 0; if (!param) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0ff1bbf6c90e..447ddcbc9566 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2651,7 +2651,7 @@ state_show(struct md_rdev *rdev, char *page) { char *sep = ","; size_t len = 0; - unsigned long flags = ACCESS_ONCE(rdev->flags); + unsigned long flags = READ_ONCE(rdev->flags); if (test_bit(Faulty, &flags) || (!test_bit(ExternalBbl, &flags) && diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 928e24a07133..7d9a50eed9db 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6072,7 +6072,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n */ rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; diff --git a/drivers/misc/mic/scif/scif_rb.c b/drivers/misc/mic/scif/scif_rb.c index 637cc4686742..b665757ca89a 100644 --- a/drivers/misc/mic/scif/scif_rb.c +++ b/drivers/misc/mic/scif/scif_rb.c @@ -138,7 +138,7 @@ void scif_rb_commit(struct scif_rb *rb) * the read barrier in scif_rb_count(..) */ wmb(); - ACCESS_ONCE(*rb->write_ptr) = rb->current_write_offset; + WRITE_ONCE(*rb->write_ptr, rb->current_write_offset); #ifdef CONFIG_INTEL_MIC_CARD /* * X100 Si bug: For the case where a Core is performing an EXT_WR @@ -147,7 +147,7 @@ void scif_rb_commit(struct scif_rb *rb) * This way, if ordering is violated for the Interrupt Message, it will * fall just behind the first Posted associated with the first EXT_WR. */ - ACCESS_ONCE(*rb->write_ptr) = rb->current_write_offset; + WRITE_ONCE(*rb->write_ptr, rb->current_write_offset); #endif } @@ -210,7 +210,7 @@ void scif_rb_update_read_ptr(struct scif_rb *rb) * scif_rb_space(..) */ mb(); - ACCESS_ONCE(*rb->read_ptr) = new_offset; + WRITE_ONCE(*rb->read_ptr, new_offset); #ifdef CONFIG_INTEL_MIC_CARD /* * X100 Si Bug: For the case where a Core is performing an EXT_WR @@ -219,7 +219,7 @@ void scif_rb_update_read_ptr(struct scif_rb *rb) * This way, if ordering is violated for the Interrupt Message, it will * fall just behind the first Posted associated with the first EXT_WR. */ - ACCESS_ONCE(*rb->read_ptr) = new_offset; + WRITE_ONCE(*rb->read_ptr, new_offset); #endif } diff --git a/drivers/misc/mic/scif/scif_rma_list.c b/drivers/misc/mic/scif/scif_rma_list.c index e1ef8daedd5a..a036dbb4101e 100644 --- a/drivers/misc/mic/scif/scif_rma_list.c +++ b/drivers/misc/mic/scif/scif_rma_list.c @@ -277,7 +277,7 @@ retry: * Need to restart list traversal if there has been * an asynchronous list entry deletion. */ - if (ACCESS_ONCE(ep->rma_info.async_list_del)) + if (READ_ONCE(ep->rma_info.async_list_del)) goto retry; } mutex_unlock(&ep->rma_info.rma_lock); diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index c02cc817a490..1ed9529e7bd1 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1378,7 +1378,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) unsigned int count; slaves = rcu_dereference(bond->slave_arr); - count = slaves ? ACCESS_ONCE(slaves->count) : 0; + count = slaves ? 
READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[hash_index % count]; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c99dc59d729b..af51b90cecbb 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1167,7 +1167,7 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) slave = bond_slave_get_rcu(skb->dev); bond = slave->bond; - recv_probe = ACCESS_ONCE(bond->recv_probe); + recv_probe = READ_ONCE(bond->recv_probe); if (recv_probe) { ret = recv_probe(skb, bond, slave); if (ret == RX_HANDLER_CONSUMED) { @@ -3810,7 +3810,7 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev else bond_xmit_slave_id(bond, skb, 0); } else { - int slave_cnt = ACCESS_ONCE(bond->slave_cnt); + int slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { slave_id = bond_rr_gen_slave_id(bond); @@ -3972,7 +3972,7 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev) unsigned int count; slaves = rcu_dereference(bond->slave_arr); - count = slaves ? ACCESS_ONCE(slaves->count) : 0; + count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) { slave = slaves->arr[bond_xmit_hash(bond, skb) % count]; bond_dev_queue_xmit(bond, skb, slave->dev); diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 4ef68f69b58c..43f52a8fe708 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -405,7 +405,7 @@ void free_tx_desc(struct adapter *adap, struct sge_txq *q, */ static inline int reclaimable(const struct sge_txq *q) { - int hw_cidx = ntohs(ACCESS_ONCE(q->stat->cidx)); + int hw_cidx = ntohs(READ_ONCE(q->stat->cidx)); hw_cidx -= q->cidx; return hw_cidx < 0 ? 
hw_cidx + q->size : hw_cidx; } @@ -1375,7 +1375,7 @@ out_free: dev_kfree_skb_any(skb); */ static inline void reclaim_completed_tx_imm(struct sge_txq *q) { - int hw_cidx = ntohs(ACCESS_ONCE(q->stat->cidx)); + int hw_cidx = ntohs(READ_ONCE(q->stat->cidx)); int reclaim = hw_cidx - q->cidx; if (reclaim < 0) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 0e3d9f39a807..c6e859a27ee6 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -605,7 +605,7 @@ static void accumulate_16bit_val(u32 *acc, u16 val) if (wrapped) newacc += 65536; - ACCESS_ONCE(*acc) = newacc; + WRITE_ONCE(*acc, newacc); } static void populate_erx_stats(struct be_adapter *adapter, diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index 0cec06bec63e..340e28211135 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -373,7 +373,7 @@ static int hip04_tx_reclaim(struct net_device *ndev, bool force) unsigned int count; smp_rmb(); - count = tx_count(ACCESS_ONCE(priv->tx_head), tx_tail); + count = tx_count(READ_ONCE(priv->tx_head), tx_tail); if (count == 0) goto out; @@ -431,7 +431,7 @@ static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev) dma_addr_t phys; smp_rmb(); - count = tx_count(tx_head, ACCESS_ONCE(priv->tx_tail)); + count = tx_count(tx_head, READ_ONCE(priv->tx_tail)); if (count == (TX_DESC_NUM - 1)) { netif_stop_queue(ndev); return NETDEV_TX_BUSY; diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 8f326f87a815..2cb9539c931e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -264,7 +264,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) vsi->rx_buf_failed, vsi->rx_page_failed); rcu_read_lock(); for (i = 0; i < vsi->num_queue_pairs; i++) { - struct i40e_ring *rx_ring = ACCESS_ONCE(vsi->rx_rings[i]); + struct i40e_ring *rx_ring = READ_ONCE(vsi->rx_rings[i]); if (!rx_ring) continue; @@ -320,7 +320,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) ITR_IS_DYNAMIC(rx_ring->rx_itr_setting) ? 
"dynamic" : "fixed"); } for (i = 0; i < vsi->num_queue_pairs; i++) { - struct i40e_ring *tx_ring = ACCESS_ONCE(vsi->tx_rings[i]); + struct i40e_ring *tx_ring = READ_ONCE(vsi->tx_rings[i]); if (!tx_ring) continue; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 05e89864f781..e9e04a485e0a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1570,7 +1570,7 @@ static void i40e_get_ethtool_stats(struct net_device *netdev, } rcu_read_lock(); for (j = 0; j < vsi->num_queue_pairs; j++) { - tx_ring = ACCESS_ONCE(vsi->tx_rings[j]); + tx_ring = READ_ONCE(vsi->tx_rings[j]); if (!tx_ring) continue; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6498da8806cb..de1fcac7834d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -455,7 +455,7 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev, u64 bytes, packets; unsigned int start; - tx_ring = ACCESS_ONCE(vsi->tx_rings[i]); + tx_ring = READ_ONCE(vsi->tx_rings[i]); if (!tx_ring) continue; i40e_get_netdev_stats_struct_tx(tx_ring, stats); @@ -791,7 +791,7 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) rcu_read_lock(); for (q = 0; q < vsi->num_queue_pairs; q++) { /* locate Tx ring */ - p = ACCESS_ONCE(vsi->tx_rings[q]); + p = READ_ONCE(vsi->tx_rings[q]); do { start = u64_stats_fetch_begin_irq(&p->syncp); diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index d8456c381c99..97381238eb7c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -130,7 +130,7 @@ static int i40e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) } smp_mb(); /* Force any pending update before accessing. */ - adj = ACCESS_ONCE(pf->ptp_base_adj); + adj = READ_ONCE(pf->ptp_base_adj); freq = adj; freq *= ppb; @@ -499,7 +499,7 @@ void i40e_ptp_set_increment(struct i40e_pf *pf) wr32(hw, I40E_PRTTSYN_INC_H, incval >> 32); /* Update the base adjustement value. */ - ACCESS_ONCE(pf->ptp_base_adj) = incval; + WRITE_ONCE(pf->ptp_base_adj, incval); smp_mb(); /* Force the above update. 
*/ } diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h index 58adbf234e07..31a3f09df9f7 100644 --- a/drivers/net/ethernet/intel/igb/e1000_regs.h +++ b/drivers/net/ethernet/intel/igb/e1000_regs.h @@ -375,7 +375,7 @@ u32 igb_rd32(struct e1000_hw *hw, u32 reg); /* write operations, indexed using DWORDS */ #define wr32(reg, val) \ do { \ - u8 __iomem *hw_addr = ACCESS_ONCE((hw)->hw_addr); \ + u8 __iomem *hw_addr = READ_ONCE((hw)->hw_addr); \ if (!E1000_REMOVED(hw_addr)) \ writel((val), &hw_addr[(reg)]); \ } while (0) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index fd4a46b03cc8..6bccc2be2b91 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -750,7 +750,7 @@ static void igb_cache_ring_register(struct igb_adapter *adapter) u32 igb_rd32(struct e1000_hw *hw, u32 reg) { struct igb_adapter *igb = container_of(hw, struct igb_adapter, hw); - u8 __iomem *hw_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *hw_addr = READ_ONCE(hw->hw_addr); u32 value = 0; if (E1000_REMOVED(hw_addr)) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h index e083732adf64..a01409e2e06c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h @@ -161,7 +161,7 @@ static inline bool ixgbe_removed(void __iomem *addr) static inline void ixgbe_write_reg(struct ixgbe_hw *hw, u32 reg, u32 value) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); if (ixgbe_removed(reg_addr)) return; @@ -180,7 +180,7 @@ static inline void writeq(u64 val, void __iomem *addr) static inline void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); if (ixgbe_removed(reg_addr)) return; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4d76afd13868..2224e691ee07 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -380,7 +380,7 @@ static void ixgbe_check_remove(struct ixgbe_hw *hw, u32 reg) */ u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); u32 value; if (ixgbe_removed(reg_addr)) @@ -8630,7 +8630,7 @@ static void ixgbe_get_stats64(struct net_device *netdev, rcu_read_lock(); for (i = 0; i < adapter->num_rx_queues; i++) { - struct ixgbe_ring *ring = ACCESS_ONCE(adapter->rx_ring[i]); + struct ixgbe_ring *ring = READ_ONCE(adapter->rx_ring[i]); u64 bytes, packets; unsigned int start; @@ -8646,12 +8646,12 @@ static void ixgbe_get_stats64(struct net_device *netdev, } for (i = 0; i < adapter->num_tx_queues; i++) { - struct ixgbe_ring *ring = ACCESS_ONCE(adapter->tx_ring[i]); + struct ixgbe_ring *ring = READ_ONCE(adapter->tx_ring[i]); ixgbe_get_ring_stats64(stats, ring); } for (i = 0; i < adapter->num_xdp_queues; i++) { - struct ixgbe_ring *ring = ACCESS_ONCE(adapter->xdp_ring[i]); + struct ixgbe_ring *ring = READ_ONCE(adapter->xdp_ring[i]); ixgbe_get_ring_stats64(stats, ring); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index 86d6924a2b71..ae312c45696a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -378,7 +378,7 @@ static int ixgbe_ptp_adjfreq_82599(struct ptp_clock_info *ptp, s32 ppb) } smp_mb(); - incval = ACCESS_ONCE(adapter->base_incval); + incval = READ_ONCE(adapter->base_incval); freq = incval; freq *= ppb; @@ -1159,7 +1159,7 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter) } /* update the base incval used to calculate frequency adjustment */ - ACCESS_ONCE(adapter->base_incval) = incval; + WRITE_ONCE(adapter->base_incval, incval); smp_mb(); /* need lock to prevent incorrect read while modifying cyclecounter */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 032f8ac06357..cacb30682434 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -164,7 +164,7 @@ static void ixgbevf_check_remove(struct ixgbe_hw *hw, u32 reg) u32 ixgbevf_read_reg(struct ixgbe_hw *hw, u32 reg) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); u32 value; if (IXGBE_REMOVED(reg_addr)) diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.h b/drivers/net/ethernet/intel/ixgbevf/vf.h index 04d8d4ee4f04..c651fefcc3d2 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.h +++ b/drivers/net/ethernet/intel/ixgbevf/vf.h @@ -182,7 +182,7 @@ struct ixgbevf_info { static inline void ixgbe_write_reg(struct ixgbe_hw *hw, u32 reg, u32 value) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); if (IXGBE_REMOVED(reg_addr)) return; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 8a32a8f7f9c0..3541a7f9d12e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -414,8 +414,8 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, index = cons_index & size_mask; cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; - last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb); - ring_cons = ACCESS_ONCE(ring->cons); + last_nr_txbb = READ_ONCE(ring->last_nr_txbb); + ring_cons = READ_ONCE(ring->cons); ring_index = ring_cons & size_mask; stamp_index = ring_index; @@ -479,8 +479,8 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, wmb(); /* we want to dirty this cache line once */ - ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb; - ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped; + WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb); + WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped); if (cq->type == TX_XDP) return done < budget; @@ -858,7 +858,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_drop; /* fetch ring->cons far ahead before needing it to avoid stall */ - ring_cons = ACCESS_ONCE(ring->cons); + ring_cons = READ_ONCE(ring->cons); real_size = get_real_size(skb, shinfo, dev, &lso_header_size, &inline_ok, &fragptr); @@ -1066,7 +1066,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) */ smp_rmb(); - ring_cons = ACCESS_ONCE(ring->cons); + ring_cons = READ_ONCE(ring->cons); if (unlikely(!mlx4_en_is_tx_ring_full(ring))) { netif_tx_wake_queue(ring->tx_queue); ring->wake_queue++; diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c index 50ea69d88480..5dd5f61e1114 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-main.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c @@ -2629,7 +2629,7 @@ static void 
vxge_poll_vp_lockup(unsigned long data) ring = &vdev->vpaths[i].ring; /* Truncated to machine word size number of frames */ - rx_frms = ACCESS_ONCE(ring->stats.rx_frms); + rx_frms = READ_ONCE(ring->stats.rx_frms); /* Did this vpath received any packets */ if (ring->stats.prev_rx_frms == rx_frms) { diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 13f72f5b18d2..a95a46bcd339 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -2073,7 +2073,7 @@ static irqreturn_t efx_ef10_msi_interrupt(int irq, void *dev_id) netif_vdbg(efx, intr, efx->net_dev, "IRQ %d on CPU %d\n", irq, raw_smp_processor_id()); - if (likely(ACCESS_ONCE(efx->irq_soft_enabled))) { + if (likely(READ_ONCE(efx->irq_soft_enabled))) { /* Note test interrupts */ if (context->index == efx->irq_level) efx->last_irq_cpu = raw_smp_processor_id(); @@ -2088,7 +2088,7 @@ static irqreturn_t efx_ef10_msi_interrupt(int irq, void *dev_id) static irqreturn_t efx_ef10_legacy_interrupt(int irq, void *dev_id) { struct efx_nic *efx = dev_id; - bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled); + bool soft_enabled = READ_ONCE(efx->irq_soft_enabled); struct efx_channel *channel; efx_dword_t reg; u32 queues; @@ -3291,7 +3291,7 @@ static int efx_ef10_handle_rx_event(struct efx_channel *channel, bool rx_cont; u16 flags = 0; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return 0; /* Basic packet information */ @@ -3428,7 +3428,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) unsigned int tx_ev_q_label; int tx_descs = 0; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return 0; if (unlikely(EFX_QWORD_FIELD(*event, ESF_DZ_TX_DROP_EVENT))) @@ -5316,7 +5316,7 @@ static void efx_ef10_filter_remove_old(struct efx_nic *efx) int i; for (i = 0; i < HUNT_FILTER_TBL_ROWS; i++) { - if (ACCESS_ONCE(table->entry[i].spec) & + if (READ_ONCE(table->entry[i].spec) & EFX_EF10_FILTER_FLAG_AUTO_OLD) { rc = efx_ef10_filter_remove_internal(efx, 1U << EFX_FILTER_PRI_AUTO, i, true); diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index b9cb697b2818..016616a63880 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2809,7 +2809,7 @@ static void efx_reset_work(struct work_struct *data) unsigned long pending; enum reset_type method; - pending = ACCESS_ONCE(efx->reset_pending); + pending = READ_ONCE(efx->reset_pending); method = fls(pending) - 1; if (method == RESET_TYPE_MC_BIST) @@ -2874,7 +2874,7 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type) /* If we're not READY then just leave the flags set as the cue * to abort probing or reschedule the reset later. 
*/ - if (ACCESS_ONCE(efx->state) != STATE_READY) + if (READ_ONCE(efx->state) != STATE_READY) return; /* efx_process_channel() will no longer read events once a diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index 29614da91cbf..7263275fde4a 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -2545,7 +2545,7 @@ static void ef4_reset_work(struct work_struct *data) unsigned long pending; enum reset_type method; - pending = ACCESS_ONCE(efx->reset_pending); + pending = READ_ONCE(efx->reset_pending); method = fls(pending) - 1; if ((method == RESET_TYPE_RECOVER_OR_DISABLE || @@ -2605,7 +2605,7 @@ void ef4_schedule_reset(struct ef4_nic *efx, enum reset_type type) /* If we're not READY then just leave the flags set as the cue * to abort probing or reschedule the reset later. */ - if (ACCESS_ONCE(efx->state) != STATE_READY) + if (READ_ONCE(efx->state) != STATE_READY) return; queue_work(reset_workqueue, &efx->reset_work); diff --git a/drivers/net/ethernet/sfc/falcon/falcon.c b/drivers/net/ethernet/sfc/falcon/falcon.c index 93c713c1f627..cd8bb472d758 100644 --- a/drivers/net/ethernet/sfc/falcon/falcon.c +++ b/drivers/net/ethernet/sfc/falcon/falcon.c @@ -452,7 +452,7 @@ static irqreturn_t falcon_legacy_interrupt_a1(int irq, void *dev_id) "IRQ %d on CPU %d status " EF4_OWORD_FMT "\n", irq, raw_smp_processor_id(), EF4_OWORD_VAL(*int_ker)); - if (!likely(ACCESS_ONCE(efx->irq_soft_enabled))) + if (!likely(READ_ONCE(efx->irq_soft_enabled))) return IRQ_HANDLED; /* Check to see if we have a serious error condition */ @@ -1372,7 +1372,7 @@ static void falcon_reconfigure_mac_wrapper(struct ef4_nic *efx) ef4_oword_t reg; int link_speed, isolate; - isolate = !!ACCESS_ONCE(efx->reset_pending); + isolate = !!READ_ONCE(efx->reset_pending); switch (link_state->speed) { case 10000: link_speed = 3; break; diff --git a/drivers/net/ethernet/sfc/falcon/farch.c b/drivers/net/ethernet/sfc/falcon/farch.c index 05916c710d8c..494884f6af4a 100644 --- a/drivers/net/ethernet/sfc/falcon/farch.c +++ b/drivers/net/ethernet/sfc/falcon/farch.c @@ -834,7 +834,7 @@ ef4_farch_handle_tx_event(struct ef4_channel *channel, ef4_qword_t *event) struct ef4_nic *efx = channel->efx; int tx_packets = 0; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return 0; if (likely(EF4_QWORD_FIELD(*event, FSF_AZ_TX_EV_COMP))) { @@ -990,7 +990,7 @@ ef4_farch_handle_rx_event(struct ef4_channel *channel, const ef4_qword_t *event) struct ef4_rx_queue *rx_queue; struct ef4_nic *efx = channel->efx; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return; rx_ev_cont = EF4_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT); @@ -1504,7 +1504,7 @@ irqreturn_t ef4_farch_fatal_interrupt(struct ef4_nic *efx) irqreturn_t ef4_farch_legacy_interrupt(int irq, void *dev_id) { struct ef4_nic *efx = dev_id; - bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled); + bool soft_enabled = READ_ONCE(efx->irq_soft_enabled); ef4_oword_t *int_ker = efx->irq_status.addr; irqreturn_t result = IRQ_NONE; struct ef4_channel *channel; @@ -1596,7 +1596,7 @@ irqreturn_t ef4_farch_msi_interrupt(int irq, void *dev_id) "IRQ %d on CPU %d status " EF4_OWORD_FMT "\n", irq, raw_smp_processor_id(), EF4_OWORD_VAL(*int_ker)); - if (!likely(ACCESS_ONCE(efx->irq_soft_enabled))) + if (!likely(READ_ONCE(efx->irq_soft_enabled))) return IRQ_HANDLED; /* Handle non-event-queue sources */ diff --git 
a/drivers/net/ethernet/sfc/falcon/nic.h b/drivers/net/ethernet/sfc/falcon/nic.h index a4c4592f6023..54ca457cdb15 100644 --- a/drivers/net/ethernet/sfc/falcon/nic.h +++ b/drivers/net/ethernet/sfc/falcon/nic.h @@ -83,7 +83,7 @@ static inline struct ef4_tx_queue *ef4_tx_queue_partner(struct ef4_tx_queue *tx_ static inline bool __ef4_nic_tx_is_empty(struct ef4_tx_queue *tx_queue, unsigned int write_count) { - unsigned int empty_read_count = ACCESS_ONCE(tx_queue->empty_read_count); + unsigned int empty_read_count = READ_ONCE(tx_queue->empty_read_count); if (empty_read_count == 0) return false; @@ -464,11 +464,11 @@ irqreturn_t ef4_farch_fatal_interrupt(struct ef4_nic *efx); static inline int ef4_nic_event_test_irq_cpu(struct ef4_channel *channel) { - return ACCESS_ONCE(channel->event_test_cpu); + return READ_ONCE(channel->event_test_cpu); } static inline int ef4_nic_irq_test_irq_cpu(struct ef4_nic *efx) { - return ACCESS_ONCE(efx->last_irq_cpu); + return READ_ONCE(efx->last_irq_cpu); } /* Global Resources */ diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index 6a75f4140a4b..6486814e97dc 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -134,8 +134,8 @@ static void ef4_tx_maybe_stop_queue(struct ef4_tx_queue *txq1) */ netif_tx_stop_queue(txq1->core_txq); smp_mb(); - txq1->old_read_count = ACCESS_ONCE(txq1->read_count); - txq2->old_read_count = ACCESS_ONCE(txq2->read_count); + txq1->old_read_count = READ_ONCE(txq1->read_count); + txq2->old_read_count = READ_ONCE(txq2->read_count); fill_level = max(txq1->insert_count - txq1->old_read_count, txq2->insert_count - txq2->old_read_count); @@ -524,7 +524,7 @@ void ef4_xmit_done(struct ef4_tx_queue *tx_queue, unsigned int index) /* Check whether the hardware queue is now empty */ if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) { - tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count); + tx_queue->old_write_count = READ_ONCE(tx_queue->write_count); if (tx_queue->read_count == tx_queue->old_write_count) { smp_mb(); tx_queue->empty_read_count = diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c index ba45150f53c7..86454d25a405 100644 --- a/drivers/net/ethernet/sfc/farch.c +++ b/drivers/net/ethernet/sfc/farch.c @@ -827,7 +827,7 @@ efx_farch_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) struct efx_nic *efx = channel->efx; int tx_packets = 0; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return 0; if (likely(EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_COMP))) { @@ -979,7 +979,7 @@ efx_farch_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event) struct efx_rx_queue *rx_queue; struct efx_nic *efx = channel->efx; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return; rx_ev_cont = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT); @@ -1520,7 +1520,7 @@ irqreturn_t efx_farch_fatal_interrupt(struct efx_nic *efx) irqreturn_t efx_farch_legacy_interrupt(int irq, void *dev_id) { struct efx_nic *efx = dev_id; - bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled); + bool soft_enabled = READ_ONCE(efx->irq_soft_enabled); efx_oword_t *int_ker = efx->irq_status.addr; irqreturn_t result = IRQ_NONE; struct efx_channel *channel; @@ -1612,7 +1612,7 @@ irqreturn_t efx_farch_msi_interrupt(int irq, void *dev_id) "IRQ %d on CPU %d status " EFX_OWORD_FMT "\n", irq, raw_smp_processor_id(), EFX_OWORD_VAL(*int_ker)); - 
if (!likely(ACCESS_ONCE(efx->irq_soft_enabled))) + if (!likely(READ_ONCE(efx->irq_soft_enabled))) return IRQ_HANDLED; /* Handle non-event-queue sources */ diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index 4d7fb8af880d..7b51b6371724 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -81,7 +81,7 @@ static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue) static inline bool __efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue, unsigned int write_count) { - unsigned int empty_read_count = ACCESS_ONCE(tx_queue->empty_read_count); + unsigned int empty_read_count = READ_ONCE(tx_queue->empty_read_count); if (empty_read_count == 0) return false; @@ -617,11 +617,11 @@ irqreturn_t efx_farch_fatal_interrupt(struct efx_nic *efx); static inline int efx_nic_event_test_irq_cpu(struct efx_channel *channel) { - return ACCESS_ONCE(channel->event_test_cpu); + return READ_ONCE(channel->event_test_cpu); } static inline int efx_nic_irq_test_irq_cpu(struct efx_nic *efx) { - return ACCESS_ONCE(efx->last_irq_cpu); + return READ_ONCE(efx->last_irq_cpu); } /* Global Resources */ diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 60cdb97f58e2..56c2db398def 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -658,7 +658,7 @@ static void efx_ptp_send_times(struct efx_nic *efx, /* Write host time for specified period or until MC is done */ while ((timespec64_compare(&now.ts_real, &limit) < 0) && - ACCESS_ONCE(*mc_running)) { + READ_ONCE(*mc_running)) { struct timespec64 update_time; unsigned int host_time; @@ -668,7 +668,7 @@ static void efx_ptp_send_times(struct efx_nic *efx, do { pps_get_ts(&now); } while ((timespec64_compare(&now.ts_real, &update_time) < 0) && - ACCESS_ONCE(*mc_running)); + READ_ONCE(*mc_running)); /* Synchronise NIC with single word of time only */ host_time = (now.ts_real.tv_sec << MC_NANOSECOND_BITS | @@ -832,14 +832,14 @@ static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings) ptp->start.dma_addr); /* Clear flag that signals MC ready */ - ACCESS_ONCE(*start) = 0; + WRITE_ONCE(*start, 0); rc = efx_mcdi_rpc_start(efx, MC_CMD_PTP, synch_buf, MC_CMD_PTP_IN_SYNCHRONIZE_LEN); EFX_WARN_ON_ONCE_PARANOID(rc); /* Wait for start from MCDI (or timeout) */ timeout = jiffies + msecs_to_jiffies(MAX_SYNCHRONISE_WAIT_MS); - while (!ACCESS_ONCE(*start) && (time_before(jiffies, timeout))) { + while (!READ_ONCE(*start) && (time_before(jiffies, timeout))) { udelay(20); /* Usually start MCDI execution quickly */ loops++; } @@ -849,7 +849,7 @@ static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings) if (!time_before(jiffies, timeout)) ++ptp->sync_timeouts; - if (ACCESS_ONCE(*start)) + if (READ_ONCE(*start)) efx_ptp_send_times(efx, &last_time); /* Collect results */ diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 32bf1fecf864..efb66ea21f27 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -136,8 +136,8 @@ static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1) */ netif_tx_stop_queue(txq1->core_txq); smp_mb(); - txq1->old_read_count = ACCESS_ONCE(txq1->read_count); - txq2->old_read_count = ACCESS_ONCE(txq2->read_count); + txq1->old_read_count = READ_ONCE(txq1->read_count); + txq2->old_read_count = READ_ONCE(txq2->read_count); fill_level = max(txq1->insert_count - txq1->old_read_count, txq2->insert_count - txq2->old_read_count); @@ -752,7 +752,7 @@ void 
efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) /* Check whether the hardware queue is now empty */ if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) { - tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count); + tx_queue->old_write_count = READ_ONCE(tx_queue->write_count); if (tx_queue->read_count == tx_queue->old_write_count) { smp_mb(); tx_queue->empty_read_count = diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 6a4e8e1bbd90..8ab0fb6892d5 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -6245,7 +6245,7 @@ static void niu_get_rx_stats(struct niu *np, pkts = dropped = errors = bytes = 0; - rx_rings = ACCESS_ONCE(np->rx_rings); + rx_rings = READ_ONCE(np->rx_rings); if (!rx_rings) goto no_rings; @@ -6276,7 +6276,7 @@ static void niu_get_tx_stats(struct niu *np, pkts = errors = bytes = 0; - tx_rings = ACCESS_ONCE(np->tx_rings); + tx_rings = READ_ONCE(np->tx_rings); if (!tx_rings) goto no_rings; diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 21b71ae947fd..b55b29b90b88 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -257,7 +257,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, * and validate that the result isn't NULL - in case we are * racing against queue removal. */ - int numvtaps = ACCESS_ONCE(tap->numvtaps); + int numvtaps = READ_ONCE(tap->numvtaps); __u32 rxq; if (!numvtaps) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e21bf90b819f..27cd50c5bc9e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -469,7 +469,7 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, u32 numqueues = 0; rcu_read_lock(); - numqueues = ACCESS_ONCE(tun->numqueues); + numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); if (txq) { @@ -864,7 +864,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); - numqueues = ACCESS_ONCE(tun->numqueues); + numqueues = READ_ONCE(tun->numqueues); /* Drop packet if interface is not attached */ if (txq >= numqueues) diff --git a/drivers/net/wireless/ath/ath5k/desc.c b/drivers/net/wireless/ath/ath5k/desc.c index bd8d4392d68b..80f75139495f 100644 --- a/drivers/net/wireless/ath/ath5k/desc.c +++ b/drivers/net/wireless/ath/ath5k/desc.c @@ -500,13 +500,13 @@ ath5k_hw_proc_4word_tx_status(struct ath5k_hw *ah, tx_status = &desc->ud.ds_tx5212.tx_stat; - txstat1 = ACCESS_ONCE(tx_status->tx_status_1); + txstat1 = READ_ONCE(tx_status->tx_status_1); /* No frame has been send or error */ if (unlikely(!(txstat1 & AR5K_DESC_TX_STATUS1_DONE))) return -EINPROGRESS; - txstat0 = ACCESS_ONCE(tx_status->tx_status_0); + txstat0 = READ_ONCE(tx_status->tx_status_0); /* * Get descriptor status @@ -700,14 +700,14 @@ ath5k_hw_proc_5212_rx_status(struct ath5k_hw *ah, u32 rxstat0, rxstat1; rx_status = &desc->ud.ds_rx.rx_stat; - rxstat1 = ACCESS_ONCE(rx_status->rx_status_1); + rxstat1 = READ_ONCE(rx_status->rx_status_1); /* No frame received / not ready */ if (unlikely(!(rxstat1 & AR5K_5212_RX_DESC_STATUS1_DONE))) return -EINPROGRESS; memset(rs, 0, sizeof(struct ath5k_rx_status)); - rxstat0 = ACCESS_ONCE(rx_status->rx_status_0); + rxstat0 = READ_ONCE(rx_status->rx_status_0); /* * Frame receive status diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 613caca7dc02..785a0f33b7e6 100644 --- 
a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -3628,7 +3628,7 @@ static void brcmf_sdio_dataworker(struct work_struct *work) bus->dpc_running = true; wmb(); - while (ACCESS_ONCE(bus->dpc_triggered)) { + while (READ_ONCE(bus->dpc_triggered)) { bus->dpc_triggered = false; brcmf_sdio_dpc(bus); bus->idlecount = 0; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 231878969332..0f45f34e39d3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -1118,7 +1118,7 @@ void iwl_mvm_set_hw_ctkill_state(struct iwl_mvm *mvm, bool state) static bool iwl_mvm_set_hw_rfkill_state(struct iwl_op_mode *op_mode, bool state) { struct iwl_mvm *mvm = IWL_OP_MODE_GET_MVM(op_mode); - bool calibrating = ACCESS_ONCE(mvm->calibrating); + bool calibrating = READ_ONCE(mvm->calibrating); if (state) set_bit(IWL_MVM_STATUS_HW_RFKILL, &mvm->status); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 6f2e2af23219..6e9d3289b9d0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -652,7 +652,7 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) return -1; } else if (info.control.vif->type == NL80211_IFTYPE_STATION && is_multicast_ether_addr(hdr->addr1)) { - u8 ap_sta_id = ACCESS_ONCE(mvmvif->ap_sta_id); + u8 ap_sta_id = READ_ONCE(mvmvif->ap_sta_id); if (ap_sta_id != IWL_MVM_INVALID_STA) sta_id = ap_sta_id; @@ -700,7 +700,7 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb, snap_ip_tcp = 8 + skb_transport_header(skb) - skb_network_header(skb) + tcp_hdrlen(skb); - dbg_max_amsdu_len = ACCESS_ONCE(mvm->max_amsdu_len); + dbg_max_amsdu_len = READ_ONCE(mvm->max_amsdu_len); if (!sta->max_amsdu_len || !ieee80211_is_data_qos(hdr->frame_control) || diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c index a06b6612b658..f25ce3a1ea50 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c @@ -1247,7 +1247,7 @@ restart: spin_lock(&rxq->lock); /* uCode's read index (stored in shared DRAM) indicates the last Rx * buffer that the driver may process (last buffer filled by ucode). 
*/ - r = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; + r = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; i = rxq->read; /* W/A 9000 device step A0 wrap-around bug */ diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 2e3e013ec95a..9ad3f4fe5894 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -2076,12 +2076,12 @@ static int iwl_trans_pcie_wait_txq_empty(struct iwl_trans *trans, int txq_idx) IWL_DEBUG_TX_QUEUES(trans, "Emptying queue %d...\n", txq_idx); txq = trans_pcie->txq[txq_idx]; - wr_ptr = ACCESS_ONCE(txq->write_ptr); + wr_ptr = READ_ONCE(txq->write_ptr); - while (txq->read_ptr != ACCESS_ONCE(txq->write_ptr) && + while (txq->read_ptr != READ_ONCE(txq->write_ptr) && !time_after(jiffies, now + msecs_to_jiffies(IWL_FLUSH_WAIT_MS))) { - u8 write_ptr = ACCESS_ONCE(txq->write_ptr); + u8 write_ptr = READ_ONCE(txq->write_ptr); if (WARN_ONCE(wr_ptr != write_ptr, "WR pointer moved while flushing %d -> %d\n", @@ -2553,7 +2553,7 @@ static u32 iwl_trans_pcie_dump_rbs(struct iwl_trans *trans, spin_lock(&rxq->lock); - r = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; + r = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; for (i = rxq->read, j = 0; i != r && j < allocated_rb_nums; @@ -2814,7 +2814,7 @@ static struct iwl_trans_dump_data /* Dump RBs is supported only for pre-9000 devices (1 queue) */ struct iwl_rxq *rxq = &trans_pcie->rxq[0]; /* RBs */ - num_rbs = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num)) + num_rbs = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; num_rbs = (num_rbs - rxq->read) & RX_QUEUE_MASK; len += num_rbs * (sizeof(*data) + diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 6467ffac9811..d2b3d6177a55 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1380,7 +1380,7 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, mac80211_hwsim_monitor_rx(hw, skb, channel); /* wmediumd mode check */ - _portid = ACCESS_ONCE(data->wmediumd); + _portid = READ_ONCE(data->wmediumd); if (_portid) return mac80211_hwsim_tx_frame_nl(hw, skb, _portid); @@ -1477,7 +1477,7 @@ static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw, struct ieee80211_channel *chan) { struct mac80211_hwsim_data *data = hw->priv; - u32 _pid = ACCESS_ONCE(data->wmediumd); + u32 _pid = READ_ONCE(data->wmediumd); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) { struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb); diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index f05cfc83c9c8..f946bf889015 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -996,7 +996,7 @@ static void qlt_free_session_done(struct work_struct *work) if (logout_started) { bool traced = false; - while (!ACCESS_ONCE(sess->logout_completed)) { + while (!READ_ONCE(sess->logout_completed)) { if (!traced) { ql_dbg(ql_dbg_tgt_mgt, vha, 0xf086, "%s: waiting for sess %p logout\n", diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 942d094269fb..9469695f5871 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -985,7 +985,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) mb = udev->mb_addr; tcmu_flush_dcache_range(mb, sizeof(*mb)); - while (udev->cmdr_last_cleaned != 
ACCESS_ONCE(mb->cmd_tail)) { + while (udev->cmdr_last_cleaned != READ_ONCE(mb->cmd_tail)) { struct tcmu_cmd_entry *entry = (void *) mb + CMDR_OFF + udev->cmdr_last_cleaned; struct tcmu_cmd *cmd; diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 3e865dbf878c..fbaa2a90d25d 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -483,7 +483,7 @@ static ssize_t wdm_read if (rv < 0) return -ERESTARTSYS; - cntr = ACCESS_ONCE(desc->length); + cntr = READ_ONCE(desc->length); if (cntr == 0) { desc->read = 0; retry: diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index e9326f31db8d..4ae667d8c238 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -150,7 +150,7 @@ static int usbfs_increase_memory_usage(u64 amount) { u64 lim; - lim = ACCESS_ONCE(usbfs_memory_mb); + lim = READ_ONCE(usbfs_memory_mb); lim <<= 20; atomic64_add(amount, &usbfs_memory_usage); diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c index d930bfda4010..58d59c5f8592 100644 --- a/drivers/usb/core/sysfs.c +++ b/drivers/usb/core/sysfs.c @@ -973,7 +973,7 @@ static ssize_t interface_show(struct device *dev, struct device_attribute *attr, char *string; intf = to_usb_interface(dev); - string = ACCESS_ONCE(intf->cur_altsetting->string); + string = READ_ONCE(intf->cur_altsetting->string); if (!string) return 0; return sprintf(buf, "%s\n", string); @@ -989,7 +989,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, intf = to_usb_interface(dev); udev = interface_to_usbdev(intf); - alt = ACCESS_ONCE(intf->cur_altsetting); + alt = READ_ONCE(intf->cur_altsetting); return sprintf(buf, "usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02X" "ic%02Xisc%02Xip%02Xin%02X\n", diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c index 1f9941145746..0b59fa50aa30 100644 --- a/drivers/usb/gadget/udc/gr_udc.c +++ b/drivers/usb/gadget/udc/gr_udc.c @@ -1261,7 +1261,7 @@ static int gr_handle_in_ep(struct gr_ep *ep) if (!req->last_desc) return 0; - if (ACCESS_ONCE(req->last_desc->ctrl) & GR_DESC_IN_CTRL_EN) + if (READ_ONCE(req->last_desc->ctrl) & GR_DESC_IN_CTRL_EN) return 0; /* Not put in hardware buffers yet */ if (gr_read32(&ep->regs->epstat) & (GR_EPSTAT_B1 | GR_EPSTAT_B0)) @@ -1290,7 +1290,7 @@ static int gr_handle_out_ep(struct gr_ep *ep) if (!req->curr_desc) return 0; - ctrl = ACCESS_ONCE(req->curr_desc->ctrl); + ctrl = READ_ONCE(req->curr_desc->ctrl); if (ctrl & GR_DESC_OUT_CTRL_EN) return 0; /* Not received yet */ diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 44924824fa41..c86f89babd57 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -785,7 +785,7 @@ static void io_watchdog_func(unsigned long _ohci) } /* find the last TD processed by the controller. */ - head = hc32_to_cpu(ohci, ACCESS_ONCE(ed->hwHeadP)) & TD_MASK; + head = hc32_to_cpu(ohci, READ_ONCE(ed->hwHeadP)) & TD_MASK; td_start = td; td_next = list_prepare_entry(td, &ed->td_list, td_list); list_for_each_entry_continue(td_next, &ed->td_list, td_list) { diff --git a/drivers/usb/host/uhci-hcd.h b/drivers/usb/host/uhci-hcd.h index 91b22b2ea3aa..09a2a259941b 100644 --- a/drivers/usb/host/uhci-hcd.h +++ b/drivers/usb/host/uhci-hcd.h @@ -186,7 +186,7 @@ struct uhci_qh { * We need a special accessor for the element pointer because it is * subject to asynchronous updates by the controller. 
*/ -#define qh_element(qh) ACCESS_ONCE((qh)->element) +#define qh_element(qh) READ_ONCE((qh)->element) #define LINK_TO_QH(uhci, qh) (UHCI_PTR_QH((uhci)) | \ cpu_to_hc32((uhci), (qh)->dma_handle)) @@ -274,7 +274,7 @@ struct uhci_td { * subject to asynchronous updates by the controller. */ #define td_status(uhci, td) hc32_to_cpu((uhci), \ - ACCESS_ONCE((td)->status)) + READ_ONCE((td)->status)) #define LINK_TO_TD(uhci, td) (cpu_to_hc32((uhci), (td)->dma_handle)) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index f5a86f651f38..2bc3705a99bd 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -665,7 +665,7 @@ static int vfio_dev_viable(struct device *dev, void *data) { struct vfio_group *group = data; struct vfio_device *device; - struct device_driver *drv = ACCESS_ONCE(dev->driver); + struct device_driver *drv = READ_ONCE(dev->driver); struct vfio_unbound_dev *unbound; int ret = -EINVAL; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 046f6d280af5..35e929f132e8 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -929,7 +929,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) continue; } - tpg = ACCESS_ONCE(vs_tpg[*target]); + tpg = READ_ONCE(vs_tpg[*target]); if (unlikely(!tpg)) { /* Target does not exist, fail the request */ vhost_scsi_send_bad_target(vs, vq, head, out); diff --git a/fs/aio.c b/fs/aio.c index 5a2487217072..e6de7715228c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -576,7 +576,7 @@ static int kiocb_cancel(struct aio_kiocb *kiocb) * actually has a cancel function, hence the cmpxchg() */ - cancel = ACCESS_ONCE(kiocb->ki_cancel); + cancel = READ_ONCE(kiocb->ki_cancel); do { if (!cancel || cancel == KIOCB_CANCELLED) return -EINVAL; diff --git a/fs/buffer.c b/fs/buffer.c index 170df856bdb9..32ce01f0f95f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1692,7 +1692,8 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits), + b_state); return page_buffers(page); } diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 8e704d12a1cf..0083bd4fcaa5 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -373,7 +373,7 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) struct fscrypt_info *prev; if (ci == NULL) - ci = ACCESS_ONCE(inode->i_crypt_info); + ci = READ_ONCE(inode->i_crypt_info); if (ci == NULL) return; diff --git a/fs/direct-io.c b/fs/direct-io.c index b53e66d9abd7..98fe1325da9d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1152,7 +1152,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; diff --git a/fs/exec.c b/fs/exec.c index 3e14ba25f678..1d6243d9f2b6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1911,7 +1911,7 @@ void set_dumpable(struct mm_struct *mm, int value) return; do { - old = ACCESS_ONCE(mm->flags); + old = READ_ONCE(mm->flags); new = (old & ~MMF_DUMPABLE_MASK) | value; } while (cmpxchg(&mm->flags, old, new) != old); } diff --git a/fs/fcntl.c b/fs/fcntl.c index 448a1119f0be..57bf2964bb83 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -724,7 
+724,7 @@ static void send_sigio_to_task(struct task_struct *p, * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ - int signum = ACCESS_ONCE(fown->signum); + int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; diff --git a/fs/fs_pin.c b/fs/fs_pin.c index e747b3d720ee..2d07f292b625 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -78,7 +78,7 @@ void mnt_pin_kill(struct mount *m) while (1) { struct hlist_node *p; rcu_read_lock(); - p = ACCESS_ONCE(m->mnt_pins.first); + p = READ_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; @@ -92,7 +92,7 @@ void group_pin_kill(struct hlist_head *p) while (1) { struct hlist_node *q; rcu_read_lock(); - q = ACCESS_ONCE(p->first); + q = READ_ONCE(p->first); if (!q) { rcu_read_unlock(); break; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 13c65dd2d37d..a42d89371748 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -33,7 +33,7 @@ static struct fuse_dev *fuse_get_dev(struct file *file) * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return ACCESS_ONCE(file->private_data); + return READ_ONCE(file->private_data); } static void fuse_request_init(struct fuse_req *req, struct page **pages, diff --git a/fs/inode.c b/fs/inode.c index d1e35b53bb23..fd401028a309 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2090,7 +2090,7 @@ void inode_set_flags(struct inode *inode, unsigned int flags, WARN_ON_ONCE(flags & ~mask); do { - old_flags = ACCESS_ONCE(inode->i_flags); + old_flags = READ_ONCE(inode->i_flags); new_flags = (old_flags & ~mask) | flags; } while (unlikely(cmpxchg(&inode->i_flags, old_flags, new_flags) != old_flags)); diff --git a/fs/namei.c b/fs/namei.c index c75ea03ca147..40a0f34bf990 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1209,7 +1209,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see * the components of that value change under us */ - while (managed = ACCESS_ONCE(path->dentry->d_flags), + while (managed = READ_ONCE(path->dentry->d_flags), managed &= DCACHE_MANAGED_DENTRY, unlikely(managed != 0)) { /* Allow the filesystem to manage the transit without i_mutex @@ -1394,7 +1394,7 @@ int follow_down(struct path *path) unsigned managed; int ret; - while (managed = ACCESS_ONCE(path->dentry->d_flags), + while (managed = READ_ONCE(path->dentry->d_flags), unlikely(managed & DCACHE_MANAGED_DENTRY)) { /* Allow the filesystem to manage the transit without i_mutex * being held. diff --git a/fs/namespace.c b/fs/namespace.c index d18deb4c410b..e158ec6b527b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -353,7 +353,7 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. 
*/ smp_mb(); - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5ceaeb1f6fb6..f439f1c45008 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1081,7 +1081,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) int error; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1168,7 +1168,7 @@ out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: if (flags & LOOKUP_RCU) { - if (parent != ACCESS_ONCE(dentry->d_parent)) + if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else dput(parent); @@ -1582,7 +1582,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1596,7 +1596,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) ret = -ECHILD; if (!(flags & LOOKUP_RCU)) dput(parent); - else if (parent != ACCESS_ONCE(dentry->d_parent)) + else if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; goto out; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 77a8eacbe032..375e8bf0dd24 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -453,7 +453,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; - rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); /* add up live thread stats at the group level */ if (whole) { diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 99dff222fe67..03afd5150916 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -27,7 +27,7 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); - event = ACCESS_ONCE(ns->event); + event = READ_ONCE(ns->event); if (m->poll_event != event) { m->poll_event = event; res |= POLLERR | POLLPRI; diff --git a/fs/splice.c b/fs/splice.c index f3084cce0ea6..39e2dc01ac12 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -253,7 +253,7 @@ EXPORT_SYMBOL(add_to_pipe); */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { - unsigned int buffers = ACCESS_ONCE(pipe->buffers); + unsigned int buffers = READ_ONCE(pipe->buffers); spd->nr_pages_max = buffers; if (buffers <= PIPE_DEF_BUFFERS) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1c713fd5b3e6..f46d133c0949 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -381,7 +381,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * in __get_user_pages if userfaultfd_release waits on the * caller of handle_userfault to release the mmap_sem. */ - if (unlikely(ACCESS_ONCE(ctx->released))) { + if (unlikely(READ_ONCE(ctx->released))) { /* * Don't return VM_FAULT_SIGBUS in this case, so a non * cooperative manager can close the uffd after the @@ -477,7 +477,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) vmf->flags, reason); up_read(&mm->mmap_sem); - if (likely(must_wait && !ACCESS_ONCE(ctx->released) && + if (likely(must_wait && !READ_ONCE(ctx->released) && (return_to_userland ? 
!signal_pending(current) : !fatal_signal_pending(current)))) { wake_up_poll(&ctx->fd_wqh, POLLIN); @@ -586,7 +586,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, set_current_state(TASK_KILLABLE); if (ewq->msg.event == 0) break; - if (ACCESS_ONCE(ctx->released) || + if (READ_ONCE(ctx->released) || fatal_signal_pending(current)) { /* * &ewq->wq may be queued in fork_event, but @@ -833,7 +833,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - ACCESS_ONCE(ctx->released) = true; + WRITE_ONCE(ctx->released, true); if (!mmget_not_zero(mm)) goto wakeup; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 51bf7b827387..129975970d99 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -592,9 +592,9 @@ xlog_valid_lsn( * a transiently forward state. Instead, we can see the LSN in a * transiently behind state if we happen to race with a cycle wrap. */ - cur_cycle = ACCESS_ONCE(log->l_curr_cycle); + cur_cycle = READ_ONCE(log->l_curr_cycle); smp_rmb(); - cur_block = ACCESS_ONCE(log->l_curr_block); + cur_block = READ_ONCE(log->l_curr_block); if ((CYCLE_LSN(lsn) > cur_cycle) || (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 8fbe259b197c..0a7ce668f8e0 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -236,7 +236,7 @@ static inline unsigned long __ffs64(u64 word) typeof(*ptr) old, new; \ \ do { \ - old = ACCESS_ONCE(*ptr); \ + old = READ_ONCE(*ptr); \ new = (old & ~mask) | bits; \ } while (cmpxchg(ptr, old, new) != old); \ \ @@ -251,7 +251,7 @@ static inline unsigned long __ffs64(u64 word) typeof(*ptr) old, new; \ \ do { \ - old = ACCESS_ONCE(*ptr); \ + old = READ_ONCE(*ptr); \ new = old & ~clear; \ } while (!(old & test) && \ cmpxchg(ptr, old, new) != old); \ diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index a4be70398ce1..36dd4ffb5715 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -88,7 +88,7 @@ static inline void dql_queued(struct dql *dql, unsigned int count) /* Returns how many objects can be queued, < 0 indicates over limit. */ static inline int dql_avail(const struct dql *dql) { - return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued); + return READ_ONCE(dql->adj_limit) - READ_ONCE(dql->num_queued); } /* Record number of completed objects and recalculate the limit. 
*/ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 14bc21c2ee7f..785a00ca4628 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -221,7 +221,7 @@ extern struct page *huge_zero_page; static inline bool is_huge_zero_page(struct page *page) { - return ACCESS_ONCE(huge_zero_page) == page; + return READ_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 30294603526f..d95cae09dea0 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -247,7 +247,7 @@ static inline struct team_port *team_get_port_by_index(struct team *team, static inline int team_num_to_port_index(struct team *team, unsigned int num) { - int en_port_count = ACCESS_ONCE(team->en_port_count); + int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; diff --git a/include/linux/llist.h b/include/linux/llist.h index 1957635e6d5f..85abc2915e8d 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -198,7 +198,7 @@ static inline void init_llist_head(struct llist_head *list) */ static inline bool llist_empty(const struct llist_head *head) { - return ACCESS_ONCE(head->first) == NULL; + return READ_ONCE(head->first) == NULL; } static inline struct llist_node *llist_next(struct llist_node *node) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 2efb08a60e63..f0fc4700b6ff 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -105,7 +105,7 @@ static inline bool pm_runtime_callbacks_present(struct device *dev) static inline void pm_runtime_mark_last_busy(struct device *dev) { - ACCESS_ONCE(dev->power.last_busy) = jiffies; + WRITE_ONCE(dev->power.last_busy, jiffies); } static inline bool pm_runtime_is_irq_safe(struct device *dev) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4f4f786255ef..3fadb6f9982b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -983,12 +983,12 @@ static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_threshold[1]); + return READ_ONCE(ipvs->sysctl_sync_threshold[1]); } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_refresh_period); + return READ_ONCE(ipvs->sysctl_sync_refresh_period); } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) @@ -1013,7 +1013,7 @@ static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_ports); + return READ_ONCE(ipvs->sysctl_sync_ports); } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) diff --git a/kernel/acct.c b/kernel/acct.c index 5e72af29ab73..21eedd0dd81a 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -146,7 +146,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) again: smp_rmb(); rcu_read_lock(); - res = to_acct(ACCESS_ONCE(ns->bacct)); + res = to_acct(READ_ONCE(ns->bacct)); if (!res) { rcu_read_unlock(); return NULL; @@ -158,7 +158,7 @@ again: } rcu_read_unlock(); mutex_lock(&res->lock); - if (res != to_acct(ACCESS_ONCE(ns->bacct))) { + if (res != to_acct(READ_ONCE(ns->bacct))) { mutex_unlock(&res->lock); acct_put(res); goto again; diff --git a/kernel/events/core.c b/kernel/events/core.c index 824a583079a1..8fd2f2d1358a 100644 --- a/kernel/events/core.c +++ 
b/kernel/events/core.c @@ -1200,7 +1200,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); - ctx = ACCESS_ONCE(event->ctx); + ctx = READ_ONCE(event->ctx); if (!atomic_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; @@ -5302,8 +5302,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto aux_unlock; - aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); - aux_size = ACCESS_ONCE(rb->user_page->aux_size); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f684d8e5fa2b..f3e37971c842 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * (B) <-> (C) ordering is still observed by the pmu driver. */ if (!rb->aux_overwrite) { - aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + aux_tail = READ_ONCE(rb->user_page->aux_tail); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); diff --git a/kernel/exit.c b/kernel/exit.c index f6cad39f35df..6b4298a41167 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1339,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ - int exit_state = ACCESS_ONCE(p->exit_state); + int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 81279c6602ff..845f3805c73d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2724,7 +2724,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, * if it happened, we have to fail the write. 
*/ barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { local_dec(&cpu_buffer->committing); local_dec(&cpu_buffer->commits); return NULL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 652c682707cd..9050c8b3ccde 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1459,7 +1459,7 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr, static inline void *event_file_data(struct file *filp) { - return ACCESS_ONCE(file_inode(filp)->i_private); + return READ_ONCE(file_inode(filp)->i_private); } extern struct mutex event_mutex; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 49cb41412eec..780262210c9a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -77,7 +77,7 @@ check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; - int frame_size = ACCESS_ONCE(tracer_frame); + int frame_size = READ_ONCE(tracer_frame); int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..d32b45662fb6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -894,7 +894,7 @@ static bool new_idmap_permitted(const struct file *file, int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; - unsigned long userns_flags = ACCESS_ONCE(ns->flags); + unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 155c55d8db5f..fe7953aead82 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -39,7 +39,7 @@ begin_node: /* Descend through a shortcut */ shortcut = assoc_array_ptr_to_shortcut(cursor); smp_read_barrier_depends(); - cursor = ACCESS_ONCE(shortcut->next_node); + cursor = READ_ONCE(shortcut->next_node); } node = assoc_array_ptr_to_node(cursor); @@ -55,7 +55,7 @@ begin_node: */ has_meta = 0; for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = ACCESS_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); has_meta |= (unsigned long)ptr; if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer @@ -89,7 +89,7 @@ continue_node: smp_read_barrier_depends(); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = ACCESS_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr)) { cursor = ptr; goto begin_node; @@ -98,7 +98,7 @@ continue_node: finished_node: /* Move up to the parent (may need to skip back over a shortcut) */ - parent = ACCESS_ONCE(node->back_pointer); + parent = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (parent == stop) return 0; @@ -107,7 +107,7 @@ finished_node: shortcut = assoc_array_ptr_to_shortcut(parent); smp_read_barrier_depends(); cursor = parent; - parent = ACCESS_ONCE(shortcut->back_pointer); + parent = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; if (parent == stop) return 0; @@ -147,7 +147,7 @@ int assoc_array_iterate(const struct assoc_array *array, void *iterator_data), void *iterator_data) { - struct assoc_array_ptr *root = ACCESS_ONCE(array->root); + struct assoc_array_ptr *root = READ_ONCE(array->root); if (!root) return 0; @@ -194,7 +194,7 @@ assoc_array_walk(const struct assoc_array *array, pr_devel("-->%s()\n", __func__); - cursor = ACCESS_ONCE(array->root); 
+ cursor = READ_ONCE(array->root); if (!cursor) return assoc_array_walk_tree_empty; @@ -220,7 +220,7 @@ consider_node: slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); slot &= ASSOC_ARRAY_FAN_MASK; - ptr = ACCESS_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); pr_devel("consider slot %x [ix=%d type=%lu]\n", slot, level, (unsigned long)ptr & 3); @@ -294,7 +294,7 @@ follow_shortcut: } while (sc_level < shortcut->skip_to_level); /* The shortcut matches the leaf's index to this point. */ - cursor = ACCESS_ONCE(shortcut->next_node); + cursor = READ_ONCE(shortcut->next_node); if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { level = sc_level; goto jumped; @@ -337,7 +337,7 @@ void *assoc_array_find(const struct assoc_array *array, * the terminal node. */ for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = ACCESS_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer * and dereferencing the pointer - but only if we are diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index f346715e2255..81770a55cb16 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -20,7 +20,7 @@ void dql_completed(struct dql *dql, unsigned int count) unsigned int ovlimit, completed, num_queued; bool all_prev_completed; - num_queued = ACCESS_ONCE(dql->num_queued); + num_queued = READ_ONCE(dql->num_queued); /* Can't complete more than what's in queue */ BUG_ON(count > num_queued - dql->num_completed); diff --git a/lib/llist.c b/lib/llist.c index ae5872b1df0c..7062e931a7bb 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -41,7 +41,7 @@ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_node *first; do { - new_last->next = first = ACCESS_ONCE(head->first); + new_last->next = first = READ_ONCE(head->first); } while (cmpxchg(&head->first, first, new_first) != first); return !first; diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 86c3385b9eb3..1746bae94d41 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -620,8 +620,8 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp rcu_read_lock(); for (i = 0; i < depth; i++, d = p) { - p = ACCESS_ONCE(d->d_parent); - array[i] = ACCESS_ONCE(d->d_name.name); + p = READ_ONCE(d->d_parent); + array[i] = READ_ONCE(d->d_name.name); if (p == d) { if (i) array[i] = ""; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 269b5df58543..c3bf907a03ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2715,7 +2715,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); - return ACCESS_ONCE(pgdata->split_queue_len); + return READ_ONCE(pgdata->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, diff --git a/net/core/dev.c b/net/core/dev.c index 11596a302a26..61559ca3980b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3725,7 +3725,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; - cpu = ACCESS_ONCE(rflow->cpu); + cpu = READ_ONCE(rflow->cpu); if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6e1e10ff433a..3b2034f6d49d 
100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3377,7 +3377,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { - unsigned int burst = ACCESS_ONCE(pkt_dev->burst); + unsigned int burst = READ_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; struct sk_buff *skb; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..f9597ba26599 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -164,7 +164,7 @@ static void inet_frag_worker(struct work_struct *work) local_bh_disable(); - for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + for (i = READ_ONCE(f->next_bucket); budget; --budget) { evicted += inet_evict_bucket(f, &f->hash[i]); i = (i + 1) & (INETFRAGS_HASHSZ - 1); if (evicted > INETFRAGS_EVICT_MAX) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3d9f1c2f81c5..c0864562083b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -495,7 +495,7 @@ u32 ip_idents_reserve(u32 hash, int segs) { u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); + u32 old = READ_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 new, delta = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc9e46a5369..48531da1aba6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1908,7 +1908,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + win_divisor = READ_ONCE(sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ebfbccae62fd..02ec9a349303 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1853,7 +1853,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - encap_rcv = ACCESS_ONCE(up->encap_rcv); + encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; @@ -2298,7 +2298,7 @@ void udp_destroy_sock(struct sock *sk) unlock_sock_fast(sk, slow); if (static_key_false(&udp_encap_needed) && up->encap_type) { void (*encap_destroy)(struct sock *sk); - encap_destroy = ACCESS_ONCE(up->encap_destroy); + encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a1c24443cd9e..dab946554157 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -490,7 +490,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (!t) goto out; - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; @@ -899,7 +899,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); if (t) { - u8 tproto = ACCESS_ONCE(t->parms.proto); + u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; @@ -1233,7 +1233,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if (tproto != IPPROTO_IPIP && tproto != 0) return -1; @@ -1303,7 +1303,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; 
int err; - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || ip6_tnl_addr_conflict(t, ipv6h)) return -1; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 40d7234c27b9..3f30fa313bf2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -606,7 +606,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - encap_rcv = ACCESS_ONCE(up->encap_rcv); + encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; @@ -1432,7 +1432,7 @@ void udpv6_destroy_sock(struct sock *sk) if (static_key_false(&udpv6_encap_needed) && up->encap_type) { void (*encap_destroy)(struct sock *sk); - encap_destroy = ACCESS_ONCE(up->encap_destroy); + encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index dd3e83328ad5..82cb93f66b9b 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -193,7 +193,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); - sap_handler = dest ? ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL; + sap_handler = dest ? READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); @@ -214,7 +214,7 @@ drop: kfree_skb(skb); goto out; handle_station: - sta_handler = ACCESS_ONCE(llc_station_handler); + sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 69615016d5bf..214d2ba02877 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2008,7 +2008,7 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { - u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate); + u16 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index d177dd066504..4d748975117d 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -393,7 +393,7 @@ EXPORT_SYMBOL(netlbl_calipso_ops_register); static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) { - return ACCESS_ONCE(calipso_ops); + return READ_ONCE(calipso_ops); } /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d396cb61a280..eb866647a27a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14201,7 +14201,7 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; - u32 nlportid = ACCESS_ONCE(wdev->ap_unexpected_nlportid); + u32 nlportid = READ_ONCE(wdev->ap_unexpected_nlportid); if (!nlportid) return false; diff --git a/sound/firewire/amdtp-am824.c b/sound/firewire/amdtp-am824.c index 23ccddb20de1..4210e5c6262e 100644 --- a/sound/firewire/amdtp-am824.c +++ b/sound/firewire/amdtp-am824.c @@ -247,7 +247,7 @@ void amdtp_am824_midi_trigger(struct amdtp_stream *s, unsigned int port, struct amdtp_am824 *p = s->protocol; if (port < p->midi_ports) - ACCESS_ONCE(p->midi[port]) = midi; + WRITE_ONCE(p->midi[port], midi); } EXPORT_SYMBOL_GPL(amdtp_am824_midi_trigger); @@ -336,7 +336,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, __be32 
*buffe unsigned int data_blocks, unsigned int *syt) { struct amdtp_am824 *p = s->protocol; - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); unsigned int pcm_frames; if (pcm) { @@ -357,7 +357,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, __be32 *buffe unsigned int data_blocks, unsigned int *syt) { struct amdtp_am824 *p = s->protocol; - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); unsigned int pcm_frames; if (pcm) { diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index 3fc581a5ad62..4a1dc145327b 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -376,7 +376,7 @@ static void update_pcm_pointers(struct amdtp_stream *s, ptr = s->pcm_buffer_pointer + frames; if (ptr >= pcm->runtime->buffer_size) ptr -= pcm->runtime->buffer_size; - ACCESS_ONCE(s->pcm_buffer_pointer) = ptr; + WRITE_ONCE(s->pcm_buffer_pointer, ptr); s->pcm_period_pointer += frames; if (s->pcm_period_pointer >= pcm->runtime->period_size) { @@ -388,7 +388,7 @@ static void update_pcm_pointers(struct amdtp_stream *s, static void pcm_period_tasklet(unsigned long data) { struct amdtp_stream *s = (void *)data; - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); if (pcm) snd_pcm_period_elapsed(pcm); @@ -453,7 +453,7 @@ static int handle_out_packet(struct amdtp_stream *s, s->data_block_counter = (s->data_block_counter + data_blocks) & 0xff; - buffer[0] = cpu_to_be32(ACCESS_ONCE(s->source_node_id_field) | + buffer[0] = cpu_to_be32(READ_ONCE(s->source_node_id_field) | (s->data_block_quadlets << CIP_DBS_SHIFT) | ((s->sph << CIP_SPH_SHIFT) & CIP_SPH_MASK) | s->data_block_counter); @@ -472,7 +472,7 @@ static int handle_out_packet(struct amdtp_stream *s, if (queue_out_packet(s, payload_length) < 0) return -EIO; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm && pcm_frames > 0) update_pcm_pointers(s, pcm, pcm_frames); @@ -504,7 +504,7 @@ static int handle_out_packet_without_header(struct amdtp_stream *s, if (queue_out_packet(s, payload_length) < 0) return -EIO; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm && pcm_frames > 0) update_pcm_pointers(s, pcm, pcm_frames); @@ -621,7 +621,7 @@ end: if (queue_in_packet(s) < 0) return -EIO; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm && pcm_frames > 0) update_pcm_pointers(s, pcm, pcm_frames); @@ -649,7 +649,7 @@ static int handle_in_packet_without_header(struct amdtp_stream *s, if (queue_in_packet(s) < 0) return -EIO; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm && pcm_frames > 0) update_pcm_pointers(s, pcm, pcm_frames); @@ -947,7 +947,7 @@ unsigned long amdtp_stream_pcm_pointer(struct amdtp_stream *s) if (!in_interrupt() && amdtp_stream_running(s)) fw_iso_context_flush_completions(s->context); - return ACCESS_ONCE(s->pcm_buffer_pointer); + return READ_ONCE(s->pcm_buffer_pointer); } EXPORT_SYMBOL(amdtp_stream_pcm_pointer); @@ -977,9 +977,8 @@ EXPORT_SYMBOL(amdtp_stream_pcm_ack); void amdtp_stream_update(struct amdtp_stream *s) { /* Precomputing. 
*/ - ACCESS_ONCE(s->source_node_id_field) = - (fw_parent_device(s->unit)->card->node_id << CIP_SID_SHIFT) & - CIP_SID_MASK; + WRITE_ONCE(s->source_node_id_field, + (fw_parent_device(s->unit)->card->node_id << CIP_SID_SHIFT) & CIP_SID_MASK); } EXPORT_SYMBOL(amdtp_stream_update); @@ -1022,7 +1021,7 @@ void amdtp_stream_pcm_abort(struct amdtp_stream *s) { struct snd_pcm_substream *pcm; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) snd_pcm_stop_xrun(pcm); } diff --git a/sound/firewire/amdtp-stream.h b/sound/firewire/amdtp-stream.h index ed6eafd10992..f9abd8b07ce6 100644 --- a/sound/firewire/amdtp-stream.h +++ b/sound/firewire/amdtp-stream.h @@ -220,7 +220,7 @@ static inline bool amdtp_stream_pcm_running(struct amdtp_stream *s) static inline void amdtp_stream_pcm_trigger(struct amdtp_stream *s, struct snd_pcm_substream *pcm) { - ACCESS_ONCE(s->pcm) = pcm; + WRITE_ONCE(s->pcm, pcm); } static inline bool cip_sfc_is_base_44100(enum cip_sfc sfc) diff --git a/sound/firewire/digi00x/amdtp-dot.c b/sound/firewire/digi00x/amdtp-dot.c index 1453c34ce99f..4a884a335248 100644 --- a/sound/firewire/digi00x/amdtp-dot.c +++ b/sound/firewire/digi00x/amdtp-dot.c @@ -327,7 +327,7 @@ void amdtp_dot_midi_trigger(struct amdtp_stream *s, unsigned int port, struct amdtp_dot *p = s->protocol; if (port < MAX_MIDI_PORTS) - ACCESS_ONCE(p->midi[port]) = midi; + WRITE_ONCE(p->midi[port], midi); } static unsigned int process_tx_data_blocks(struct amdtp_stream *s, @@ -338,7 +338,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, struct snd_pcm_substream *pcm; unsigned int pcm_frames; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) { read_pcm_s32(s, pcm, buffer, data_blocks); pcm_frames = data_blocks; @@ -359,7 +359,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, struct snd_pcm_substream *pcm; unsigned int pcm_frames; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) { write_pcm_s32(s, pcm, buffer, data_blocks); pcm_frames = data_blocks; diff --git a/sound/firewire/fireface/amdtp-ff.c b/sound/firewire/fireface/amdtp-ff.c index 780da9deb2f0..77c7598b61ab 100644 --- a/sound/firewire/fireface/amdtp-ff.c +++ b/sound/firewire/fireface/amdtp-ff.c @@ -108,7 +108,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, unsigned int data_blocks, unsigned int *syt) { - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); unsigned int pcm_frames; if (pcm) { @@ -127,7 +127,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, unsigned int data_blocks, unsigned int *syt) { - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); unsigned int pcm_frames; if (pcm) { diff --git a/sound/firewire/fireface/ff-midi.c b/sound/firewire/fireface/ff-midi.c index 949ee56b4e0e..6a49611ee462 100644 --- a/sound/firewire/fireface/ff-midi.c +++ b/sound/firewire/fireface/ff-midi.c @@ -22,7 +22,7 @@ static int midi_playback_open(struct snd_rawmidi_substream *substream) ff->running_status[substream->number] = 0; ff->rx_midi_error[substream->number] = false; - ACCESS_ONCE(ff->rx_midi_substreams[substream->number]) = substream; + WRITE_ONCE(ff->rx_midi_substreams[substream->number], substream); return 0; } @@ -38,7 +38,7 @@ static int midi_playback_close(struct snd_rawmidi_substream *substream) struct snd_ff *ff = substream->rmidi->private_data; cancel_work_sync(&ff->rx_midi_work[substream->number]); - 
ACCESS_ONCE(ff->rx_midi_substreams[substream->number]) = NULL; + WRITE_ONCE(ff->rx_midi_substreams[substream->number], NULL); return 0; } @@ -52,10 +52,10 @@ static void midi_capture_trigger(struct snd_rawmidi_substream *substream, spin_lock_irqsave(&ff->lock, flags); if (up) - ACCESS_ONCE(ff->tx_midi_substreams[substream->number]) = - substream; + WRITE_ONCE(ff->tx_midi_substreams[substream->number], + substream); else - ACCESS_ONCE(ff->tx_midi_substreams[substream->number]) = NULL; + WRITE_ONCE(ff->tx_midi_substreams[substream->number], NULL); spin_unlock_irqrestore(&ff->lock, flags); } diff --git a/sound/firewire/fireface/ff-transaction.c b/sound/firewire/fireface/ff-transaction.c index dd6c8e839647..332b29f8ed75 100644 --- a/sound/firewire/fireface/ff-transaction.c +++ b/sound/firewire/fireface/ff-transaction.c @@ -12,7 +12,7 @@ static void finish_transmit_midi_msg(struct snd_ff *ff, unsigned int port, int rcode) { struct snd_rawmidi_substream *substream = - ACCESS_ONCE(ff->rx_midi_substreams[port]); + READ_ONCE(ff->rx_midi_substreams[port]); if (rcode_is_permanent_error(rcode)) { ff->rx_midi_error[port] = true; @@ -60,7 +60,7 @@ static inline void fill_midi_buf(struct snd_ff *ff, unsigned int port, static void transmit_midi_msg(struct snd_ff *ff, unsigned int port) { struct snd_rawmidi_substream *substream = - ACCESS_ONCE(ff->rx_midi_substreams[port]); + READ_ONCE(ff->rx_midi_substreams[port]); u8 *buf = (u8 *)ff->msg_buf[port]; int i, len; @@ -159,7 +159,7 @@ static void handle_midi_msg(struct fw_card *card, struct fw_request *request, */ index = (quad >> 8) & 0xff; if (index > 0) { - substream = ACCESS_ONCE(ff->tx_midi_substreams[0]); + substream = READ_ONCE(ff->tx_midi_substreams[0]); if (substream != NULL) { byte = quad & 0xff; snd_rawmidi_receive(substream, &byte, 1); @@ -169,7 +169,7 @@ static void handle_midi_msg(struct fw_card *card, struct fw_request *request, /* Message in second port. 
*/ index = (quad >> 24) & 0xff; if (index > 0) { - substream = ACCESS_ONCE(ff->tx_midi_substreams[1]); + substream = READ_ONCE(ff->tx_midi_substreams[1]); if (substream != NULL) { byte = (quad >> 16) & 0xff; snd_rawmidi_receive(substream, &byte, 1); diff --git a/sound/firewire/isight.c b/sound/firewire/isight.c index 5826aa8362f1..46092fa3ff9b 100644 --- a/sound/firewire/isight.c +++ b/sound/firewire/isight.c @@ -96,7 +96,7 @@ static void isight_update_pointers(struct isight *isight, unsigned int count) ptr += count; if (ptr >= runtime->buffer_size) ptr -= runtime->buffer_size; - ACCESS_ONCE(isight->buffer_pointer) = ptr; + WRITE_ONCE(isight->buffer_pointer, ptr); isight->period_counter += count; if (isight->period_counter >= runtime->period_size) { @@ -111,7 +111,7 @@ static void isight_samples(struct isight *isight, struct snd_pcm_runtime *runtime; unsigned int count1; - if (!ACCESS_ONCE(isight->pcm_running)) + if (!READ_ONCE(isight->pcm_running)) return; runtime = isight->pcm->runtime; @@ -131,7 +131,7 @@ static void isight_samples(struct isight *isight, static void isight_pcm_abort(struct isight *isight) { - if (ACCESS_ONCE(isight->pcm_active)) + if (READ_ONCE(isight->pcm_active)) snd_pcm_stop_xrun(isight->pcm); } @@ -141,7 +141,7 @@ static void isight_dropped_samples(struct isight *isight, unsigned int total) u32 dropped; unsigned int count1; - if (!ACCESS_ONCE(isight->pcm_running)) + if (!READ_ONCE(isight->pcm_running)) return; runtime = isight->pcm->runtime; @@ -293,7 +293,7 @@ static int isight_hw_params(struct snd_pcm_substream *substream, if (err < 0) return err; - ACCESS_ONCE(isight->pcm_active) = true; + WRITE_ONCE(isight->pcm_active, true); return 0; } @@ -331,7 +331,7 @@ static int isight_hw_free(struct snd_pcm_substream *substream) { struct isight *isight = substream->private_data; - ACCESS_ONCE(isight->pcm_active) = false; + WRITE_ONCE(isight->pcm_active, false); mutex_lock(&isight->mutex); isight_stop_streaming(isight); @@ -424,10 +424,10 @@ static int isight_trigger(struct snd_pcm_substream *substream, int cmd) switch (cmd) { case SNDRV_PCM_TRIGGER_START: - ACCESS_ONCE(isight->pcm_running) = true; + WRITE_ONCE(isight->pcm_running, true); break; case SNDRV_PCM_TRIGGER_STOP: - ACCESS_ONCE(isight->pcm_running) = false; + WRITE_ONCE(isight->pcm_running, false); break; default: return -EINVAL; @@ -439,7 +439,7 @@ static snd_pcm_uframes_t isight_pointer(struct snd_pcm_substream *substream) { struct isight *isight = substream->private_data; - return ACCESS_ONCE(isight->buffer_pointer); + return READ_ONCE(isight->buffer_pointer); } static int isight_create_pcm(struct isight *isight) diff --git a/sound/firewire/motu/amdtp-motu.c b/sound/firewire/motu/amdtp-motu.c index 96f0091144bb..f0555a24d90e 100644 --- a/sound/firewire/motu/amdtp-motu.c +++ b/sound/firewire/motu/amdtp-motu.c @@ -310,7 +310,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, if (p->midi_ports) read_midi_messages(s, buffer, data_blocks); - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (data_blocks > 0 && pcm) read_pcm_s32(s, pcm->runtime, buffer, data_blocks); @@ -374,7 +374,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, if (p->midi_ports) write_midi_messages(s, buffer, data_blocks); - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) write_pcm_s32(s, pcm->runtime, buffer, data_blocks); else diff --git a/sound/firewire/oxfw/oxfw-scs1x.c b/sound/firewire/oxfw/oxfw-scs1x.c index 02d595665898..f33497cdc706 100644 --- 
a/sound/firewire/oxfw/oxfw-scs1x.c +++ b/sound/firewire/oxfw/oxfw-scs1x.c @@ -112,7 +112,7 @@ static void handle_hss(struct fw_card *card, struct fw_request *request, } if (length >= 1) { - stream = ACCESS_ONCE(scs->input); + stream = READ_ONCE(scs->input); if (stream) midi_input_packet(scs, stream, data, length); } @@ -183,7 +183,7 @@ static void scs_output_work(struct work_struct *work) if (scs->transaction_running) return; - stream = ACCESS_ONCE(scs->output); + stream = READ_ONCE(scs->output); if (!stream || scs->error) { scs->output_idle = true; wake_up(&scs->idle_wait); @@ -291,9 +291,9 @@ static void midi_capture_trigger(struct snd_rawmidi_substream *stream, int up) if (up) { scs->input_escape_count = 0; - ACCESS_ONCE(scs->input) = stream; + WRITE_ONCE(scs->input, stream); } else { - ACCESS_ONCE(scs->input) = NULL; + WRITE_ONCE(scs->input, NULL); } } @@ -319,10 +319,10 @@ static void midi_playback_trigger(struct snd_rawmidi_substream *stream, int up) scs->transaction_bytes = 0; scs->error = false; - ACCESS_ONCE(scs->output) = stream; + WRITE_ONCE(scs->output, stream); schedule_work(&scs->work); } else { - ACCESS_ONCE(scs->output) = NULL; + WRITE_ONCE(scs->output, NULL); } } static void midi_playback_drain(struct snd_rawmidi_substream *stream) diff --git a/sound/firewire/tascam/amdtp-tascam.c b/sound/firewire/tascam/amdtp-tascam.c index 6aff1fc1c72d..ab482423c165 100644 --- a/sound/firewire/tascam/amdtp-tascam.c +++ b/sound/firewire/tascam/amdtp-tascam.c @@ -124,7 +124,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, { struct snd_pcm_substream *pcm; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (data_blocks > 0 && pcm) read_pcm_s32(s, pcm, buffer, data_blocks); @@ -143,7 +143,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, /* This field is not used. */ *syt = 0x0000; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) write_pcm_s32(s, pcm, buffer, data_blocks); else diff --git a/sound/firewire/tascam/tascam-transaction.c b/sound/firewire/tascam/tascam-transaction.c index 8967c52f5032..2ad692dd4b13 100644 --- a/sound/firewire/tascam/tascam-transaction.c +++ b/sound/firewire/tascam/tascam-transaction.c @@ -148,7 +148,7 @@ static void async_midi_port_callback(struct fw_card *card, int rcode, void *callback_data) { struct snd_fw_async_midi_port *port = callback_data; - struct snd_rawmidi_substream *substream = ACCESS_ONCE(port->substream); + struct snd_rawmidi_substream *substream = READ_ONCE(port->substream); /* This port is closed. */ if (substream == NULL) @@ -173,7 +173,7 @@ static void midi_port_work(struct work_struct *work) { struct snd_fw_async_midi_port *port = container_of(work, struct snd_fw_async_midi_port, work); - struct snd_rawmidi_substream *substream = ACCESS_ONCE(port->substream); + struct snd_rawmidi_substream *substream = READ_ONCE(port->substream); int generation; /* Under transacting or error state. 
*/ @@ -282,7 +282,7 @@ static void handle_midi_tx(struct fw_card *card, struct fw_request *request, bytes = 3; } - substream = ACCESS_ONCE(tscm->tx_midi_substreams[port]); + substream = READ_ONCE(tscm->tx_midi_substreams[port]); if (substream != NULL) snd_rawmidi_receive(substream, b + 1, bytes); } diff --git a/sound/soc/xtensa/xtfpga-i2s.c b/sound/soc/xtensa/xtfpga-i2s.c index 8382ffa3bcaf..2472144b329e 100644 --- a/sound/soc/xtensa/xtfpga-i2s.c +++ b/sound/soc/xtensa/xtfpga-i2s.c @@ -165,7 +165,7 @@ static bool xtfpga_pcm_push_tx(struct xtfpga_i2s *i2s) tx_substream = rcu_dereference(i2s->tx_substream); tx_active = tx_substream && snd_pcm_running(tx_substream); if (tx_active) { - unsigned tx_ptr = ACCESS_ONCE(i2s->tx_ptr); + unsigned tx_ptr = READ_ONCE(i2s->tx_ptr); unsigned new_tx_ptr = i2s->tx_fn(i2s, tx_substream->runtime, tx_ptr); @@ -437,7 +437,7 @@ static int xtfpga_pcm_trigger(struct snd_pcm_substream *substream, int cmd) case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: - ACCESS_ONCE(i2s->tx_ptr) = 0; + WRITE_ONCE(i2s->tx_ptr, 0); rcu_assign_pointer(i2s->tx_substream, substream); xtfpga_pcm_refill_fifo(i2s); break; @@ -459,7 +459,7 @@ static snd_pcm_uframes_t xtfpga_pcm_pointer(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct xtfpga_i2s *i2s = runtime->private_data; - snd_pcm_uframes_t pos = ACCESS_ONCE(i2s->tx_ptr); + snd_pcm_uframes_t pos = READ_ONCE(i2s->tx_ptr); return pos < runtime->buffer_size ? pos : 0; } diff --git a/sound/usb/bcd2000/bcd2000.c b/sound/usb/bcd2000/bcd2000.c index 7371e5b06035..fc579f330601 100644 --- a/sound/usb/bcd2000/bcd2000.c +++ b/sound/usb/bcd2000/bcd2000.c @@ -108,7 +108,7 @@ static void bcd2000_midi_handle_input(struct bcd2000 *bcd2k, unsigned int payload_length, tocopy; struct snd_rawmidi_substream *midi_receive_substream; - midi_receive_substream = ACCESS_ONCE(bcd2k->midi_receive_substream); + midi_receive_substream = READ_ONCE(bcd2k->midi_receive_substream); if (!midi_receive_substream) return; @@ -139,7 +139,7 @@ static void bcd2000_midi_send(struct bcd2000 *bcd2k) BUILD_BUG_ON(sizeof(device_cmd_prefix) >= BUFSIZE); - midi_out_substream = ACCESS_ONCE(bcd2k->midi_out_substream); + midi_out_substream = READ_ONCE(bcd2k->midi_out_substream); if (!midi_out_substream) return; diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h index 328eeceec709..96e2d06cb031 100644 --- a/tools/arch/x86/include/asm/atomic.h +++ b/tools/arch/x86/include/asm/atomic.h @@ -24,7 +24,7 @@ */ static inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h index 5e9738f97bf3..97427e700e3b 100644 --- a/tools/include/asm-generic/atomic-gcc.h +++ b/tools/include/asm-generic/atomic-gcc.h @@ -21,7 +21,7 @@ */ static inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 33b5e6cdf38c..d19e11b68de7 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -378,7 +378,7 @@ struct addr_filters { static inline u64 auxtrace_mmap__read_snapshot_head(struct auxtrace_mmap *mm) { struct perf_event_mmap_page *pc = mm->userpg; - u64 head = ACCESS_ONCE(pc->aux_head); + u64 head = READ_ONCE(pc->aux_head); /* Ensure all reads are done after we read the 
head */ rmb(); @@ -389,7 +389,7 @@ static inline u64 auxtrace_mmap__read_head(struct auxtrace_mmap *mm) { struct perf_event_mmap_page *pc = mm->userpg; #if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT) - u64 head = ACCESS_ONCE(pc->aux_head); + u64 head = READ_ONCE(pc->aux_head); #else u64 head = __sync_val_compare_and_swap(&pc->aux_head, 0, 0); #endif diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 47b5e7dbcb18..aae9645c7122 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -113,7 +113,7 @@ int __perf_session__set_tracepoints_handlers(struct perf_session *session, extern volatile int session_done; -#define session_done() ACCESS_ONCE(session_done) +#define session_done() READ_ONCE(session_done) int perf_session__deliver_synth_event(struct perf_session *session, union perf_event *event, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9deb5a245b83..ce507ae1d4f5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2302,7 +2302,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; } else if (pass && i > last_boosted_vcpu) break; - if (!ACCESS_ONCE(vcpu->preempted)) + if (!READ_ONCE(vcpu->preempted)) continue; if (vcpu == me) continue; From 65e53aab6d54385dea799356defcbdcc456fb1a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Oct 2017 17:55:57 +0900 Subject: [PATCH 0584/1085] block: Use DECLARE_COMPLETION_ONSTACK() in submit_bio_wait() Simplify the code by getting rid of the submit_bio_ret structure. (This also helps address a lockdep false positive.) Signed-off-by: Christoph Hellwig Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- block/bio.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/block/bio.c b/block/bio.c index 101c2a9b5481..5e901bfc0576 100644 --- a/block/bio.c +++ b/block/bio.c @@ -917,17 +917,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); -struct submit_bio_ret { - struct completion event; - int error; -}; - static void submit_bio_wait_endio(struct bio *bio) { - struct submit_bio_ret *ret = bio->bi_private; - - ret->error = blk_status_to_errno(bio->bi_status); - complete(&ret->event); + complete(bio->bi_private); } /** @@ -943,16 +935,15 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - struct submit_bio_ret ret; + DECLARE_COMPLETION_ONSTACK(done); - init_completion(&ret.event); - bio->bi_private = &ret; + bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - wait_for_completion_io(&ret.event); + wait_for_completion_io(&done); - return ret.error; + return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); From 6f0397d7e100f3b3978d6ebb6b2dea29ee7c4a95 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:55:58 +0900 Subject: [PATCH 0585/1085] locking/lockdep: Provide empty lockdep_map structure for 
!CONFIG_LOCKDEP After this patch the lockdep_map structure takes no space if lockdep is disabled, reducing the number of #ifdefs in unrelated kernel code. Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index bfa8e0b0d6f1..b6662d05bcda 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -527,6 +527,11 @@ static inline void lockdep_on(void) */ struct lock_class_key { }; +/* + * The lockdep_map takes no space if lockdep is disabled: + */ +struct lockdep_map { }; + #define lockdep_depth(tsk) (0) #define lockdep_is_held_type(l, r) (1) From 24208435e343679b21502fb90786084dfaf15369 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:55:59 +0900 Subject: [PATCH 0586/1085] locking/lockdep, sched/completions: Change the prefix of lock name for completion variables CONFIG_LOCKDEP_COMPLETIONS uses "(complete)" as a prefix of lock name for completion variable. However, what we should use here is a noun - so use "(completion)" instead. Suggested-by: Ingo Molnar Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-4-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/completion.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/completion.h b/include/linux/completion.h index cae5400022a3..91218036b1c8 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -53,7 +53,7 @@ static inline void complete_release_commit(struct completion *x) do { \ static struct lock_class_key __key; \ lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - "(complete)" #x, \ + "(completion)" #x, \ &__key, 0); \ __init_completion(x); \ } while (0) @@ -67,7 +67,7 @@ static inline void complete_release_commit(struct completion *x) {} #ifdef CONFIG_LOCKDEP_COMPLETIONS #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ - STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) } + STATIC_CROSS_LOCKDEP_MAP_INIT("(completion)" #work, &(work)) } #else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } From d141babe4244945f1d001118578e0eb3ce12729d Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:00 +0900 Subject: [PATCH 0587/1085] locking/lockdep: Add a boot parameter allowing unwind in cross-release and disable it by default Johan Hovold reported a heavy performance regression caused by lockdep cross-release: > 
Boot time (from "Linux version" to login prompt) had in fact doubled > since 4.13 where it took 17 seconds (with my current config) compared to > the 35 seconds I now see with 4.14-rc4. > > A quick bisect pointed to lockdep and specifically the following commit: > > 28a903f63ec0 ("locking/lockdep: Handle non(or multi)-acquisition > of a crosslock") > > which I've verified is the commit which doubled the boot time (compared > to 28a903f63ec0^) (added by lockdep crossrelease series [1]). Currently cross-release performs unwind on every acquisition, but that is very expensive. This patch makes the unwind optional, disables it by default, and records only acquire_ip. Full stack traces are sometimes required for full analysis, in which case the boot parameter crossrelease_fullstack can be specified. On my qemu Ubuntu machine (x86_64, 4 cores, 512M), the regression was fixed. We measure boot times with 'perf stat --null --repeat 10 $QEMU', where $QEMU launches a kernel with init=/bin/true: 1. No lockdep enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.756558155 seconds time elapsed ( +- 0.09% ) 2. Lockdep enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.968710420 seconds time elapsed ( +- 0.12% ) 3. Lockdep enabled + cross-release enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 3.153839636 seconds time elapsed ( +- 0.31% ) 4. Lockdep enabled + cross-release enabled + this patch applied: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.963669551 seconds time elapsed ( +- 0.11% ) I.e. lockdep cross-release performance is now indistinguishable from vanilla lockdep. Bisected-by: Johan Hovold Analyzed-by: Thomas Gleixner Suggested-by: Thomas Gleixner Reported-by: Johan Hovold Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-5-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- .../admin-guide/kernel-parameters.txt | 3 +++ kernel/locking/lockdep.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05496622b4ef..f20ed5e0a720 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -709,6 +709,9 @@ It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G.
+ crossrelease_fullstack + [KNL] Allow to record full stack trace in cross-release + cryptomgr.notests [KNL] Disable crypto self-tests diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e36e652d996f..160b5d6df7cb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,6 +76,15 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +static int crossrelease_fullstack; +static int __init allow_crossrelease_fullstack(char *str) +{ + crossrelease_fullstack = 1; + return 0; +} + +early_param("crossrelease_fullstack", allow_crossrelease_fullstack); + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -4863,8 +4872,14 @@ static void add_xhlock(struct held_lock *hlock) xhlock->trace.nr_entries = 0; xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; xhlock->trace.entries = xhlock->trace_entries; - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); + + if (crossrelease_fullstack) { + xhlock->trace.skip = 3; + save_stack_trace(&xhlock->trace); + } else { + xhlock->trace.nr_entries = 1; + xhlock->trace.entries[0] = hlock->acquire_ip; + } } static inline int same_context_xhlock(struct hist_lock *xhlock) From 2dcd5adfb7401b762ddbe4b86dcacc2f3de6b97b Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:01 +0900 Subject: [PATCH 0588/1085] locking/lockdep: Remove the BROKEN flag from CONFIG_LOCKDEP_CROSSRELEASE and CONFIG_LOCKDEP_COMPLETIONS Now that the performance regression is fixed, re-enable CONFIG_LOCKDEP_CROSSRELEASE=y and CONFIG_LOCKDEP_COMPLETIONS=y. Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-6-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dfdad67d8f6c..c1e720a22c71 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1092,8 +1092,8 @@ config PROVE_LOCKING select DEBUG_MUTEXES select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC - select LOCKDEP_CROSSRELEASE if BROKEN - select LOCKDEP_COMPLETIONS if BROKEN + select LOCKDEP_CROSSRELEASE + select LOCKDEP_COMPLETIONS select TRACE_IRQFLAGS default n help From e121d64e16484d4a5eba94cd2fa9eb3848b7c9c2 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:02 +0900 Subject: [PATCH 0589/1085] locking/lockdep: Introduce CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK=y Add a Kconfig knob that enables the lockdep "crossrelease_fullstack" boot parameter. 
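As a usage sketch: on a kernel built with this knob disabled, full unwinding can still be requested for a single boot by appending the parameter to the kernel command line, e.g. in a QEMU invocation in the style of the measurements above (everything here except the crossrelease_fullstack token is illustrative):

    qemu-system-x86_64 -kernel bzImage -append "init=/bin/true crossrelease_fullstack"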
Suggested-by: Ingo Molnar Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-7-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++++ lib/Kconfig.debug | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 160b5d6df7cb..db933d063bfc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,7 +76,11 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK +static int crossrelease_fullstack = 1; +#else static int crossrelease_fullstack; +#endif static int __init allow_crossrelease_fullstack(char *str) { crossrelease_fullstack = 1; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c1e720a22c71..2b439a515c30 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1179,6 +1179,21 @@ config LOCKDEP_COMPLETIONS A deadlock caused by wait_for_completion() and complete() can be detected by lockdep using crossrelease feature. +config BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK + bool "Enable the boot parameter, crossrelease_fullstack" + depends on LOCKDEP_CROSSRELEASE + default n + help + The lockdep "cross-release" feature needs to record stack traces + (of calling functions) for all acquisitions, for eventual later + use during analysis. By default only a single caller is recorded, + because the unwind operation can be very expensive with deeper + stack chains. + + However a boot parameter, crossrelease_fullstack, was + introduced since sometimes deeper traces are required for full + analysis. This option turns on the boot parameter. + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP From a7967bc31584bd282682981295861e7bcba19e65 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:03 +0900 Subject: [PATCH 0590/1085] sched/completions: Add support for initializing completions with lockdep_map Sometimes we want to initialize completions with separate lockdep maps to assign lock classes as desired. For example, the workqueue code needs to directly manage lockdep maps, since only that code is aware of how to classify its lockdep maps properly. Provide additional macros for initializing completions in that way.
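To make the intended usage concrete, here is a minimal C sketch; the my_flusher structure is invented for illustration, while init_completion_map() is the macro added by the diff below. It assumes the embedded lockdep_map has already been set up (e.g. via lockdep_init_map()), since the macro reuses that map's name and class key:

    /* Hypothetical subsystem that owns both a lockdep map and a completion. */
    struct my_flusher {
            struct lockdep_map      lockdep_map;    /* already initialized */
            struct completion       done;
    };

    static void my_flusher_init(struct my_flusher *f)
    {
            /*
             * Share the existing map's lock class with the completion
             * instead of giving it a static key of its own, as plain
             * init_completion() would.
             */
            init_completion_map(&f->done, &f->lockdep_map);
    }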
Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-8-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/completion.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/completion.h b/include/linux/completion.h index 91218036b1c8..4da49916ef3f 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -49,6 +49,13 @@ static inline void complete_release_commit(struct completion *x) lock_commit_crosslock((struct lockdep_map *)&x->map); } +#define init_completion_map(x, m) \ +do { \ + lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ + (m)->name, (m)->key, 0); \ + __init_completion(x); \ +} while (0) + #define init_completion(x) \ do { \ static struct lock_class_key __key; \ @@ -58,6 +65,7 @@ do { \ __init_completion(x); \ } while (0) #else +#define init_completion_map(x, m) __init_completion(x) #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} @@ -73,6 +81,9 @@ static inline void complete_release_commit(struct completion *x) {} { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } #endif +#define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \ + (*({ init_completion_map(&(work), &(map)); &(work); })) + #define COMPLETION_INITIALIZER_ONSTACK(work) \ (*({ init_completion(&work); &work; })) @@ -102,8 +113,11 @@ static inline void complete_release_commit(struct completion *x) {} #ifdef CONFIG_LOCKDEP # define DECLARE_COMPLETION_ONSTACK(work) \ struct completion work = COMPLETION_INITIALIZER_ONSTACK(work) +# define DECLARE_COMPLETION_ONSTACK_MAP(work, map) \ + struct completion work = COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) #else # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) +# define DECLARE_COMPLETION_ONSTACK_MAP(work, map) DECLARE_COMPLETION(work) #endif /** From fd1a5b04dfb899f84ddeb8acdaea6b98283df1e5 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:04 +0900 Subject: [PATCH 0591/1085] workqueue: Remove now redundant lock acquisitions wrt. workqueue flushes The workqueue code added manual lock acquisition annotations to catch deadlocks. After lockdep cross-release was introduced, some of those became redundant, since wait_for_completion() already does the acquisition and tracking. Remove the duplicate annotations.
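Condensed from the flush_work() hunk below, the now-redundant pattern looks like this (a sketch, not the full function):

    /* Before: manually teach lockdep about the dependency ... */
    lock_map_acquire(&work->lockdep_map);
    lock_map_release(&work->lockdep_map);
    ...
    wait_for_completion(&barr.done);

    /*
     * After: barr.done is initialized via init_completion_map() from the
     * work's lockdep_map, so wait_for_completion() itself records the same
     * dependency through cross-release and the explicit acquire/release
     * pair adds nothing.
     */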
Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-9-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 19 +++---------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1c49431f3121..c8a572cb49be 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -218,7 +218,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ - lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0); \ + lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) @@ -398,7 +398,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, static struct lock_class_key __key; \ const char *__lock_name; \ \ - __lock_name = #fmt#args; \ + __lock_name = "(wq_completion)"#fmt#args; \ \ __alloc_workqueue_key((fmt), (flags), (max_active), \ &__key, __lock_name, ##args); \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 39831b2f3c5f..160fdc6e839a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2497,15 +2497,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - /* - * Explicitly init the crosslock for wq_barrier::done, make its lock - * key a subkey of the corresponding work. As a result we won't - * build a dependency between wq_barrier::done and unrelated work. - */ - lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, - "(complete)wq_barr::done", - target->lockdep_map.key, 1); - __init_completion(&barr->done); + init_completion_map(&barr->done, &target->lockdep_map); + barr->task = current; /* @@ -2611,16 +2604,13 @@ void flush_workqueue(struct workqueue_struct *wq) struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), }; int next_color; if (WARN_ON(!wq_online)) return; - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->mutex); /* @@ -2883,9 +2873,6 @@ bool flush_work(struct work_struct *work) if (WARN_ON(!wq_online)) return false; - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); From 54e2fc28d9cf1be7dd2ebe74b20dc20cc2a3e55d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 25 Oct 2017 19:25:09 +0800 Subject: [PATCH 0592/1085] spi: sprd: Fix the possible negative value of BIT() When enabling the ADI hardware channels, if the channel id is 31, then we end up passing the negative value -1 to the BIT() macro, which will write an incorrect value to the register.
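The failure mode can be sketched as follows; BIT() is shown with its usual kernel definition, and the else-branch arithmetic is inferred from the commit description rather than quoted from the driver:

    #define BIT(nr) (1UL << (nr))

    if (chn_id < 31) {                      /* old bound excludes channel 31 */
            value |= BIT(chn_id);           /* fine for channels 0..30 */
    } else {
            value |= BIT(chn_id - 32);      /* chn_id == 31 yields BIT(-1):
                                             * a negative shift count, which
                                             * is undefined behaviour in C */
    }

With the bound corrected to chn_id < 32, channel 31 stays on the BIT(chn_id) path and the negative shift can no longer occur.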
Fixes: 7e2903cb91df ("spi: Add ADI driver for Spreadtrum platform") Reported-by: Dan Carpenter Signed-off-by: Baolin Wang Signed-off-by: Mark Brown --- drivers/spi/spi-sprd-adi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-sprd-adi.c b/drivers/spi/spi-sprd-adi.c index 6a5ff3003044..5993bdbf79e4 100644 --- a/drivers/spi/spi-sprd-adi.c +++ b/drivers/spi/spi-sprd-adi.c @@ -303,7 +303,7 @@ static void sprd_adi_hw_init(struct sprd_adi *sadi) writel_relaxed(chn_config, sadi->base + REG_ADI_CHN_ADDR(chn_id)); - if (chn_id < 31) { + if (chn_id < 32) { value = readl_relaxed(sadi->base + REG_ADI_CHN_EN); value |= BIT(chn_id); writel_relaxed(value, sadi->base + REG_ADI_CHN_EN); From bf36eb5c4b3ef0ebfb19b1a67a5fa5821e6c9fa7 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Fri, 20 Oct 2017 12:14:47 -0300 Subject: [PATCH 0593/1085] perf report: Properly handle branch count in match_chain() Some of the code paths I introduced before returned too early without running the code to handle a node's branch count. By refactoring match_chain to only have one exit point, this can be remedied. Signed-off-by: Milian Wolff Acked-by: Namhyung Kim Cc: David Ahern Cc: Jin Yao Cc: Peter Zijlstra Cc: Ravi Bangoria Link: http://lkml.kernel.org/r/1707691.qaJ269GSZW@agathebauer Link: http://lkml.kernel.org/r/20171018185350.14893-2-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 150 ++++++++++++++++++++---------------- 1 file changed, 83 insertions(+), 67 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 35a920f09503..19bfcadcf891 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -666,83 +666,99 @@ static enum match_result match_chain_strings(const char *left, return ret; } +/* + * We need to always use relative addresses because we're aggregating + * callchains from multiple threads, i.e. different address spaces, so + * comparing absolute addresses make no sense as a symbol in a DSO may end up + * in a different address when used in a different binary or even the same + * binary but with some sort of address randomization technique, thus we need + * to compare just relative addresses. -acme + */ +static enum match_result match_chain_dso_addresses(struct map *left_map, u64 left_ip, + struct map *right_map, u64 right_ip) +{ + struct dso *left_dso = left_map ? left_map->dso : NULL; + struct dso *right_dso = right_map ? right_map->dso : NULL; + + if (left_dso != right_dso) + return left_dso < right_dso ? MATCH_LT : MATCH_GT; + + if (left_ip != right_ip) + return left_ip < right_ip ? 
MATCH_LT : MATCH_GT; + + return MATCH_EQ; +} + static enum match_result match_chain(struct callchain_cursor_node *node, struct callchain_list *cnode) { - struct symbol *sym = node->sym; - u64 left, right; - struct dso *left_dso = NULL; - struct dso *right_dso = NULL; - - if (callchain_param.key == CCKEY_SRCLINE) { - enum match_result match = match_chain_strings(cnode->srcline, - node->srcline); - - /* if no srcline is available, fallback to symbol name */ - if (match == MATCH_ERROR && cnode->ms.sym && node->sym) - match = match_chain_strings(cnode->ms.sym->name, - node->sym->name); + enum match_result match = MATCH_ERROR; + switch (callchain_param.key) { + case CCKEY_SRCLINE: + match = match_chain_strings(cnode->srcline, node->srcline); if (match != MATCH_ERROR) - return match; - - /* otherwise fall-back to IP-based comparison below */ - } - - if (cnode->ms.sym && sym && callchain_param.key == CCKEY_FUNCTION) { - /* - * Compare inlined frames based on their symbol name because - * different inlined frames will have the same symbol start - */ - if (cnode->ms.sym->inlined || node->sym->inlined) - return match_chain_strings(cnode->ms.sym->name, - node->sym->name); - - left = cnode->ms.sym->start; - right = sym->start; - left_dso = cnode->ms.map->dso; - right_dso = node->map->dso; - } else { - left = cnode->ip; - right = node->ip; - } - - if (left == right && left_dso == right_dso) { - if (node->branch) { - cnode->branch_count++; - - if (node->branch_from) { - /* - * It's "to" of a branch - */ - cnode->brtype_stat.branch_to = true; - - if (node->branch_flags.predicted) - cnode->predicted_count++; - - if (node->branch_flags.abort) - cnode->abort_count++; - - branch_type_count(&cnode->brtype_stat, - &node->branch_flags, - node->branch_from, - node->ip); + break; + /* otherwise fall-back to symbol-based comparison below */ + __fallthrough; + case CCKEY_FUNCTION: + if (node->sym && cnode->ms.sym) { + /* + * Compare inlined frames based on their symbol name + * because different inlined frames will have the same + * symbol start. Otherwise do a faster comparison based + * on the symbol start address. + */ + if (cnode->ms.sym->inlined || node->sym->inlined) { + match = match_chain_strings(cnode->ms.sym->name, + node->sym->name); + if (match != MATCH_ERROR) + break; } else { - /* - * It's "from" of a branch - */ - cnode->brtype_stat.branch_to = false; - cnode->cycles_count += - node->branch_flags.cycles; - cnode->iter_count += node->nr_loop_iter; - cnode->iter_cycles += node->iter_cycles; + match = match_chain_dso_addresses(cnode->ms.map, cnode->ms.sym->start, + node->map, node->sym->start); + break; } } - - return MATCH_EQ; + /* otherwise fall-back to IP-based comparison below */ + __fallthrough; + case CCKEY_ADDRESS: + default: + match = match_chain_dso_addresses(cnode->ms.map, cnode->ip, node->map, node->ip); + break; } - return left > right ? 
MATCH_GT : MATCH_LT; + if (match == MATCH_EQ && node->branch) { + cnode->branch_count++; + + if (node->branch_from) { + /* + * It's "to" of a branch + */ + cnode->brtype_stat.branch_to = true; + + if (node->branch_flags.predicted) + cnode->predicted_count++; + + if (node->branch_flags.abort) + cnode->abort_count++; + + branch_type_count(&cnode->brtype_stat, + &node->branch_flags, + node->branch_from, + node->ip); + } else { + /* + * It's "from" of a branch + */ + cnode->brtype_stat.branch_to = false; + cnode->cycles_count += node->branch_flags.cycles; + cnode->iter_count += node->nr_loop_iter; + cnode->iter_cycles += node->iter_cycles; + } + } + + return match; } /* From b38775cf7678d7715b35dded3dcfab66e244baae Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Thu, 19 Oct 2017 13:38:33 +0200 Subject: [PATCH 0594/1085] perf report: Cache failed lookups of inlined frames When no inlined frames could be found for a given address, we did not store this information anywhere. That means we potentially do the costly inliner lookup repeatedly for cases where we know it can never succeed. This patch makes dso__parse_addr_inlines always return a valid inline_node. It will be empty when no inliners are found. This enables us to cache the empty list in the DSO, thereby improving the performance when many addresses fail to find the inliners. For my trivial example, the performance impact is already quite significant: Before: ~~~~~ Performance counter stats for 'perf report --stdio --inline -g srcline -s srcline' (5 runs): 594.804032 task-clock (msec) # 0.998 CPUs utilized ( +- 0.07% ) 53 context-switches # 0.089 K/sec ( +- 4.09% ) 0 cpu-migrations # 0.000 K/sec ( +-100.00% ) 5,687 page-faults # 0.010 M/sec ( +- 0.02% ) 2,300,918,213 cycles # 3.868 GHz ( +- 0.09% ) 4,395,839,080 instructions # 1.91 insn per cycle ( +- 0.00% ) 939,177,205 branches # 1578.969 M/sec ( +- 0.00% ) 11,824,633 branch-misses # 1.26% of all branches ( +- 0.10% ) 0.596246531 seconds time elapsed ( +- 0.07% ) ~~~~~ After: ~~~~~ Performance counter stats for 'perf report --stdio --inline -g srcline -s srcline' (5 runs): 113.111405 task-clock (msec) # 0.990 CPUs utilized ( +- 0.89% ) 29 context-switches # 0.255 K/sec ( +- 54.25% ) 0 cpu-migrations # 0.000 K/sec 5,380 page-faults # 0.048 M/sec ( +- 0.01% ) 432,378,779 cycles # 3.823 GHz ( +- 0.75% ) 670,057,633 instructions # 1.55 insn per cycle ( +- 0.01% ) 141,001,247 branches # 1246.570 M/sec ( +- 0.01% ) 2,346,845 branch-misses # 1.66% of all branches ( +- 0.19% ) 0.114222393 seconds time elapsed ( +- 1.19% ) ~~~~~ Signed-off-by: Milian Wolff Reviewed-by: Andi Kleen Cc: David Ahern Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171019113836.5548-3-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 15 +++++++-------- tools/perf/util/srcline.c | 16 +--------------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 3d049cb313ac..177c1d4088f8 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2115,9 +2115,10 @@ static int append_inlines(struct callchain_cursor *cursor, struct inline_node *inline_node; struct inline_list *ilist; u64 addr; + int ret = 1; if (!symbol_conf.inline_name || !map || !sym) - return 1; + return ret; addr = map__rip_2objdump(map, ip); @@ -2125,22 +2126,20 @@ static int append_inlines(struct callchain_cursor *cursor, if (!inline_node) { inline_node = 
dso__parse_addr_inlines(map->dso, addr, sym); if (!inline_node) - return 1; - + return ret; inlines__tree_insert(&map->dso->inlined_nodes, inline_node); } list_for_each_entry(ilist, &inline_node->val, list) { - int ret = callchain_cursor_append(cursor, ip, map, - ilist->symbol, false, - NULL, 0, 0, 0, - ilist->srcline); + ret = callchain_cursor_append(cursor, ip, map, + ilist->symbol, false, + NULL, 0, 0, 0, ilist->srcline); if (ret != 0) return ret; } - return 0; + return ret; } static int unwind_entry(struct unwind_entry *entry, void *arg) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 8bea6621d657..fc3888664b20 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -353,17 +353,8 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, INIT_LIST_HEAD(&node->val); node->addr = addr; - if (!addr2line(dso_name, addr, NULL, NULL, dso, TRUE, node, sym)) - goto out_free_inline_node; - - if (list_empty(&node->val)) - goto out_free_inline_node; - + addr2line(dso_name, addr, NULL, NULL, dso, true, node, sym); return node; - -out_free_inline_node: - inline_node__delete(node); - return NULL; } #else /* HAVE_LIBBFD_SUPPORT */ @@ -480,11 +471,6 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, out: pclose(fp); - if (list_empty(&node->val)) { - inline_node__delete(node); - return NULL; - } - return node; } From 21ac9d547fdde79c1e8692587d9044fde549214b Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Thu, 19 Oct 2017 13:38:34 +0200 Subject: [PATCH 0595/1085] perf report: Cache srclines for callchain nodes On one hand this ensures that the memory is properly freed when the DSO gets freed. On the other hand this significantly speeds up the processing of the callchain nodes when lots of srclines are requested. For one of my data files e.g.: Before: Performance counter stats for 'perf report -s srcline -g srcline --stdio': 52496.495043 task-clock (msec) # 0.999 CPUs utilized 634 context-switches # 0.012 K/sec 2 cpu-migrations # 0.000 K/sec 191,561 page-faults # 0.004 M/sec 165,074,498,235 cycles # 3.144 GHz 334,170,832,408 instructions # 2.02 insn per cycle 90,220,029,745 branches # 1718.591 M/sec 654,525,177 branch-misses # 0.73% of all branches 52.533273822 seconds time elapsed Processed 236605 events and lost 40 chunks! After: Performance counter stats for 'perf report -s srcline -g srcline --stdio': 22606.323706 task-clock (msec) # 1.000 CPUs utilized 31 context-switches # 0.001 K/sec 0 cpu-migrations # 0.000 K/sec 185,471 page-faults # 0.008 M/sec 71,188,113,681 cycles # 3.149 GHz 133,204,943,083 instructions # 1.87 insn per cycle 34,886,384,979 branches # 1543.214 M/sec 278,214,495 branch-misses # 0.80% of all branches 22.609857253 seconds time elapsed Note that the difference is only this large when `--inline` is not passed. In such situations, we would use the inliner cache and thus do not run this code path that often. I think that this cache should actually be used in other places, too. When looking at the valgrind leak report for perf report, we see tons of srclines being leaked, most notably from calls to hist_entry__get_srcline. The problem is that get_srcline has many different formatting options (show_sym, show_addr, potentially even unwind_inlines when calling __get_srcline directly). As such, the srcline cannot easily be cached for all calls, or we'd have to add caches for all formatting combinations (6 so far).
An alternative would be to remove the formatting options and handle that on a different level - i.e. print the sym/addr on demand wherever we actually output something. And the unwind_inlines could be moved into a separate function that does not return the srcline. Signed-off-by: Milian Wolff Reviewed-by: Andi Kleen Cc: David Ahern Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171019113836.5548-4-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 2 ++ tools/perf/util/dso.h | 1 + tools/perf/util/machine.c | 19 ++++++++--- tools/perf/util/srcline.c | 66 +++++++++++++++++++++++++++++++++++++++ tools/perf/util/srcline.h | 7 +++++ 5 files changed, 91 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 75c8250b3b8a..3192b608e91b 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1203,6 +1203,7 @@ struct dso *dso__new(const char *name) dso->symbols[i] = dso->symbol_names[i] = RB_ROOT; dso->data.cache = RB_ROOT; dso->inlined_nodes = RB_ROOT; + dso->srclines = RB_ROOT; dso->data.fd = -1; dso->data.status = DSO_DATA_STATUS_UNKNOWN; dso->symtab_type = DSO_BINARY_TYPE__NOT_FOUND; @@ -1237,6 +1238,7 @@ void dso__delete(struct dso *dso) /* free inlines first, as they reference symbols */ inlines__tree_delete(&dso->inlined_nodes); + srcline__tree_delete(&dso->srclines); for (i = 0; i < MAP__NR_TYPES; ++i) symbols__delete(&dso->symbols[i]); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 122eca0d242d..821b16c67030 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -142,6 +142,7 @@ struct dso { struct rb_root symbols[MAP__NR_TYPES]; struct rb_root symbol_names[MAP__NR_TYPES]; struct rb_root inlined_nodes; + struct rb_root srclines; struct { u64 addr; struct symbol *symbol; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 177c1d4088f8..94d8f1ccedd9 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1711,11 +1711,22 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, static char *callchain_srcline(struct map *map, struct symbol *sym, u64 ip) { - if (!map || callchain_param.key == CCKEY_FUNCTION) - return NULL; + char *srcline = NULL; - return get_srcline(map->dso, map__rip_2objdump(map, ip), - sym, false, callchain_param.key == CCKEY_ADDRESS); + if (!map || callchain_param.key == CCKEY_FUNCTION) + return srcline; + + srcline = srcline__tree_find(&map->dso->srclines, ip); + if (!srcline) { + bool show_sym = false; + bool show_addr = callchain_param.key == CCKEY_ADDRESS; + + srcline = get_srcline(map->dso, map__rip_2objdump(map, ip), + sym, show_sym, show_addr); + srcline__tree_insert(&map->dso->srclines, ip, srcline); + } + + return srcline; } struct iterations { diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index fc3888664b20..c143c3bc1ef8 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -542,6 +542,72 @@ char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym, return __get_srcline(dso, addr, sym, show_sym, show_addr, false); } +struct srcline_node { + u64 addr; + char *srcline; + struct rb_node rb_node; +}; + +void srcline__tree_insert(struct rb_root *tree, u64 addr, char *srcline) +{ + struct rb_node **p = &tree->rb_node; + struct rb_node *parent = NULL; + struct srcline_node *i, *node; + + node = zalloc(sizeof(struct srcline_node)); + if (!node) { + perror("not enough memory for the srcline node"); + return; + } + 
+ node->addr = addr; + node->srcline = srcline; + + while (*p != NULL) { + parent = *p; + i = rb_entry(parent, struct srcline_node, rb_node); + if (addr < i->addr) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->rb_node, parent, p); + rb_insert_color(&node->rb_node, tree); +} + +char *srcline__tree_find(struct rb_root *tree, u64 addr) +{ + struct rb_node *n = tree->rb_node; + + while (n) { + struct srcline_node *i = rb_entry(n, struct srcline_node, + rb_node); + + if (addr < i->addr) + n = n->rb_left; + else if (addr > i->addr) + n = n->rb_right; + else + return i->srcline; + } + + return NULL; +} + +void srcline__tree_delete(struct rb_root *tree) +{ + struct srcline_node *pos; + struct rb_node *next = rb_first(tree); + + while (next) { + pos = rb_entry(next, struct srcline_node, rb_node); + next = rb_next(&pos->rb_node); + rb_erase(&pos->rb_node, tree); + free_srcline(pos->srcline); + zfree(&pos); + } +} + struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr, struct symbol *sym) { diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index ebe38cd22294..1c4d6210860b 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -15,6 +15,13 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym, bool show_sym, bool show_addr, bool unwind_inlines); void free_srcline(char *srcline); +/* insert the srcline into the DSO, which will take ownership */ +void srcline__tree_insert(struct rb_root *tree, u64 addr, char *srcline); +/* find previously inserted srcline */ +char *srcline__tree_find(struct rb_root *tree, u64 addr); +/* delete all srclines within the tree */ +void srcline__tree_delete(struct rb_root *tree); + #define SRCLINE_UNKNOWN ((char *) "??:0") struct inline_list { From 1fb7d06a509e82893e59e0f0b223e7d5d6d0ef8c Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Thu, 19 Oct 2017 13:38:35 +0200 Subject: [PATCH 0596/1085] perf report: Use srcline from callchain for hist entries This also removes the symbol name from the srcline column, more on this below. This ensures we use the correct srcline, which could originate from a potentially inlined function. The hist entries used to query for the srcline based purely on the IP, which leads to wrong results for inlined entries. Before: ~~~~~ perf report --inline -s srcline -g none --stdio ... # Children Self Source:Line # ........ ........ .................................................................................................................................. # 94.23% 0.00% __libc_start_main+18446603487898210537 94.23% 0.00% _start+41 44.58% 0.00% main+100 44.58% 0.00% std::_Norm_helper::_S_do_it+100 44.58% 0.00% std::__complex_abs+100 44.58% 0.00% std::abs+100 44.58% 0.00% std::norm+100 36.01% 0.00% hypot+18446603487892193300 25.81% 0.00% main+41 25.81% 0.00% std::__detail::_Adaptor, double>::operator()+41 25.81% 0.00% std::uniform_real_distribution::operator() >+41 25.75% 25.75% random.h:143 18.39% 0.00% main+57 18.39% 0.00% std::__detail::_Adaptor, double>::operator()+57 18.39% 0.00% std::uniform_real_distribution::operator() >+57 13.80% 13.80% random.tcc:3330 5.64% 0.00% ??:0 4.13% 4.13% __hypot_finite+163 4.13% 0.00% __hypot_finite+18446603487892193443 ... ~~~~~ After: ~~~~~ perf report --inline -s srcline -g none --stdio ... # Children Self Source:Line # ........ ........ ........................................... 
# 94.30% 1.19% main.cpp:39 94.23% 0.00% __libc_start_main+18446603487898210537 94.23% 0.00% _start+41 48.44% 1.70% random.h:1823 48.44% 0.00% random.h:1814 46.74% 2.53% random.h:185 44.68% 0.10% complex:589 44.68% 0.00% complex:597 44.68% 0.00% complex:654 44.68% 0.00% complex:664 40.61% 13.80% random.tcc:3330 36.01% 0.00% hypot+18446603487892193300 26.81% 0.00% random.h:151 26.81% 0.00% random.h:332 25.75% 25.75% random.h:143 5.64% 0.00% ??:0 4.13% 4.13% __hypot_finite+163 4.13% 0.00% __hypot_finite+18446603487892193443 ... ~~~~~ Note that this change removes the symbol from the source:line hist column. If this information is desired, users should explicitly query for it if needed. I.e. run this command instead: ~~~~~ perf report --inline -s sym,srcline -g none --stdio ... # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 1K of event 'cycles:uppp' # Event count (approx.): 1381229476 # # Children Self Symbol Source:Line # ........ ........ ................................................................................................................................... ........................................... # 94.30% 1.19% [.] main main.cpp:39 94.23% 0.00% [.] __libc_start_main __libc_start_main+18446603487898210537 94.23% 0.00% [.] _start _start+41 48.44% 0.00% [.] std::uniform_real_distribution::operator() > (inlined) random.h:1814 48.44% 0.00% [.] std::uniform_real_distribution::operator() > (inlined) random.h:1823 46.74% 0.00% [.] std::__detail::_Adaptor, double>::operator() (inlined) random.h:185 44.68% 0.00% [.] std::_Norm_helper::_S_do_it (inlined) complex:654 44.68% 0.00% [.] std::__complex_abs (inlined) complex:589 44.68% 0.00% [.] std::abs (inlined) complex:597 44.68% 0.00% [.] std::norm (inlined) complex:664 39.80% 13.59% [.] std::generate_canonical > random.tcc:3330 36.01% 0.00% [.] hypot hypot+18446603487892193300 26.81% 0.00% [.] std::__detail::__mod (inlined) random.h:151 26.81% 0.00% [.] std::linear_congruential_engine::operator() (inlined) random.h:332 25.75% 0.00% [.] std::__detail::_Mod::__calc (inlined) random.h:143 25.19% 25.19% [.] std::generate_canonical > random.h:143 4.13% 4.13% [.] __hypot_finite __hypot_finite+163 4.13% 0.00% [.] __hypot_finite __hypot_finite+18446603487892193443 ... ~~~~~ Compared to the old behavior, this reduces duplication in the output. Before we used to print the symbol name in the srcline column even when the sym column was explicitly requested. I.e. the output was: ~~~~~ perf report --inline -s sym,srcline -g none --stdio ... # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 1K of event 'cycles:uppp' # Event count (approx.): 1381229476 # # Children Self Symbol Source:Line # ........ ........ ................................................................................................................................... .................................................................................................................................. # 94.23% 0.00% [.] __libc_start_main __libc_start_main+18446603487898210537 94.23% 0.00% [.] _start _start+41 44.58% 0.00% [.] main main+100 44.58% 0.00% [.] std::_Norm_helper::_S_do_it (inlined) std::_Norm_helper::_S_do_it+100 44.58% 0.00% [.] std::__complex_abs (inlined) std::__complex_abs+100 44.58% 0.00% [.] std::abs (inlined) std::abs+100 44.58% 0.00% [.] std::norm (inlined) std::norm+100 36.01% 0.00% [.] 
hypot hypot+18446603487892193300 25.81% 0.00% [.] main main+41 25.81% 0.00% [.] std::__detail::_Adaptor, double>::operator() (inlined) std::__detail::_Adaptor, double>::operator()+41 25.81% 0.00% [.] std::uniform_real_distribution::operator() > (inlined) std::uniform_real_distribution::operator() >+41 25.69% 25.69% [.] std::generate_canonical > random.h:143 18.39% 0.00% [.] main main+57 18.39% 0.00% [.] std::__detail::_Adaptor, double>::operator() (inlined) std::__detail::_Adaptor, double>::operator()+57 18.39% 0.00% [.] std::uniform_real_distribution::operator() > (inlined) std::uniform_real_distribution::operator() >+57 13.80% 13.80% [.] std::generate_canonical > random.tcc:3330 4.13% 4.13% [.] __hypot_finite __hypot_finite+163 4.13% 0.00% [.] __hypot_finite __hypot_finite+18446603487892193443 ... ~~~~~ Signed-off-by: Milian Wolff Reviewed-by: Andi Kleen Cc: David Ahern Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171019113836.5548-5-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 1 + tools/perf/util/event.c | 1 + tools/perf/util/hist.c | 2 ++ tools/perf/util/symbol.h | 1 + 4 files changed, 5 insertions(+) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 19bfcadcf891..3a3916934a92 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1090,6 +1090,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node * { al->map = node->map; al->sym = node->sym; + al->srcline = node->srcline; if (node->map) al->addr = node->map->map_ip(node->map, node->ip); else diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 47eff4767edb..3c411e7e36aa 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -1604,6 +1604,7 @@ int machine__resolve(struct machine *machine, struct addr_location *al, al->sym = NULL; al->cpu = sample->cpu; al->socket = -1; + al->srcline = NULL; if (al->cpu >= 0) { struct perf_env *env = machine->env; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index b0fa9c217e1c..25d143053ab5 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -596,6 +596,7 @@ __hists__add_entry(struct hists *hists, .map = al->map, .sym = al->sym, }, + .srcline = al->srcline ? strdup(al->srcline) : NULL, .socket = al->socket, .cpu = al->cpu, .cpumode = al->cpumode, @@ -950,6 +951,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter, .map = al->map, .sym = al->sym, }, + .srcline = al->srcline ? strdup(al->srcline) : NULL, .parent = iter->parent, .raw_data = sample->raw_data, .raw_size = sample->raw_size, diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index d880a059babb..d548ea5cb418 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -209,6 +209,7 @@ struct addr_location { struct thread *thread; struct map *map; struct symbol *sym; + const char *srcline; u64 addr; char level; u8 filtered; From d8a88dd243a170a226aba33e7c53704db2f82aa6 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Thu, 19 Oct 2017 13:38:36 +0200 Subject: [PATCH 0597/1085] perf util: Enable handling of inlined frames by default Now that we have caches in place to speed up the process of finding inlined frames and srcline information repeatedly, we can enable this useful option by default. 
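In terms of day-to-day usage the change below means inline stacks no longer have to be requested explicitly, and the old behaviour needs an explicit opt-out (flag names taken from the documentation hunks below):

    perf report              # inline stacks shown by default now
    perf report --inline     # same as the above
    perf report --no-inline  # restores the previous default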
Suggested-by: Ingo Molnar Signed-off-by: Milian Wolff Reviewed-by: Andi Kleen Cc: David Ahern Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171019113836.5548-6-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 3 ++- tools/perf/Documentation/perf-script.txt | 3 ++- tools/perf/util/symbol.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 383a98d992ed..ddde2b54af57 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -434,7 +434,8 @@ include::itrace.txt[] --inline:: If a callgraph address belongs to an inlined function, the inline stack - will be printed. Each entry is function name or file/line. + will be printed. Each entry is function name or file/line. Enabled by + default, disable with --no-inline. include::callchain-overhead-calculation.txt[] diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index bcc1ba35a2d8..25e677344728 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -327,7 +327,8 @@ include::itrace.txt[] --inline:: If a callgraph address belongs to an inlined function, the inline stack - will be printed. Each entry has function name and file/line. + will be printed. Each entry has function name and file/line. Enabled by + default, disable with --no-inline. SEE ALSO -------- diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 066e38aa4063..ce6993bebf8c 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -45,6 +45,7 @@ struct symbol_conf symbol_conf = { .show_hist_headers = true, .symfs = "", .event_group = true, + .inline_name = true, }; static enum dso_binary_type binary_type_symtab[] = { From e319e1fbd9d42420ab6eec0bfd75eb9ad7ca63b1 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:05 +0900 Subject: [PATCH 0598/1085] block, locking/lockdep: Assign a lock_class per gendisk used for wait_for_completion() Darrick posted the following warning and Dave Chinner analyzed it: > ====================================================== > WARNING: possible circular locking dependency detected > 4.14.0-rc1-fixes #1 Tainted: G W > ------------------------------------------------------ > loop0/31693 is trying to acquire lock: > (&(&ip->i_mmaplock)->mr_lock){++++}, at: [] xfs_ilock+0x23c/0x330 [xfs] > > but now in release context of a crosslock acquired at the following: > ((complete)&ret.event){+.+.}, at: [] submit_bio_wait+0x7f/0xb0 > > which lock already depends on the new lock. 
> > the existing dependency chain (in reverse order) is: > > -> #2 ((complete)&ret.event){+.+.}: > lock_acquire+0xab/0x200 > wait_for_completion_io+0x4e/0x1a0 > submit_bio_wait+0x7f/0xb0 > blkdev_issue_zeroout+0x71/0xa0 > xfs_bmapi_convert_unwritten+0x11f/0x1d0 [xfs] > xfs_bmapi_write+0x374/0x11f0 [xfs] > xfs_iomap_write_direct+0x2ac/0x430 [xfs] > xfs_file_iomap_begin+0x20d/0xd50 [xfs] > iomap_apply+0x43/0xe0 > dax_iomap_rw+0x89/0xf0 > xfs_file_dax_write+0xcc/0x220 [xfs] > xfs_file_write_iter+0xf0/0x130 [xfs] > __vfs_write+0xd9/0x150 > vfs_write+0xc8/0x1c0 > SyS_write+0x45/0xa0 > entry_SYSCALL_64_fastpath+0x1f/0xbe > > -> #1 (&xfs_nondir_ilock_class){++++}: > lock_acquire+0xab/0x200 > down_write_nested+0x4a/0xb0 > xfs_ilock+0x263/0x330 [xfs] > xfs_setattr_size+0x152/0x370 [xfs] > xfs_vn_setattr+0x6b/0x90 [xfs] > notify_change+0x27d/0x3f0 > do_truncate+0x5b/0x90 > path_openat+0x237/0xa90 > do_filp_open+0x8a/0xf0 > do_sys_open+0x11c/0x1f0 > entry_SYSCALL_64_fastpath+0x1f/0xbe > > -> #0 (&(&ip->i_mmaplock)->mr_lock){++++}: > up_write+0x1c/0x40 > xfs_iunlock+0x1d0/0x310 [xfs] > xfs_file_fallocate+0x8a/0x310 [xfs] > loop_queue_work+0xb7/0x8d0 > kthread_worker_fn+0xb9/0x1f0 > > Chain exists of: > &(&ip->i_mmaplock)->mr_lock --> &xfs_nondir_ilock_class --> (complete)&ret.event > > Possible unsafe locking scenario by crosslock: > > CPU0 CPU1 > ---- ---- > lock(&xfs_nondir_ilock_class); > lock((complete)&ret.event); > lock(&(&ip->i_mmaplock)->mr_lock); > unlock((complete)&ret.event); > > *** DEADLOCK *** The warning is a false positive, caused by the fact that all wait_for_completion()s in submit_bio_wait() are waiting with the same lock class. However, some bios have nothing to do with others, for example in the case of loop devices, there's no direct connection between the bios of an upper device and the bios of a lower device(=loop device). The safest way to assign different lock classes to different devices is to do it for each gendisk. In other words, this patch assigns a lockdep_map per gendisk and uses it when initializing completion in submit_bio_wait(). Analyzed-by: Dave Chinner Reported-by: Darrick J. 
Wong Signed-off-by: Byungchul Park Reviewed-by: Jens Axboe Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-10-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- block/bio.c | 2 +- block/genhd.c | 10 ++-------- include/linux/genhd.h | 22 ++++++++++++++++++++-- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/block/bio.c b/block/bio.c index 5e901bfc0576..cc60213e56d8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -935,7 +935,7 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK(done); + DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; diff --git a/block/genhd.c b/block/genhd.c index dd305c65ffb0..630c0da6cfcf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1354,13 +1354,7 @@ dev_t blk_lookup_devt(const char *name, int partno) } EXPORT_SYMBOL(blk_lookup_devt); -struct gendisk *alloc_disk(int minors) -{ - return alloc_disk_node(minors, NUMA_NO_NODE); -} -EXPORT_SYMBOL(alloc_disk); - -struct gendisk *alloc_disk_node(int minors, int node_id) +struct gendisk *__alloc_disk_node(int minors, int node_id) { struct gendisk *disk; struct disk_part_tbl *ptbl; @@ -1411,7 +1405,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } return disk; } -EXPORT_SYMBOL(alloc_disk_node); +EXPORT_SYMBOL(__alloc_disk_node); struct kobject *get_disk(struct gendisk *disk) { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd675..19d18710546a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -206,6 +206,7 @@ struct gendisk { #endif /* CONFIG_BLK_DEV_INTEGRITY */ int node_id; struct badblocks *bb; + struct lockdep_map lockdep_map; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) @@ -590,8 +591,7 @@ extern void __delete_partition(struct percpu_ref *); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); -extern struct gendisk *alloc_disk_node(int minors, int node_id); -extern struct gendisk *alloc_disk(int minors); +extern struct gendisk *__alloc_disk_node(int minors, int node_id); extern struct kobject *get_disk(struct gendisk *disk); extern void put_disk(struct gendisk *disk); extern void blk_register_region(dev_t devt, unsigned long range, @@ -615,6 +615,24 @@ extern ssize_t part_fail_store(struct device *dev, const char *buf, size_t count); #endif /* CONFIG_FAIL_MAKE_REQUEST */ +#define alloc_disk_node(minors, node_id) \ +({ \ + static struct lock_class_key __key; \ + const char *__name; \ + struct gendisk *__disk; \ + \ + __name = "(gendisk_completion)"#minors"("#node_id")"; \ + \ + __disk = __alloc_disk_node(minors, node_id); \ + \ + if (__disk) \ + lockdep_init_map(&__disk->lockdep_map, __name, &__key, 0); \ + \ + __disk; \ +}) + +#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) + static inline int hd_ref_init(struct hd_struct *part) { if (percpu_ref_init(&part->ref, __delete_partition, 0, From 3593eb944c65c7a0adfd679949e67f96d97d1768 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 10 Oct 2017 
11:52:02 +0200 Subject: [PATCH 0599/1085] s390/cpum_cf: add hardware counter support for IBM z14 Add the hardware counters that are available with z14. With z14, the number of problem-state counters is reduced. The initialization is updated respectively. Signed-off-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_cf_events.c | 278 +++++++++++++++++++------ 1 file changed, 218 insertions(+), 60 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index d3133285b7d1..2b938ef0babd 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -9,34 +9,42 @@ /* BEGIN: CPUM_CF COUNTER DEFINITIONS =================================== */ -CPUMF_EVENT_ATTR(cf, CPU_CYCLES, 0x0000); -CPUMF_EVENT_ATTR(cf, INSTRUCTIONS, 0x0001); -CPUMF_EVENT_ATTR(cf, L1I_DIR_WRITES, 0x0002); -CPUMF_EVENT_ATTR(cf, L1I_PENALTY_CYCLES, 0x0003); -CPUMF_EVENT_ATTR(cf, PROBLEM_STATE_CPU_CYCLES, 0x0020); -CPUMF_EVENT_ATTR(cf, PROBLEM_STATE_INSTRUCTIONS, 0x0021); -CPUMF_EVENT_ATTR(cf, PROBLEM_STATE_L1I_DIR_WRITES, 0x0022); -CPUMF_EVENT_ATTR(cf, PROBLEM_STATE_L1I_PENALTY_CYCLES, 0x0023); -CPUMF_EVENT_ATTR(cf, PROBLEM_STATE_L1D_DIR_WRITES, 0x0024); -CPUMF_EVENT_ATTR(cf, PROBLEM_STATE_L1D_PENALTY_CYCLES, 0x0025); -CPUMF_EVENT_ATTR(cf, L1D_DIR_WRITES, 0x0004); -CPUMF_EVENT_ATTR(cf, L1D_PENALTY_CYCLES, 0x0005); -CPUMF_EVENT_ATTR(cf, PRNG_FUNCTIONS, 0x0040); -CPUMF_EVENT_ATTR(cf, PRNG_CYCLES, 0x0041); -CPUMF_EVENT_ATTR(cf, PRNG_BLOCKED_FUNCTIONS, 0x0042); -CPUMF_EVENT_ATTR(cf, PRNG_BLOCKED_CYCLES, 0x0043); -CPUMF_EVENT_ATTR(cf, SHA_FUNCTIONS, 0x0044); -CPUMF_EVENT_ATTR(cf, SHA_CYCLES, 0x0045); -CPUMF_EVENT_ATTR(cf, SHA_BLOCKED_FUNCTIONS, 0x0046); -CPUMF_EVENT_ATTR(cf, SHA_BLOCKED_CYCLES, 0x0047); -CPUMF_EVENT_ATTR(cf, DEA_FUNCTIONS, 0x0048); -CPUMF_EVENT_ATTR(cf, DEA_CYCLES, 0x0049); -CPUMF_EVENT_ATTR(cf, DEA_BLOCKED_FUNCTIONS, 0x004a); -CPUMF_EVENT_ATTR(cf, DEA_BLOCKED_CYCLES, 0x004b); -CPUMF_EVENT_ATTR(cf, AES_FUNCTIONS, 0x004c); -CPUMF_EVENT_ATTR(cf, AES_CYCLES, 0x004d); -CPUMF_EVENT_ATTR(cf, AES_BLOCKED_FUNCTIONS, 0x004e); -CPUMF_EVENT_ATTR(cf, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_fvn1, CPU_CYCLES, 0x0000); +CPUMF_EVENT_ATTR(cf_fvn1, INSTRUCTIONS, 0x0001); +CPUMF_EVENT_ATTR(cf_fvn1, L1I_DIR_WRITES, 0x0002); +CPUMF_EVENT_ATTR(cf_fvn1, L1I_PENALTY_CYCLES, 0x0003); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_CPU_CYCLES, 0x0020); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_INSTRUCTIONS, 0x0021); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1I_DIR_WRITES, 0x0022); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1I_PENALTY_CYCLES, 0x0023); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1D_DIR_WRITES, 0x0024); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1D_PENALTY_CYCLES, 0x0025); +CPUMF_EVENT_ATTR(cf_fvn1, L1D_DIR_WRITES, 0x0004); +CPUMF_EVENT_ATTR(cf_fvn1, L1D_PENALTY_CYCLES, 0x0005); +CPUMF_EVENT_ATTR(cf_fvn3, CPU_CYCLES, 0x0000); +CPUMF_EVENT_ATTR(cf_fvn3, INSTRUCTIONS, 0x0001); +CPUMF_EVENT_ATTR(cf_fvn3, L1I_DIR_WRITES, 0x0002); +CPUMF_EVENT_ATTR(cf_fvn3, L1I_PENALTY_CYCLES, 0x0003); +CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES, 0x0020); +CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS, 0x0021); +CPUMF_EVENT_ATTR(cf_fvn3, L1D_DIR_WRITES, 0x0004); +CPUMF_EVENT_ATTR(cf_fvn3, L1D_PENALTY_CYCLES, 0x0005); +CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_FUNCTIONS, 0x0040); +CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_CYCLES, 0x0041); +CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS, 0x0042); 
+CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_CYCLES, 0x0043); +CPUMF_EVENT_ATTR(cf_svn_generic, SHA_FUNCTIONS, 0x0044); +CPUMF_EVENT_ATTR(cf_svn_generic, SHA_CYCLES, 0x0045); +CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS, 0x0046); +CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_CYCLES, 0x0047); +CPUMF_EVENT_ATTR(cf_svn_generic, DEA_FUNCTIONS, 0x0048); +CPUMF_EVENT_ATTR(cf_svn_generic, DEA_CYCLES, 0x0049); +CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS, 0x004a); +CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_CYCLES, 0x004b); +CPUMF_EVENT_ATTR(cf_svn_generic, AES_FUNCTIONS, 0x004c); +CPUMF_EVENT_ATTR(cf_svn_generic, AES_CYCLES, 0x004d); +CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS, 0x004e); +CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_CYCLES, 0x004f); CPUMF_EVENT_ATTR(cf_z10, L1I_L2_SOURCED_WRITES, 0x0080); CPUMF_EVENT_ATTR(cf_z10, L1D_L2_SOURCED_WRITES, 0x0081); CPUMF_EVENT_ATTR(cf_z10, L1I_L3_LOCAL_WRITES, 0x0082); @@ -170,36 +178,105 @@ CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_NO_SPECIAL, 0x00db); CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_SPECIAL, 0x00dc); CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z14, L1D_WRITES_RO_EXCL, 0x0080); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z14, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z14, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z14, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z14, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z14, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z14, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z14, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z14, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z14, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8); 
+CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z14, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z14, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z14, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z14, LAST_HOST_TRANSLATIONS, 0x00e9); +CPUMF_EVENT_ATTR(cf_z14, TX_NC_TABORT, 0x00f3); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_NO_SPECIAL, 0x00f4); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); -static struct attribute *cpumcf_pmu_event_attr[] __initdata = { - CPUMF_EVENT_PTR(cf, CPU_CYCLES), - CPUMF_EVENT_PTR(cf, INSTRUCTIONS), - CPUMF_EVENT_PTR(cf, L1I_DIR_WRITES), - CPUMF_EVENT_PTR(cf, L1I_PENALTY_CYCLES), - CPUMF_EVENT_PTR(cf, PROBLEM_STATE_CPU_CYCLES), - CPUMF_EVENT_PTR(cf, PROBLEM_STATE_INSTRUCTIONS), - CPUMF_EVENT_PTR(cf, PROBLEM_STATE_L1I_DIR_WRITES), - CPUMF_EVENT_PTR(cf, PROBLEM_STATE_L1I_PENALTY_CYCLES), - CPUMF_EVENT_PTR(cf, PROBLEM_STATE_L1D_DIR_WRITES), - CPUMF_EVENT_PTR(cf, PROBLEM_STATE_L1D_PENALTY_CYCLES), - CPUMF_EVENT_PTR(cf, L1D_DIR_WRITES), - CPUMF_EVENT_PTR(cf, L1D_PENALTY_CYCLES), - CPUMF_EVENT_PTR(cf, PRNG_FUNCTIONS), - CPUMF_EVENT_PTR(cf, PRNG_CYCLES), - CPUMF_EVENT_PTR(cf, PRNG_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf, PRNG_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf, SHA_FUNCTIONS), - CPUMF_EVENT_PTR(cf, SHA_CYCLES), - CPUMF_EVENT_PTR(cf, SHA_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf, SHA_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf, DEA_FUNCTIONS), - CPUMF_EVENT_PTR(cf, DEA_CYCLES), - CPUMF_EVENT_PTR(cf, DEA_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf, DEA_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf, AES_FUNCTIONS), - CPUMF_EVENT_PTR(cf, AES_CYCLES), - CPUMF_EVENT_PTR(cf, AES_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf, AES_BLOCKED_CYCLES), +static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn1, L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1D_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, L1D_PENALTY_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_fvn3_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_fvn3, CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn3, L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn3, L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn3, L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn3, L1D_PENALTY_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_svn_generic_pmu_event_attr[] __initdata = { + 
CPUMF_EVENT_PTR(cf_svn_generic, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_CYCLES), NULL, }; @@ -352,6 +429,63 @@ static struct attribute *cpumcf_z13_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z14, L1D_WRITES_RO_EXCL), + CPUMF_EVENT_PTR(cf_z14, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z14, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z14, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z14, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z14, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_RO), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z14, 
VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z14, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z14, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z14, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z14, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z14, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -378,7 +512,8 @@ static const struct attribute_group *cpumcf_pmu_attr_groups[] = { static __init struct attribute **merge_attr(struct attribute **a, - struct attribute **b) + struct attribute **b, + struct attribute **c) { struct attribute **new; int j, i; @@ -387,6 +522,8 @@ static __init struct attribute **merge_attr(struct attribute **a, ; for (i = 0; b[i]; i++) j++; + for (i = 0; c[i]; i++) + j++; j++; new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); @@ -397,6 +534,8 @@ static __init struct attribute **merge_attr(struct attribute **a, new[j++] = a[i]; for (i = 0; b[i]; i++) new[j++] = b[i]; + for (i = 0; c[i]; i++) + new[j++] = c[i]; new[j] = NULL; return new; @@ -404,10 +543,26 @@ static __init struct attribute **merge_attr(struct attribute **a, __init const struct attribute_group **cpumf_cf_event_group(void) { - struct attribute **combined, **model; + struct attribute **combined, **model, **cfvn, **csvn; struct attribute *none[] = { NULL }; + struct cpumf_ctr_info ci; struct cpuid cpu_id; + /* Determine generic counters set(s) */ + qctri(&ci); + switch (ci.cfvn) { + case 1: + cfvn = cpumcf_fvn1_pmu_event_attr; + break; + case 3: + cfvn = cpumcf_fvn3_pmu_event_attr; + break; + default: + cfvn = none; + } + csvn = cpumcf_svn_generic_pmu_event_attr; + + /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); switch (cpu_id.machine) { case 0x2097: @@ -426,12 +581,15 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 0x2965: model = cpumcf_z13_pmu_event_attr; break; + case 0x3906: + model = cpumcf_z14_pmu_event_attr; + break; default: model = none; break; } - combined = merge_attr(cpumcf_pmu_event_attr, model); + combined = merge_attr(cfvn, csvn, model); if (combined) cpumcf_pmu_events_group.attrs = combined; return cpumcf_pmu_attr_groups; From 54b933c6c954a8b7b0c2b40a1c4d3f7279d11e22 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Wed, 25 Oct 2017 19:28:27 +0800 Subject: [PATCH 0600/1085] sched/idle: Micro-optimize the idle loop Move the loop-invariant calculation of 'cpu' in do_idle() out of the loop body, because the current CPU does not change while the loop runs.
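As a rough illustration of the hoisting, here is a self-contained userspace sketch (not the kernel code itself; current_cpu() is a made-up stand-in for smp_processor_id()):

	#include <stdio.h>

	static int current_cpu(void)		/* stand-in for smp_processor_id() */
	{
		return 3;			/* assume the task stays on CPU 3 */
	}

	int main(void)
	{
		int i, cpu = current_cpu();	/* hoisted: evaluated once, before the loop */

		for (i = 0; i < 4; i++)		/* models the idle loop body */
			printf("iteration %d on CPU %d\n", i, cpu);
		return 0;
	}

The call is loop-invariant, so reading it once before the loop removes a per-iteration percpu load, as the disassembly below shows.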
This improves the generated code both on x86-64 and ARM64: x86-64: Before patch (execution in loop): 864: 0f ae e8 lfence 867: 65 8b 05 c2 38 f1 7e mov %gs:0x7ef138c2(%rip),%eax 86e: 89 c0 mov %eax,%eax 870: 48 0f a3 05 68 19 08 bt %rax,0x1081968(%rip) 877: 01 After patch (execution in loop): 872: 0f ae e8 lfence 875: 4c 0f a3 25 63 19 08 bt %r12,0x1081963(%rip) 87c: 01 ARM64: Before patch (execution in loop): c58: d5033d9f dsb ld c5c: d538d080 mrs x0, tpidr_el1 c60: b8606a61 ldr w1, [x19,x0] c64: 1100fc20 add w0, w1, #0x3f c68: 7100003f cmp w1, #0x0 c6c: 1a81b000 csel w0, w0, w1, lt c70: 13067c00 asr w0, w0, #6 c74: 93407c00 sxtw x0, w0 c78: f8607a80 ldr x0, [x20,x0,lsl #3] c7c: 9ac12401 lsr x1, x0, x1 c80: 36000581 tbz w1, #0, d30 After patch (execution in loop): c84: d5033d9f dsb ld c88: f9400260 ldr x0, [x19] c8c: ea14001f tst x0, x20 c90: 54000580 b.eq d40 Signed-off-by: Cheng Jian [ Rewrote the title and the changelog. ] Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: huawei.libin@huawei.com Cc: xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1508930907-107755-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b2e8f0afbb42..7dae9eb8c042 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -209,6 +209,7 @@ exit_idle: */ static void do_idle(void) { + int cpu = smp_processor_id(); /* * If the arch has a polling bit, we maintain an invariant: * @@ -225,7 +226,7 @@ static void do_idle(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } From 0462d9921e3dfcb022600e4db2d67e6ceab57769 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:47:48 +0300 Subject: [PATCH 0601/1085] vmur: convert urdev.ref_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - the counter scheme uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to the newly provided refcount_t type and API, which prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situations and be exploitable. The variable urdev.ref_count is used as a pure reference counter. Convert it to refcount_t and fix up the operations.
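As a generic sketch of the conversion pattern (kernel-style code with invented names; struct obj and the obj_*() helpers are illustrative, not part of this patch):

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct obj {
		refcount_t ref;				/* was: atomic_t ref */
	};

	static struct obj *obj_alloc(void)
	{
		struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

		if (o)
			refcount_set(&o->ref, 1);	/* holds the initial reference */
		return o;
	}

	static void obj_get(struct obj *o)
	{
		refcount_inc(&o->ref);			/* WARNs and saturates on overflow */
	}

	static void obj_put(struct obj *o)
	{
		if (refcount_dec_and_test(&o->ref))	/* the last put frees the object */
			kfree(o);
	}

Unlike atomic_inc(), refcount_inc() saturates the counter and warns instead of wrapping around, which turns a potential use-after-free into a detectable, and much safer, memory leak.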
Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Martin Schwidefsky --- drivers/s390/char/vmur.c | 8 ++++---- drivers/s390/char/vmur.h | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index a50cf930a623..fa90ef05afc0 100644 --- a/drivers/s390/char/vmur.c +++ b/drivers/s390/char/vmur.c @@ -110,7 +110,7 @@ static struct urdev *urdev_alloc(struct ccw_device *cdev) mutex_init(&urd->io_mutex); init_waitqueue_head(&urd->wait); spin_lock_init(&urd->open_lock); - atomic_set(&urd->ref_count, 1); + refcount_set(&urd->ref_count, 1); urd->cdev = cdev; get_device(&cdev->dev); return urd; @@ -126,7 +126,7 @@ static void urdev_free(struct urdev *urd) static void urdev_get(struct urdev *urd) { - atomic_inc(&urd->ref_count); + refcount_inc(&urd->ref_count); } static struct urdev *urdev_get_from_cdev(struct ccw_device *cdev) @@ -159,7 +159,7 @@ static struct urdev *urdev_get_from_devno(u16 devno) static void urdev_put(struct urdev *urd) { - if (atomic_dec_and_test(&urd->ref_count)) + if (refcount_dec_and_test(&urd->ref_count)) urdev_free(urd); } @@ -945,7 +945,7 @@ static int ur_set_offline_force(struct ccw_device *cdev, int force) rc = -EBUSY; goto fail_urdev_put; } - if (!force && (atomic_read(&urd->ref_count) > 2)) { + if (!force && (refcount_read(&urd->ref_count) > 2)) { /* There is still a user of urd (e.g. ur_open) */ TRACE("ur_set_offline: BUSY\n"); rc = -EBUSY; diff --git a/drivers/s390/char/vmur.h b/drivers/s390/char/vmur.h index fa320ad4593d..35ea9d11d121 100644 --- a/drivers/s390/char/vmur.h +++ b/drivers/s390/char/vmur.h @@ -11,6 +11,8 @@ #ifndef _VMUR_H_ #define _VMUR_H_ +#include <linux/refcount.h> + #define DEV_CLASS_UR_I 0x20 /* diag210 unit record input device class */ #define DEV_CLASS_UR_O 0x10 /* diag210 unit record output device class */ /* @@ -69,7 +71,7 @@ struct urdev { size_t reclen; /* Record length for *write* CCWs */ int class; /* VM device class */ int io_request_rc; /* return code from I/O request */ - atomic_t ref_count; /* reference counter */ + refcount_t ref_count; /* reference counter */ wait_queue_head_t wait; /* wait queue to serialize open */ int open_flag; /* "urdev is open" flag */ spinlock_t open_lock; /* serialize critical sections */ From 26284ca918e4e271bae4e96129fc3791b9bf0983 Mon Sep 17 00:00:00 2001 From: Marco Franchi Date: Wed, 25 Oct 2017 16:57:13 -0200 Subject: [PATCH 0602/1085] regulator: pfuze100: Remove leading zero from '@08' notation Improve the binding example by removing the leading 0 from '@08' notation, which fixes the following build warnings: Warning (unit_address_format): Node /pfuze100@08 unit name should not have leading 0s Warning (unit_address_format): Node /pfuze200@08 unit name should not have leading 0s Warning (unit_address_format): Node /pfuze3000@08 unit name should not have leading 0s Signed-off-by: Marco Franchi Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/regulator/pfuze100.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/regulator/pfuze100.txt b/Documentation/devicetree/bindings/regulator/pfuze100.txt index 444c47831a40..c6dd3f5e485b 100644 --- a/Documentation/devicetree/bindings/regulator/pfuze100.txt +++ b/Documentation/devicetree/bindings/regulator/pfuze100.txt @@ -21,7 +21,7 @@ Each regulator is defined using the standard binding for regulators.
Example 1: PFUZE100 - pmic: pfuze100@08 { + pmic: pfuze100@8 { compatible = "fsl,pfuze100"; reg = <0x08>; @@ -122,7 +122,7 @@ Example 1: PFUZE100 Example 2: PFUZE200 - pmic: pfuze200@08 { + pmic: pfuze200@8 { compatible = "fsl,pfuze200"; reg = <0x08>; @@ -216,7 +216,7 @@ Example 2: PFUZE200 Example 3: PFUZE3000 - pmic: pfuze3000@08 { + pmic: pfuze3000@8 { compatible = "fsl,pfuze3000"; reg = <0x08>; From b82a5e4ea8f6de3cae47fc6223b1372fcf8f1081 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 26 Oct 2017 15:36:29 +0200 Subject: [PATCH 0603/1085] s390/decompressor: remove informational messages The decompressor for bzImage prints two informational messages which are not really helpful. The decompression step is fast and if something bad happens an error message will be printed. Remove the noise. Signed-off-by: Martin Schwidefsky --- arch/s390/boot/compressed/misc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/boot/compressed/misc.c b/arch/s390/boot/compressed/misc.c index 33ca29333e18..886417992e3c 100644 --- a/arch/s390/boot/compressed/misc.c +++ b/arch/s390/boot/compressed/misc.c @@ -169,9 +169,7 @@ unsigned long decompress_kernel(void) free_mem_ptr = (unsigned long) &_end; free_mem_end_ptr = free_mem_ptr + HEAP_SIZE; - puts("Uncompressing Linux... "); __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error); - puts("Ok, booting the kernel.\n"); return (unsigned long) output; } From e28aa8aeab433b62e85a2da8d9bff2ba81c2ea4e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 26 Oct 2017 20:12:07 -0700 Subject: [PATCH 0604/1085] MAINTAINERS: remove David Safford as maintainer for encrypted+trusted keys Emails to David's listed email address bounce, and in the commit log there's no activity from him within the last 5 years. Cc: Mimi Zohar Cc: David Safford Signed-off-by: Eric Biggers Signed-off-by: James Morris --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6671f375f7fc..2c43cfabd438 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7625,7 +7625,6 @@ F: kernel/kexec* KEYS-ENCRYPTED M: Mimi Zohar -M: David Safford L: linux-security-module@vger.kernel.org L: keyrings@vger.kernel.org S: Supported @@ -7634,7 +7633,6 @@ F: include/keys/encrypted-type.h F: security/keys/encrypted-keys/ KEYS-TRUSTED -M: David Safford M: Mimi Zohar L: linux-security-module@vger.kernel.org L: keyrings@vger.kernel.org From 7863406143d8bbbbda07a61285c5f4c217908dfd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:28 +0200 Subject: [PATCH 0605/1085] sched/isolation: Move housekeeping related code to its own file The housekeeping code is currently tied to the NOHZ code. As we are planning to make housekeeping independent from it, start with moving the relevant code to its own file. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Paul E. 
McKenney Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- drivers/net/ethernet/tile/tilegx.c | 2 +- include/linux/sched/isolation.h | 56 ++++++++++++++++++++++++++++++ include/linux/tick.h | 37 -------------------- init/main.c | 2 ++ kernel/rcu/tree_plugin.h | 1 + kernel/rcu/update.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/isolation.c | 33 ++++++++++++++++++ kernel/time/tick-sched.c | 18 ---------- kernel/watchdog.c | 1 + 12 files changed, 98 insertions(+), 56 deletions(-) create mode 100644 include/linux/sched/isolation.h create mode 100644 kernel/sched/isolation.c diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index c00102b8145a..27a3272dcd48 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h new file mode 100644 index 000000000000..b7cfbc46286c --- /dev/null +++ b/include/linux/sched/isolation.h @@ -0,0 +1,56 @@ +#ifndef _LINUX_SCHED_ISOLATION_H +#define _LINUX_SCHED_ISOLATION_H + +#include +#include +#include + +#ifdef CONFIG_NO_HZ_FULL +extern cpumask_var_t housekeeping_mask; + +static inline int housekeeping_any_cpu(void) +{ + return cpumask_any_and(housekeeping_mask, cpu_online_mask); +} + +extern void __init housekeeping_init(void); + +#else + +static inline int housekeeping_any_cpu(void) +{ + return smp_processor_id(); +} + +static inline void housekeeping_init(void) { } +#endif /* CONFIG_NO_HZ_FULL */ + + +static inline const struct cpumask *housekeeping_cpumask(void) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return housekeeping_mask; +#endif + return cpu_possible_mask; +} + +static inline bool is_housekeeping_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return cpumask_test_cpu(cpu, housekeeping_mask); +#endif + return true; +} + +static inline void housekeeping_affine(struct task_struct *t) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + set_cpus_allowed_ptr(t, housekeeping_mask); + +#endif +} + +#endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/tick.h b/include/linux/tick.h index fe01e68bf520..68afc09aa8ac 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -137,7 +137,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; extern cpumask_var_t tick_nohz_full_mask; -extern cpumask_var_t housekeeping_mask; static inline bool tick_nohz_full_enabled(void) { @@ -161,11 +160,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) cpumask_or(mask, mask, tick_nohz_full_mask); } -static inline int housekeeping_any_cpu(void) -{ - return cpumask_any_and(housekeeping_mask, cpu_online_mask); -} - extern void tick_nohz_dep_set(enum tick_dep_bits bit); extern void tick_nohz_dep_clear(enum tick_dep_bits bit); extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); @@ -235,10 +229,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, extern void tick_nohz_full_kick_cpu(int cpu); extern void __tick_nohz_task_switch(void); #else -static inline int 
housekeeping_any_cpu(void) -{ - return smp_processor_id(); -} static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } @@ -260,33 +250,6 @@ static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void __tick_nohz_task_switch(void) { } #endif -static inline const struct cpumask *housekeeping_cpumask(void) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return housekeeping_mask; -#endif - return cpu_possible_mask; -} - -static inline bool is_housekeeping_cpu(int cpu) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return cpumask_test_cpu(cpu, housekeeping_mask); -#endif - return true; -} - -static inline void housekeeping_affine(struct task_struct *t) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - set_cpus_allowed_ptr(t, housekeeping_mask); - -#endif -} - static inline void tick_nohz_task_switch(void) { if (tick_nohz_full_enabled()) diff --git a/init/main.c b/init/main.c index 0ee9c6866ada..4610c99ae306 100644 --- a/init/main.c +++ b/init/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void) early_irq_init(); init_IRQ(); tick_init(); + housekeeping_init(); rcu_init_nohz(); init_timers(); hrtimers_init(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..c7632f51c5a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "../time/tick-internal.h" diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..79abeb0ca329 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,6 +51,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 78f54932ea1d..871d43d73375 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o +obj-$(CONFIG_NO_HZ_FULL) += isolation.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2288a145bf01..ad188acb7636 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56f343b8e749..591481db8c6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,6 +32,7 @@ #include #include #include +#include #include diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c new file mode 100644 index 000000000000..3589252ed476 --- /dev/null +++ b/kernel/sched/isolation.c @@ -0,0 +1,33 @@ +/* + * Housekeeping management. Manage the targets for routine code that can run on + * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 
+ * + * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * + */ + +#include +#include +#include +#include + +cpumask_var_t housekeeping_mask; + +void __init housekeeping_init(void) +{ + if (!tick_nohz_full_enabled()) + return; + + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); +} diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7b258c59d78a..27d7d522ac4e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -166,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; -cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static atomic_t tick_dep_mask; @@ -438,13 +437,6 @@ void __init tick_nohz_init(void) return; } - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); - cpumask_clear(tick_nohz_full_mask); - tick_nohz_full_running = false; - return; - } - /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own @@ -453,7 +445,6 @@ void __init tick_nohz_init(void) if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); - cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; return; } @@ -466,9 +457,6 @@ void __init tick_nohz_init(void) cpumask_clear_cpu(cpu, tick_nohz_full_mask); } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); - for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); @@ -478,12 +466,6 @@ void __init tick_nohz_init(void) WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); - - /* - * We need at least one CPU to handle housekeeping work such - * as timekeeping, unbound timers, workqueues, ... - */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6bcb854909c0..3c44dbaa4b9f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include From 13316b31fdaaa45f06793eb7992588359ba6ab9f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:29 +0200 Subject: [PATCH 0606/1085] sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version While trying to disable the watchdog on nohz_full CPUs, the watchdog implements an ad-hoc version of housekeeping_cpumask(). Let's replace those re-invented lines. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-3-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3c44dbaa4b9f..562652c9c815 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -774,15 +774,10 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, void __init lockup_detector_init(void) { -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) { + if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_copy(&watchdog_cpumask, housekeeping_mask); - } else - cpumask_copy(&watchdog_cpumask, cpu_possible_mask); -#else - cpumask_copy(&watchdog_cpumask, cpu_possible_mask); -#endif + + cpumask_copy(&watchdog_cpumask, housekeeping_cpumask()); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; From 9f0ca2d97ef0b5e966be2cfef26c7c094ec14e41 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:30 +0200 Subject: [PATCH 0607/1085] sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu() housekeeping_any_cpu() doesn't handle correctly the case where CONFIG_NO_HZ_FULL=y and no CPU is in nohz_full mode. So far no caller needs this but let's prepare to avoid any future surprise. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-4-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index b7cfbc46286c..040df04fa78a 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -7,24 +7,19 @@ #ifdef CONFIG_NO_HZ_FULL extern cpumask_var_t housekeeping_mask; - -static inline int housekeeping_any_cpu(void) -{ - return cpumask_any_and(housekeeping_mask, cpu_online_mask); -} - extern void __init housekeeping_init(void); - #else - -static inline int housekeeping_any_cpu(void) -{ - return smp_processor_id(); -} - static inline void housekeeping_init(void) { } #endif /* CONFIG_NO_HZ_FULL */ +static inline int housekeeping_any_cpu(void) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); +#endif + return smp_processor_id(); +} static inline const struct cpumask *housekeeping_cpumask(void) { From 7e56a1cf4b28f5739526877b8dbad623fae2e4e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:31 +0200 Subject: [PATCH 0608/1085] sched/isolation: Make the housekeeping cpumask private Nobody needs to access this detail. housekeeping_cpumask() already takes care of it. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-5-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 31 +++++++++------------------- kernel/sched/isolation.c | 36 ++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 040df04fa78a..ed935ffc6ffa 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -6,46 +6,35 @@ #include #ifdef CONFIG_NO_HZ_FULL -extern cpumask_var_t housekeeping_mask; +extern int housekeeping_any_cpu(void); +extern const struct cpumask *housekeeping_cpumask(void); +extern void housekeeping_affine(struct task_struct *t); +extern bool housekeeping_test_cpu(int cpu); extern void __init housekeeping_init(void); + #else -static inline void housekeeping_init(void) { } -#endif /* CONFIG_NO_HZ_FULL */ static inline int housekeeping_any_cpu(void) { -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return cpumask_any_and(housekeeping_mask, cpu_online_mask); -#endif return smp_processor_id(); } static inline const struct cpumask *housekeeping_cpumask(void) { -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return housekeeping_mask; -#endif return cpu_possible_mask; } +static inline void housekeeping_affine(struct task_struct *t) { } +static inline void housekeeping_init(void) { } +#endif /* CONFIG_NO_HZ_FULL */ + static inline bool is_housekeeping_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) - return cpumask_test_cpu(cpu, housekeeping_mask); + return housekeeping_test_cpu(cpu); #endif return true; } -static inline void housekeeping_affine(struct task_struct *t) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - set_cpus_allowed_ptr(t, housekeeping_mask); - -#endif -} - #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 3589252ed476..16445097eb25 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -11,7 +11,41 @@ #include #include -cpumask_var_t housekeeping_mask; +static cpumask_var_t housekeeping_mask; + +int housekeeping_any_cpu(void) +{ + if (tick_nohz_full_enabled()) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); + + return smp_processor_id(); +} +EXPORT_SYMBOL_GPL(housekeeping_any_cpu); + +const struct cpumask *housekeeping_cpumask(void) +{ + if (tick_nohz_full_enabled()) + return housekeeping_mask; + + return cpu_possible_mask; +} +EXPORT_SYMBOL_GPL(housekeeping_cpumask); + +void housekeeping_affine(struct task_struct *t) +{ + if (tick_nohz_full_enabled()) + set_cpus_allowed_ptr(t, housekeeping_mask); +} +EXPORT_SYMBOL_GPL(housekeeping_affine); + +bool housekeeping_test_cpu(int cpu) +{ + if (tick_nohz_full_enabled()) + return cpumask_test_cpu(cpu, housekeeping_mask); + + return true; +} +EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { From e179f5a04ba46ee5c5439480c2bfd68c358168b7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:32 +0200 Subject: [PATCH 0609/1085] sched/isolation: Use its own static key Housekeeping code still depends on the nohz_full static key. Since we want to decouple housekeeping from NOHZ, let's create a housekeeping specific static key. It's mostly relevant for calls to is_housekeeping_cpu() from the scheduler. 
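As a minimal sketch of the static-key pattern being introduced (hypothetical names, not the patch itself):

	#include <linux/jump_label.h>
	#include <linux/init.h>

	DEFINE_STATIC_KEY_FALSE(my_feature_key);	/* default: disabled */

	static bool my_slow_check(int cpu)		/* hypothetical slow path */
	{
		return cpu != 0;
	}

	static bool my_fast_path(int cpu)
	{
		/* Compiled as a NOP until the key is enabled, so the
		 * default (disabled) case pays no conditional branch. */
		if (static_branch_unlikely(&my_feature_key))
			return my_slow_check(cpu);
		return true;
	}

	static int __init my_feature_setup(void)
	{
		static_branch_enable(&my_feature_key);	/* patches the branch in at runtime */
		return 0;
	}
	early_initcall(my_feature_setup);

Because static_branch_unlikely() relies on runtime code patching rather than a memory load and compare, a dedicated housekeeping key keeps the disabled case free on hot scheduler paths.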
Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-6-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 3 ++- kernel/sched/isolation.c | 13 +++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index ed935ffc6ffa..194c586fbb12 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -6,6 +6,7 @@ #include #ifdef CONFIG_NO_HZ_FULL +DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); extern int housekeeping_any_cpu(void); extern const struct cpumask *housekeeping_cpumask(void); extern void housekeeping_affine(struct task_struct *t); @@ -31,7 +32,7 @@ static inline void housekeeping_init(void) { } static inline bool is_housekeeping_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return housekeeping_test_cpu(cpu); #endif return true; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 16445097eb25..bb8ba19a0235 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -10,12 +10,15 @@ #include #include #include +#include +DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); +EXPORT_SYMBOL_GPL(housekeeping_overriden); static cpumask_var_t housekeeping_mask; int housekeeping_any_cpu(void) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); @@ -24,7 +27,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu); const struct cpumask *housekeeping_cpumask(void) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return housekeeping_mask; return cpu_possible_mask; @@ -33,14 +36,14 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask); void housekeeping_affine(struct task_struct *t) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) set_cpus_allowed_ptr(t, housekeeping_mask); } EXPORT_SYMBOL_GPL(housekeeping_affine); bool housekeeping_test_cpu(int cpu) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return cpumask_test_cpu(cpu, housekeeping_mask); return true; @@ -62,6 +65,8 @@ void __init housekeeping_init(void) cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask); + static_branch_enable(&housekeeping_overriden); + /* We need at least one CPU to handle housekeeping work */ WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } From 204c083a009378dfa751175b5fcddc75988bab6c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:33 +0200 Subject: [PATCH 0610/1085] sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu() Fit it into the housekeeping_*() namespace. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-7-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 2 +- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 194c586fbb12..ad0f5d986a2e 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -29,7 +29,7 @@ static inline void housekeeping_affine(struct task_struct *t) { } static inline void housekeeping_init(void) { } #endif /* CONFIG_NO_HZ_FULL */ -static inline bool is_housekeeping_cpu(int cpu) +static inline bool housekeeping_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL if (static_branch_unlikely(&housekeeping_overriden)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad188acb7636..d0fb448dd43a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -527,7 +527,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu)) return cpu; rcu_read_lock(); @@ -536,14 +536,14 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && is_housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i)) { cpu = i; goto unlock; } } } - if (!is_housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu)) cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); return cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 591481db8c6a..cdece8f967f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!is_housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) From 5c4991e24c69737bd41fc2737b1e3980abbf73f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:34 +0200 Subject: [PATCH 0611/1085] sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL Split the housekeeping config from CONFIG_NO_HZ_FULL. This way we finally separate the isolation code from NOHZ. A dependency on CONFIG_NO_HZ_FULL remains for now, though, while the housekeeping code still deals with NOHZ internals. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-8-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 6 +++--- init/Kconfig | 8 ++++++++ kernel/sched/Makefile | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index ad0f5d986a2e..93ac2367a520 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -5,7 +5,7 @@ #include #include -#ifdef CONFIG_NO_HZ_FULL +#ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); extern int housekeeping_any_cpu(void); extern const struct cpumask *housekeeping_cpumask(void); @@ -27,11 +27,11 @@ static inline const struct cpumask *housekeeping_cpumask(void) static inline void housekeeping_affine(struct task_struct *t) { } static inline void housekeeping_init(void) { } -#endif /* CONFIG_NO_HZ_FULL */ +#endif /* CONFIG_CPU_ISOLATION */ static inline bool housekeeping_cpu(int cpu) { -#ifdef CONFIG_NO_HZ_FULL +#ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overriden)) return housekeeping_test_cpu(cpu); #endif diff --git a/init/Kconfig b/init/Kconfig index 78cb2461012e..6f52e6f4bd9d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -472,6 +472,14 @@ config TASK_IO_ACCOUNTING endmenu # "CPU/Task time and stats accounting" +config CPU_ISOLATION + bool "CPU isolation" + depends on NO_HZ_FULL + help + Make sure that CPUs running critical tasks are not disturbed by + any source of "noise" such as unbound workqueues, timers, kthreads... + Unbound jobs get offloaded to housekeeping CPUs. + source "kernel/rcu/Kconfig" config BUILD_BIN2C diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 871d43d73375..dbe61302c352 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -26,4 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o -obj-$(CONFIG_NO_HZ_FULL) += isolation.o +obj-$(CONFIG_CPU_ISOLATION) += isolation.o From de201559df872f83d0c08fb4effe3efd28e6cbc8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:35 +0200 Subject: [PATCH 0612/1085] sched/isolation: Introduce housekeeping flags Before we implement isolcpus under housekeeping, we need the isolation features to be more fine-grained. For example, some people want NOHZ_FULL without the full scheduler isolation; others want full scheduler isolation without NOHZ_FULL. So let's cut all these isolation features piecewise, at the risk of overcutting it right now. We can still merge some flags later if they always make sense together. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-9-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- drivers/net/ethernet/tile/tilegx.c | 4 ++-- include/linux/sched/isolation.h | 26 +++++++++++++++++--------- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 2 +- kernel/sched/core.c | 8 ++++---- kernel/sched/fair.c | 2 +- kernel/sched/isolation.c | 26 +++++++++++++++----------- kernel/watchdog.c | 3 ++- 8 files changed, 43 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index 27a3272dcd48..b3e5816a4678 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void) tile_net_dev_init(name, mac); if (!network_cpus_init()) - cpumask_and(&network_cpus_map, housekeeping_cpumask(), - cpu_online_mask); + cpumask_and(&network_cpus_map, + housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask); return 0; } diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 93ac2367a520..9bb753eece3b 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -5,35 +5,43 @@ #include #include +enum hk_flags { + HK_FLAG_TIMER = 1, + HK_FLAG_RCU = (1 << 1), + HK_FLAG_MISC = (1 << 2), + HK_FLAG_SCHED = (1 << 3), +}; + #ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); -extern int housekeeping_any_cpu(void); -extern const struct cpumask *housekeeping_cpumask(void); -extern void housekeeping_affine(struct task_struct *t); -extern bool housekeeping_test_cpu(int cpu); +extern int housekeeping_any_cpu(enum hk_flags flags); +extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); +extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); +extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); extern void __init housekeeping_init(void); #else -static inline int housekeeping_any_cpu(void) +static inline int housekeeping_any_cpu(enum hk_flags flags) { return smp_processor_id(); } -static inline const struct cpumask *housekeeping_cpumask(void) +static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { return cpu_possible_mask; } -static inline void housekeeping_affine(struct task_struct *t) { } +static inline void housekeeping_affine(struct task_struct *t, + enum hk_flags flags) { } static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ -static inline bool housekeeping_cpu(int cpu) +static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) { #ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overriden)) - return housekeeping_test_cpu(cpu); + return housekeeping_test_cpu(cpu, flags); #endif return true; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c7632f51c5a0..34125d23e58a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2584,7 +2584,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 79abeb0ca329..e3e60efaafee 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -719,7 +719,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) LIST_HEAD(rcu_tasks_holdouts); /* Run on housekeeping CPUs by default. 
Sysadm can move if desired. */ - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); /* * Each pass through the following loop makes one check for diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d0fb448dd43a..2210c0203e51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -527,7 +527,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) return cpu; rcu_read_lock(); @@ -536,15 +536,15 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu)) - cpu = housekeeping_any_cpu(); + if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) + cpu = housekeeping_any_cpu(HK_FLAG_TIMER); unlock: rcu_read_unlock(); return cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cdece8f967f0..f755de86a813 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index bb8ba19a0235..37a138a780ff 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -15,37 +15,39 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); static cpumask_var_t housekeeping_mask; +static unsigned int housekeeping_flags; -int housekeeping_any_cpu(void) +int housekeeping_any_cpu(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return cpumask_any_and(housekeeping_mask, cpu_online_mask); - + if (housekeeping_flags & flags) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(void) +const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return housekeeping_mask; - + if (housekeeping_flags & flags) + return housekeeping_mask; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct task_struct *t) +void housekeeping_affine(struct task_struct *t, enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping_flags & flags) + set_cpus_allowed_ptr(t, housekeeping_mask); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu) +bool housekeeping_test_cpu(int cpu, enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return cpumask_test_cpu(cpu, housekeeping_mask); - + if (housekeeping_flags & flags) + return cpumask_test_cpu(cpu, housekeeping_mask); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); @@ -65,6 +67,8 @@ void __init housekeeping_init(void) cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask); + housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + static_branch_enable(&housekeeping_overriden); /* We need at least one CPU to handle housekeeping work */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 562652c9c815..e9e2ebb3db29 100644 --- a/kernel/watchdog.c +++ 
b/kernel/watchdog.c @@ -777,7 +777,8 @@ void __init lockup_detector_init(void) if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_copy(&watchdog_cpumask, housekeeping_cpumask()); + cpumask_copy(&watchdog_cpumask, + housekeeping_cpumask(HK_FLAG_TIMER)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; From 6f1982fedd59856bcc42a9b521be4c3ffd2f60a7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:36 +0200 Subject: [PATCH 0613/1085] sched/isolation: Handle the nohz_full= parameter We want to centralize the isolation management, done by the housekeeping subsystem. Therefore we need to handle the nohz_full= parameter from there. Since nohz_full= so far has involved unbound timers, watchdog, RCU and tilegx NAPI isolation, we keep that default behaviour. nohz_full= will be deprecated in the future. We want to control the isolation features from the isolcpus= parameter. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-10-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 1 + include/linux/tick.h | 2 ++ init/Kconfig | 1 - kernel/sched/isolation.c | 44 +++++++++++++++++++++++---------- kernel/time/tick-sched.c | 13 +++------- 5 files changed, 37 insertions(+), 24 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 9bb753eece3b..e53cfa96e91e 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -10,6 +10,7 @@ enum hk_flags { HK_FLAG_RCU = (1 << 1), HK_FLAG_MISC = (1 << 2), HK_FLAG_SCHED = (1 << 3), + HK_FLAG_TICK = (1 << 4), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/include/linux/tick.h b/include/linux/tick.h index 68afc09aa8ac..e2a163a9f96c 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -228,6 +228,7 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, extern void tick_nohz_full_kick_cpu(int cpu); extern void __tick_nohz_task_switch(void); +extern void __init tick_nohz_full_setup(cpumask_var_t cpumask); #else static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } @@ -248,6 +249,7 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void __tick_nohz_task_switch(void) { } +static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { } #endif static inline void tick_nohz_task_switch(void) diff --git a/init/Kconfig b/init/Kconfig index 6f52e6f4bd9d..f8564df58d0a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -474,7 +474,6 @@ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION bool "CPU isolation" - depends on NO_HZ_FULL help Make sure that CPUs running critical tasks are not disturbed by any source of "noise" such as unbound workqueues, timers, kthreads... 
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 37a138a780ff..1f61e440358d 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -54,23 +54,41 @@ EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { - if (!tick_nohz_full_enabled()) + if (!housekeeping_flags) return; - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); - cpumask_clear(tick_nohz_full_mask); - tick_nohz_full_running = false; - return; - } - - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); - - housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; - static_branch_enable(&housekeeping_overriden); /* We need at least one CPU to handle housekeeping work */ WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } + +#ifdef CONFIG_NO_HZ_FULL +static int __init housekeeping_nohz_full_setup(char *str) +{ + cpumask_var_t non_housekeeping_mask; + + alloc_bootmem_cpumask_var(&non_housekeeping_mask); + if (cpulist_parse(str, non_housekeeping_mask) < 0) { + pr_warn("Housekeeping: Incorrect nohz_full cpumask\n"); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + + alloc_bootmem_cpumask_var(&housekeeping_mask); + cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); + + if (cpumask_empty(housekeeping_mask)) + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + + housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER | + HK_FLAG_RCU | HK_FLAG_MISC; + + tick_nohz_full_setup(non_housekeeping_mask); + + free_bootmem_cpumask_var(non_housekeeping_mask); + + return 1; +} +__setup("nohz_full=", housekeeping_nohz_full_setup); +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 27d7d522ac4e..69f3dbe38984 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -385,20 +385,13 @@ out: local_irq_restore(flags); } -/* Parse the boot-time nohz CPU list from the kernel parameters. */ -static int __init tick_nohz_full_setup(char *str) +/* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); - if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); - free_bootmem_cpumask_var(tick_nohz_full_mask); - return 1; - } + cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; - - return 1; } -__setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { From edb9382175c3ebdced8ffdb3e0f20052ad9fdbe9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:37 +0200 Subject: [PATCH 0614/1085] sched/isolation: Move isolcpus= handling to the housekeeping code We want to centralize the isolation features in the housekeeping subsystem, and scheduler domain isolation is a significant part of it. No intended behaviour change, we just reuse the housekeeping cpumask and core code. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-11-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- drivers/base/cpu.c | 11 +++++- include/linux/sched.h | 2 -- include/linux/sched/isolation.h | 1 + kernel/cgroup/cpuset.c | 15 +++----- kernel/sched/core.c | 16 +-------- kernel/sched/isolation.c | 61 ++++++++++++++++++++++++++------- kernel/sched/topology.c | 24 ++++--------- 7 files changed, 72 insertions(+), 58 deletions(-) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 321cd7b4d817..a73ab95558f5 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "base.h" @@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev, struct device_attribute *attr, char *buf) { int n = 0, len = PAGE_SIZE-2; + cpumask_var_t isolated; - n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map)); + if (!alloc_cpumask_var(&isolated, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(isolated, cpu_possible_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); + n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated)); + + free_cpumask_var(isolated); return n; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f897dfc195e..1b0cc0d6df8d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -165,8 +165,6 @@ struct task_group; /* Task command name length: */ #define TASK_COMM_LEN 16 -extern cpumask_var_t cpu_isolated_map; - extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index e53cfa96e91e..d849431c8060 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -11,6 +11,7 @@ enum hk_flags { HK_FLAG_MISC = (1 << 2), HK_FLAG_SCHED = (1 << 3), HK_FLAG_TICK = (1 << 4), + HK_FLAG_DOMAIN = (1 << 5), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4657e2924ecb..f7efa7b4d825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -57,7 +57,7 @@ #include #include #include - +#include #include #include #include @@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. 
sched domains */ - cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; - if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) - goto done; - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - non_isolated_cpus); + housekeeping_cpumask(HK_FLAG_DOMAIN)); goto done; } @@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains, */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_FLAG_DOMAIN)))) continue; if (is_sched_load_balance(cp)) @@ -789,7 +785,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, non_isolated_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -802,7 +798,6 @@ restart: BUG_ON(nslot != ndoms); done: - free_cpumask_var(non_isolated_cpus); kfree(csa); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2210c0203e51..1a55c842bfbc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -84,9 +84,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* CPUs with isolated domains */ -cpumask_var_t cpu_isolated_map; - /* * __task_rq_lock - lock the rq @p resides on. 
*/ @@ -5735,10 +5732,6 @@ static inline void sched_init_smt(void) { } void __init sched_init_smp(void) { - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - sched_init_numa(); /* @@ -5748,16 +5741,12 @@ void __init sched_init_smp(void) */ mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); sched_init_granularity(); - free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); init_sched_dl_class(); @@ -5961,9 +5950,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); set_cpu_rq_start_time(smp_processor_id()); #endif diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 1f61e440358d..8f666bc5abe8 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -63,32 +63,69 @@ void __init housekeeping_init(void) WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } -#ifdef CONFIG_NO_HZ_FULL -static int __init housekeeping_nohz_full_setup(char *str) +static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; + int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); - if (cpulist_parse(str, non_housekeeping_mask) < 0) { - pr_warn("Housekeeping: Incorrect nohz_full cpumask\n"); + err = cpulist_parse(str, non_housekeeping_mask); + if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { + pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); + if (!housekeeping_flags) { + alloc_bootmem_cpumask_var(&housekeeping_mask); + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, non_housekeeping_mask); + if (cpumask_empty(housekeeping_mask)) + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + } else { + cpumask_var_t tmp; - if (cpumask_empty(housekeeping_mask)) - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + alloc_bootmem_cpumask_var(&tmp); + cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); + if (!cpumask_equal(tmp, housekeeping_mask)) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + free_bootmem_cpumask_var(tmp); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + free_bootmem_cpumask_var(tmp); + } - housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER | - HK_FLAG_RCU | HK_FLAG_MISC; + if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + tick_nohz_full_setup(non_housekeeping_mask); + } else { + pr_warn("Housekeeping: nohz unsupported." 
+ " Build with CONFIG_NO_HZ_FULL\n"); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + } - tick_nohz_full_setup(non_housekeeping_mask); + housekeeping_flags |= flags; free_bootmem_cpumask_var(non_housekeeping_mask); return 1; } + +static int __init housekeeping_nohz_full_setup(char *str) +{ + unsigned int flags; + + flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + + return housekeeping_setup(str, flags); +} __setup("nohz_full=", housekeeping_nohz_full_setup); -#endif + +static int __init housekeeping_isolcpus_setup(char *str) +{ + return housekeeping_setup(str, HK_FLAG_DOMAIN); +} +__setup("isolcpus=", housekeeping_isolcpus_setup); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e3d31b0880dc..2e6b9126ffdc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,6 +3,7 @@ */ #include #include +#include #include "sched.h" @@ -469,21 +470,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* Setup the mask of CPUs configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1); - return 0; - } - return 1; -} -__setup("isolcpus=", isolated_cpu_setup); - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -1792,7 +1778,7 @@ int sched_init_domains(const struct cpumask *cpu_map) doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); @@ -1875,7 +1861,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], doms_new = alloc_sched_domains(1); if (doms_new) { n = 1; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + cpumask_and(doms_new[0], cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); } } else { n = ndoms_new; @@ -1898,7 +1885,8 @@ match1: if (!doms_new) { n = 0; doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + cpumask_and(doms_new[0], cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); } /* Build new domains: */ From 150dfee95feb913876ced5cc559a342384e8ea97 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:38 +0200 Subject: [PATCH 0615/1085] sched/isolation: Add basic isolcpus flags Add flags to control NOHZ and domain isolation from "isolcpus=", in order to centralize the isolation features to a common interface. Domain isolation remains the default so not to break the existing isolcpus boot paramater behaviour. Further flags in the future may include 0hz (1hz tick offload) and timers, workqueue, RCU, kthread, watchdog, likely all merged together in a common flag ("async"?). In any case, this will have to be modifiable by cpusets. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-12-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/isolation.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 8f666bc5abe8..b71b436f59f2 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -11,6 +11,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); @@ -126,6 +127,29 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { - return housekeeping_setup(str, HK_FLAG_DOMAIN); + unsigned int flags = 0; + + while (isalpha(*str)) { + if (!strncmp(str, "nohz,", 5)) { + str += 5; + flags |= HK_FLAG_TICK; + continue; + } + + if (!strncmp(str, "domain,", 7)) { + str += 7; + flags |= HK_FLAG_DOMAIN; + continue; + } + + pr_warn("isolcpus: Error, unknown flag\n"); + return 0; + } + + /* Default behaviour for isolcpus without flags */ + if (!flags) + flags |= HK_FLAG_DOMAIN; + + return housekeeping_setup(str, flags); } __setup("isolcpus=", housekeeping_isolcpus_setup); From af8e947079a7dab0480b5d6db6b093fd04b86fc9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 27 Oct 2017 13:11:10 +0900 Subject: [PATCH 0616/1085] x86/build: Beautify build log of syscall headers This makes the build log look nicer. Before: SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_32.h SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_32_ia32.h SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_64_x32.h SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_64.h SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_32.h SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_64.h SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_x32.h After: SYSTBL arch/x86/include/generated/asm/syscalls_32.h SYSHDR arch/x86/include/generated/asm/unistd_32_ia32.h SYSHDR arch/x86/include/generated/asm/unistd_64_x32.h SYSTBL arch/x86/include/generated/asm/syscalls_64.h SYSHDR arch/x86/include/generated/uapi/asm/unistd_32.h SYSHDR arch/x86/include/generated/uapi/asm/unistd_64.h SYSHDR arch/x86/include/generated/uapi/asm/unistd_x32.h Signed-off-by: Masahiro Yamada Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: "H. Peter Anvin" Cc: linux-kbuild@vger.kernel.org Link: http://lkml.kernel.org/r/1509077470-2735-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index 57aa59fd140c..e34c7a931994 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -1,5 +1,5 @@ -out := $(obj)/../../include/generated/asm -uapi := $(obj)/../../include/generated/uapi/asm +out := arch/$(SRCARCH)/include/generated/asm +uapi := arch/$(SRCARCH)/include/generated/uapi/asm # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ From 7d9285e82db5defca4d9674ba089429eeca0c697 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: [PATCH 0617/1085] perf/bpf: Extend the perf_event_read_local() interface, a.k.a. 
"bpf: perf event change needed for subsequent bpf helpers" eBPF programs would like access to the (perf) event enabled and running times along with the event value, such that they can deal with event multiplexing (among other things). This patch extends the interface; a future eBPF patch will utilize the new functionality. [ Note, there's a same-content commit with a poor changelog and a meaningless title in the networking tree as well - but we need this change for subsequent perf work, so apply it here as well, with a proper changelog. Hopefully Git will be able to sort out this somewhat messy workflow, if there are no other, conflicting changes to these files. ] Signed-off-by: Yonghong Song [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David S. Miller Link: http://lkml.kernel.org/r/20171005161923.332790-2-yhs@fb.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 7 +++++-- kernel/bpf/arraymap.c | 2 +- kernel/events/core.c | 15 +++++++++++++-- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8e22f24ded6a..79b18a20cf5d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -806,6 +806,7 @@ struct perf_output_handle { struct bpf_perf_event_data_kern { struct pt_regs *regs; struct perf_sample_data *data; + struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF @@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value); +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline int perf_event_read_local(struct perf_event *event, u64 *value) +static inline int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { return -EINVAL; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e2636737b69b..c4b9ab01bba5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/events/core.c b/kernel/events/core.c index 04989fb769f0..74671e101b92 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now 
- event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..95888ae6c263 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi From ca0dd44cf37bfe316751e1701439c3ff44beb05c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:23:44 +0200 Subject: [PATCH 0618/1085] perf/core: Fix perf_event_read_value() locking perf_event_read_value() is an external accessor, just like perf_event_{en,dis}able() and should thus use perf_event_ctx_lock(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f63a8daa5812 ("perf: Fix event->ctx locking") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 74671e101b92..d636f1d0e2ab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4398,7 +4398,7 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; @@ -4426,6 +4426,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) return total; } + +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + count = __perf_event_read_value(event, enabled, running); + perf_event_ctx_unlock(event, ctx); + + return count; +} EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, @@ -4528,7 +4540,7 @@ static int perf_read_one(struct perf_event *event, u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event, &enabled, &running); + values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) From 0ee098c97a6ebe163180e74b740c0a37bcdaa173 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:24:28 +0200 Subject: [PATCH 0619/1085] perf/core: Update ctx time before detaching events We should make sure the ctx time is updated before we detach events, since detaching will want to update the event times.
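[ Editor's note: the ordering hazard this patch closes can be modelled in a few lines of plain C. The sketch below is illustrative only -- struct ctx, struct event and both helpers are stand-ins, not the kernel's API; it simply assumes, as the patch does, that detaching folds an elapsed delta against the context clock. ]

/* Minimal model of the stale-clock hazard: if the context clock is not
 * refreshed before detach, the folded delta under-reports. Illustrative
 * names only -- none of these are kernel symbols.
 */
#include <assert.h>
#include <stdint.h>

struct ctx { uint64_t time; };
struct event { uint64_t tstamp; uint64_t total_enabled; };

static void update_context_clock(struct ctx *ctx, uint64_t now)
{
	ctx->time = now;	/* what ctx_sched_out(.., EVENT_TIME) refreshes */
}

static void detach_event(struct event *event, const struct ctx *ctx)
{
	/* Folds time against ctx->time: a stale ctx->time loses time. */
	event->total_enabled += ctx->time - event->tstamp;
	event->tstamp = ctx->time;
}

int main(void)
{
	struct ctx ctx = { .time = 100 };
	struct event ev = { .tstamp = 100, .total_enabled = 0 };

	/* Correct order at now == 250: refresh the clock, then detach. */
	update_context_clock(&ctx, 250);
	detach_event(&ev, &ctx);
	assert(ev.total_enabled == 150);	/* skipping the refresh yields 0 */
	return 0;
}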
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index d636f1d0e2ab..5fae98626b15 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11129,6 +11129,7 @@ static void __perf_event_exit_context(void *__info) struct perf_event *event; raw_spin_lock(&ctx->lock); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); From a9cd8194e1e6bd09619954721dfaf0f94fe2003e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:38:24 +0200 Subject: [PATCH 0620/1085] perf/core: Fix __perf_read_group_add() locking Event timestamps are serialized using ctx->lock, make sure to hold it over reading all values. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5fae98626b15..0f34f2a47eb6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4453,6 +4453,8 @@ static int __perf_read_group_add(struct perf_event *leader, if (ret) return ret; + raw_spin_lock_irqsave(&ctx->lock, flags); + /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one @@ -4475,8 +4477,6 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) From 3c5c8711dcb32115156cc7672fa095afa4339eaa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:44:51 +0200 Subject: [PATCH 0621/1085] perf/core: Make sure to update ctx time before using it We should make sure to update ctx time before we use it to update event times. 
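[ Editor's note: this fix and the preceding __perf_read_group_add() one apply a single rule -- take ctx->lock (and refresh the context time) before reading *any* of the values that must form one consistent snapshot, not midway through. A minimal userspace analogue follows, using a pthread mutex in place of ctx->lock; all names are illustrative stand-ins. ]

#include <pthread.h>
#include <stdint.h>

struct group {
	pthread_mutex_t lock;		/* stands in for ctx->lock */
	uint64_t enabled, running;	/* leader's accumulated times */
	uint64_t count[2];		/* leader + one sibling */
	int nr;
};

/* Read the times *and* the counts inside one critical section,
 * mirroring how the fix moves raw_spin_lock_irqsave() above the
 * first value read so the snapshot cannot be torn. */
static void group_snapshot(struct group *g, uint64_t out[4])
{
	int n = 0;

	pthread_mutex_lock(&g->lock);
	out[n++] = g->enabled;
	out[n++] = g->running;
	for (int i = 0; i < g->nr; i++)
		out[n++] = g->count[i];
	pthread_mutex_unlock(&g->lock);
}

int main(void)
{
	struct group g = { .lock = PTHREAD_MUTEX_INITIALIZER, .nr = 2 };
	uint64_t snap[4];

	group_snapshot(&g, snap);
	return 0;
}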
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 0f34f2a47eb6..b249e0f197a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1893,6 +1893,11 @@ __perf_remove_from_context(struct perf_event *event, { unsigned long flags = (unsigned long)info; + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); @@ -1955,8 +1960,11 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - update_context_time(ctx); - update_cgrp_time_from_event(event); + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); From 8ca2bd41c7d1c135e9ac6f25970c2d491865088a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:12:35 +0200 Subject: [PATCH 0622/1085] perf/core: Rename 'enum perf_event_active_state' It's a weird name: 'active' is just one of the states, so it should not be part of the name; also, it's too long. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 +++--- kernel/events/core.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 79b18a20cf5d..b7532650de47 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -485,9 +485,9 @@ struct perf_addr_filters_head { }; /** - * enum perf_event_active_state - the states of a event + * enum perf_event_state - the states of a event */ -enum perf_event_active_state { +enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, @@ -578,7 +578,7 @@ struct perf_event { struct pmu *pmu; void *pmu_private; - enum perf_event_active_state state; + enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; diff --git a/kernel/events/core.c b/kernel/events/core.c index b249e0f197a7..6322e245176c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10801,7 +10801,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { - enum perf_event_active_state parent_state = parent_event->state; + enum perf_event_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; From 7f0ec32526d2446fdd79b0c6c6d08e64ef9fb1f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:17:58 +0200 Subject: [PATCH 0623/1085] perf/core: Remove wrong barrier The barrier and comment make no sense: - if what the barrier says is true, it should be wmb() but that should then be part of the arch driver, not the generic code. - if it is an SMP barrier, there must be a matching barrier, and there isn't one. So kill it.
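[ Editor's note: the pairing rule applied here is made concrete by the next patch in this series, which documents the smp_wmb() in event_sched_in() as pairing with an smp_rmb() in perf_event_read(). Below is a release/acquire analogue of that pairing in standard C11 atomics; the names are illustrative, not the kernel's. A one-sided barrier, as removed here, gives no such guarantee: without the reader-side acquire (or smp_rmb()), the writer-side barrier orders nothing a reader can rely on. ]

#include <stdatomic.h>

static atomic_int oncpu = -1;	/* written before the state flips to active */
static atomic_int active;	/* 0 = inactive, 1 = active */

static void publish(int cpu)
{
	atomic_store_explicit(&oncpu, cpu, memory_order_relaxed);
	/* release: the oncpu store above is visible before 'active' is */
	atomic_store_explicit(&active, 1, memory_order_release);
}

static int observe(void)
{
	/* acquire pairs with the release above: observing active == 1
	 * guarantees the matching oncpu value is observed as well */
	if (atomic_load_explicit(&active, memory_order_acquire))
		return atomic_load_explicit(&oncpu, memory_order_relaxed);
	return -1;
}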
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6322e245176c..205a4321f678 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2097,11 +2097,6 @@ event_sched_in(struct perf_event *event, event->hw.interrupts = 0; } - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - perf_pmu_disable(event->pmu); perf_set_shadow_time(event, ctx, tstamp); From 0c1cbc18df9e38182a0604b15535699c84d7342a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 16:26:44 +0200 Subject: [PATCH 0624/1085] perf/core: Fix perf_event_read() perf_event_read() has a number of issues regarding the timekeeping bits. - The IPI didn't update group times when it found INACTIVE - The direct call would not re-check ->state after taking ctx->lock which can result in ->count and timestamps getting out of sync. And we can make use of the ordering introduced for perf_event_stop() to make it more accurate for ACTIVE. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 55 +++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 205a4321f678..6f74f9c35490 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2081,8 +2081,9 @@ event_sched_in(struct perf_event *event, WRITE_ONCE(event->oncpu, smp_processor_id()); /* - * Order event::oncpu write to happen before the ACTIVE state - * is visible. + * Order event::oncpu write to happen before the ACTIVE state is + * visible. This allows perf_event_{stop,read}() to observe the correct + * ->oncpu if it sees ACTIVE. */ smp_wmb(); WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); @@ -3638,12 +3639,16 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) { + if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } - update_event_times(event); + if (!data->group) + update_event_times(event); + else + update_group_times(event); + if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3658,7 +3663,6 @@ static void __perf_event_read(void *info) pmu->read(event); list_for_each_entry(sub, &event->sibling_list, group_entry) { - update_event_times(sub); if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -3748,23 +3752,35 @@ out: static int perf_event_read(struct perf_event *event, bool group) { + enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - struct perf_read_data data = { - .event = event, - .group = group, - .ret = 0, - }; +again: + if (state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data; + + /* + * Orders the ->state and ->oncpu loads such that if we see + * ACTIVE we must also see the right ->oncpu. + * + * Matches the smp_wmb() from event_sched_in(). 
+ */ + smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; + data = (struct perf_read_data){ + .event = event, + .group = group, + .ret = 0, + }; + preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); @@ -3781,20 +3797,27 @@ static int perf_event_read(struct perf_event *event, bool group) (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; - } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + + } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); + state = event->state; + if (state != PERF_EVENT_STATE_INACTIVE) { + raw_spin_unlock_irqrestore(&ctx->lock, flags); + goto again; + } + /* - * may read while context is not active - * (e.g., thread is blocked), in that case - * we cannot update context time + * May read while context is not active (e.g., thread is + * blocked), in that case we cannot update context time */ - if (ctx->is_active) { + if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } + if (group) update_group_times(event); else From 0d3d73aac2ff05c78387aa9dcc2c8aa3804405e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:16:28 +0200 Subject: [PATCH 0625/1085] perf/core: Rewrite event timekeeping The current event timekeeping, which computes enabled and running times, uses 3 distinct timestamps to reflect the various event states: OFF (stopped), INACTIVE (enabled) and ACTIVE (running). Furthermore, the update rules are such that even INACTIVE events need their timestamps updated. This is undesirable because we'd like to not touch INACTIVE events if at all possible; this makes event scheduling (much) more expensive than needed. Rewrite the timekeeping to directly use event->state; this greatly simplifies the code and results in only having to update things when we change state, or when an up-to-date value is requested (read). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 19 +- kernel/events/core.c | 385 +++++++++++++------------------------ 2 files changed, 130 insertions(+), 274 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b7532650de47..874b71a70058 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -588,26 +588,10 @@ struct perf_event { * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. - * - * They are computed from tstamp_enabled, tstamp_running and - * tstamp_stopped when the event is in INACTIVE or ACTIVE state. */ u64 total_time_enabled; u64 total_time_running; - - /* - * These are timestamps used for computing total_time_enabled - * and total_time_running when the event is in INACTIVE or - * ACTIVE state, measured in nanoseconds from an arbitrary point - * in time. - * tstamp_enabled: the notional time when the event was enabled - * tstamp_running: the notional time when the event was scheduled on - * tstamp_stopped: in INACTIVE state, the notional time when the - * event was scheduled off.
- */ - u64 tstamp_enabled; - u64 tstamp_running; - u64 tstamp_stopped; + u64 tstamp; /* * timestamp shadows the actual context timing but it can @@ -699,7 +683,6 @@ struct perf_event { #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ - int cgrp_defer_enabled; #endif struct list_head sb_list; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6f74f9c35490..2551e8ce7224 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } +/* + * State based event timekeeping... + * + * The basic idea is to use event->state to determine which (if any) time + * fields to increment with the current delta. This means we only need to + * update timestamps when we change state or when they are explicitly requested + * (read). + * + * Event groups make things a little more complicated, but not terribly so. The + * rules for a group are that if the group leader is OFF the entire group is + * OFF, irrespecive of what the group member states are. This results in + * __perf_effective_state(). + * + * A futher ramification is that when a group leader flips between OFF and + * !OFF, we need to update all group member times. + * + * + * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we + * need to make sure the relevant context time is updated before we try and + * update our timestamps. + */ + +static __always_inline enum perf_event_state +__perf_effective_state(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + + if (leader->state <= PERF_EVENT_STATE_OFF) + return leader->state; + + return event->state; +} + +static __always_inline void +__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) +{ + enum perf_event_state state = __perf_effective_state(event); + u64 delta = now - event->tstamp; + + *enabled = event->total_time_enabled; + if (state >= PERF_EVENT_STATE_INACTIVE) + *enabled += delta; + + *running = event->total_time_running; + if (state >= PERF_EVENT_STATE_ACTIVE) + *running += delta; +} + +static void perf_event_update_time(struct perf_event *event) +{ + u64 now = perf_event_time(event); + + __perf_update_times(event, now, &event->total_time_enabled, + &event->total_time_running); + event->tstamp = now; +} + +static void perf_event_update_sibling_time(struct perf_event *leader) +{ + struct perf_event *sibling; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) + perf_event_update_time(sibling); +} + +static void +perf_event_set_state(struct perf_event *event, enum perf_event_state state) +{ + if (event->state == state) + return; + + perf_event_update_time(event); + /* + * If a group leader gets enabled/disabled all its siblings + * are affected too. + */ + if ((event->state < 0) ^ (state < 0)) + perf_event_update_sibling_time(event); + + WRITE_ONCE(event->state, state); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) event->shadow_ctx_time = now - t->timestamp; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ - /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. 
- */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) - return; - - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } -} - /* * Update cpuctx->cgrp so that it is set when first cgroup event is added and * cleared when last cgroup event is removed. @@ -972,17 +1020,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return 0; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ -} - static inline void list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) @@ -1396,60 +1433,6 @@ static u64 perf_event_time(struct perf_event *event) return ctx ? ctx->time : 0; } -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - lockdep_assert_held(&ctx->lock); - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. - */ - if (is_cgroup_event(event)) - run_end = perf_cgroup_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. 
- */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; @@ -1492,6 +1475,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; + event->tstamp = perf_event_time(event); + /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that @@ -1699,8 +1684,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) list_del_init(&event->group_entry); - update_group_times(event); - /* * If event was in error state, then keep it * that way, otherwise bogus counts will be @@ -1709,7 +1692,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * of the event */ if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; + perf_event_set_state(event, PERF_EVENT_STATE_OFF); ctx->generation++; } @@ -1808,38 +1791,24 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - u64 delta; + enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE && - !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - if (event->state != PERF_EVENT_STATE_ACTIVE) return; perf_pmu_disable(event->pmu); - event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; - event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; + state = PERF_EVENT_STATE_OFF; } + perf_event_set_state(event, state); if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1859,7 +1828,9 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - int state = group_event->state; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; perf_pmu_disable(ctx->pmu); @@ -1873,7 +1844,7 @@ group_sched_out(struct perf_event *group_event, perf_pmu_enable(ctx->pmu); - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + if (group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -1965,12 +1936,12 @@ static void __perf_event_disable(struct perf_event *event, update_cgrp_time_from_event(event); } - update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); else event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; + + perf_event_set_state(event, PERF_EVENT_STATE_OFF); } /* @@ -2027,8 +1998,7 @@ void perf_event_disable_inatomic(struct perf_event *event) } static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) + struct perf_event_context *ctx) { /* * use the correct time source for the time snapshot @@ 
-2056,9 +2026,9 @@ static void perf_set_shadow_time(struct perf_event *event, * is cleaner and simpler to understand. */ if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); + perf_cgroup_set_shadow_time(event, event->tstamp); else - event->shadow_ctx_time = tstamp - ctx->timestamp; + event->shadow_ctx_time = event->tstamp - ctx->timestamp; } #define MAX_INTERRUPTS (~0ULL) @@ -2071,7 +2041,6 @@ event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); int ret = 0; lockdep_assert_held(&ctx->lock); @@ -2086,7 +2055,7 @@ event_sched_in(struct perf_event *event, * ->oncpu if it sees ACTIVE. */ smp_wmb(); - WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); + perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2100,19 +2069,17 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx, tstamp); + perf_set_shadow_time(event, ctx); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -2136,8 +2103,6 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = ctx->pmu; - u64 now = ctx->time; - bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -2167,27 +2132,13 @@ group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. + * The events up to the failed event are scheduled out normally. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - simulate = true; + break; - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } + event_sched_out(event, cpuctx, ctx); } event_sched_out(group_event, cpuctx, ctx); @@ -2229,46 +2180,11 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } -/* - * Complement to update_event_times(). This computes the tstamp_* values to - * continue 'enabled' state from @now, and effectively discards the time - * between the prior tstamp_stopped and now (as we were in the OFF state, or - * just switched (context) time base). - * - * This further assumes '@event->state == INACTIVE' (we just came from OFF) and - * cannot have been scheduled in yet. And going into INACTIVE state means - * '@event->tstamp_stopped = @now'. 
- * - * Thus given the rules of update_event_times(): - * - * total_time_enabled = tstamp_stopped - tstamp_enabled - * total_time_running = tstamp_stopped - tstamp_running - * - * We can insert 'tstamp_stopped == now' and reverse them to compute new - * tstamp_* values. - */ -static void __perf_event_enable_time(struct perf_event *event, u64 now) -{ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); - - event->tstamp_stopped = now; - event->tstamp_enabled = now - event->total_time_enabled; - event->tstamp_running = now - event->total_time_running; -} - static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - list_add_event(event, ctx); perf_group_attach(event); - /* - * We can be called with event->state == STATE_OFF when we create with - * .disabled = 1. In that case the IOC_ENABLE will call this function. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2499,28 +2415,6 @@ again: raw_spin_unlock_irq(&ctx->lock); } -/* - * Put a event into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_event_mark_enabled(struct perf_event *event) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - __perf_event_enable_time(event, tstamp); - list_for_each_entry(sub, &event->sibling_list, group_entry) { - /* XXX should not be > INACTIVE if event isn't */ - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(sub, tstamp); - } -} - /* * Cross CPU call to enable a performance event */ @@ -2539,14 +2433,12 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->is_active) ctx_sched_out(ctx, cpuctx, EVENT_TIME); - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2866,18 +2758,10 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: + if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); - /* fall-through */ - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - - default: - break; - } + perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event @@ -3114,10 +2998,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -3125,10 +3005,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. 
*/ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } @@ -3150,10 +3028,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -3545,7 +3419,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } @@ -3644,10 +3518,9 @@ static void __perf_event_read(void *info) update_cgrp_time_from_event(event); } - if (!data->group) - update_event_times(event); - else - update_group_times(event); + perf_event_update_time(event); + if (data->group) + perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3696,7 +3569,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value, { unsigned long flags; int ret = 0; - u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3727,23 +3599,26 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } - now = event->shadow_ctx_time + perf_clock(); - if (enabled) - *enabled = now - event->tstamp_enabled; + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) { + if (event->oncpu == smp_processor_id()) event->pmu->read(event); - if (running) - *running = now - event->tstamp_running; - } else if (running) { - *running = event->total_time_running; - } *value = local64_read(&event->count); + if (enabled || running) { + u64 now = event->shadow_ctx_time + perf_clock(); + u64 __enabled, __running; + + __perf_update_times(event, now, &__enabled, &__running); + if (enabled) + *enabled = __enabled; + if (running) + *running = __running; + } out: local_irq_restore(flags); @@ -3818,10 +3693,9 @@ again: update_cgrp_time_from_event(event); } + perf_event_update_time(event); if (group) - update_group_times(event); - else - update_event_times(event); + perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4945,8 +4819,7 @@ static void calc_timer_values(struct perf_event *event, *now = perf_clock(); ctx_time = event->shadow_ctx_time + *now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; + __perf_update_times(event, ctx_time, enabled, running); } static void perf_event_init_userpage(struct perf_event *event) @@ -10581,7 +10454,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* From 30199bee2d8da237086f7e8067e31814967e3d04 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2017 15:53:34 -0700 Subject: [PATCH 0626/1085] scsi: aic94xx: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer 
callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "Martin K. Petersen" Cc: "James E.J. Bottomley" Cc: Tomas Henzl Cc: Quentin Lambert Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen --- drivers/scsi/aic94xx/aic94xx_hwi.c | 3 +-- drivers/scsi/aic94xx/aic94xx_hwi.h | 5 ++--- drivers/scsi/aic94xx/aic94xx_scb.c | 6 +++--- drivers/scsi/aic94xx/aic94xx_tmf.c | 13 ++++++------- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/aic94xx/aic94xx_hwi.c b/drivers/scsi/aic94xx/aic94xx_hwi.c index f2671a8fa7e3..7cbc7213b2b2 100644 --- a/drivers/scsi/aic94xx/aic94xx_hwi.c +++ b/drivers/scsi/aic94xx/aic94xx_hwi.c @@ -1178,8 +1178,7 @@ static void asd_start_scb_timers(struct list_head *list) struct asd_ascb *ascb; list_for_each_entry(ascb, list, list) { if (!ascb->uldd_timer) { - ascb->timer.data = (unsigned long) ascb; - ascb->timer.function = asd_ascb_timedout; + ascb->timer.function = (TIMER_FUNC_TYPE)asd_ascb_timedout; ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT; add_timer(&ascb->timer); } diff --git a/drivers/scsi/aic94xx/aic94xx_hwi.h b/drivers/scsi/aic94xx/aic94xx_hwi.h index 8c1c28239e93..8f147e720cfd 100644 --- a/drivers/scsi/aic94xx/aic94xx_hwi.h +++ b/drivers/scsi/aic94xx/aic94xx_hwi.h @@ -291,8 +291,7 @@ static inline void asd_init_ascb(struct asd_ha_struct *asd_ha, INIT_LIST_HEAD(&ascb->list); ascb->scb = ascb->dma_scb.vaddr; ascb->ha = asd_ha; - ascb->timer.function = NULL; - init_timer(&ascb->timer); + timer_setup(&ascb->timer, NULL, 0); ascb->tc_index = -1; } @@ -392,7 +391,7 @@ void asd_control_led(struct asd_ha_struct *asd_ha, int phy_id, int op); void asd_turn_led(struct asd_ha_struct *asd_ha, int phy_id, int op); int asd_enable_phys(struct asd_ha_struct *asd_ha, const u8 phy_mask); -void asd_ascb_timedout(unsigned long data); +void asd_ascb_timedout(struct timer_list *t); int asd_chip_hardrst(struct asd_ha_struct *asd_ha); #endif diff --git a/drivers/scsi/aic94xx/aic94xx_scb.c b/drivers/scsi/aic94xx/aic94xx_scb.c index fdac7c2fef37..22873ce8bbfa 100644 --- a/drivers/scsi/aic94xx/aic94xx_scb.c +++ b/drivers/scsi/aic94xx/aic94xx_scb.c @@ -866,12 +866,12 @@ void asd_build_initiate_link_adm_task(struct asd_ascb *ascb, int phy_id, * Upper layers can implement their own timeout function, say to free * resources they have with this SCB, and then call this one at the * end of their timeout function. To do this, one should initialize - * the ascb->timer.{function, data, expires} prior to calling the post + * the ascb->timer.{function, expires} prior to calling the post * function. The timer is started by the post function. 
*/ -void asd_ascb_timedout(unsigned long data) +void asd_ascb_timedout(struct timer_list *t) { - struct asd_ascb *ascb = (void *) data; + struct asd_ascb *ascb = from_timer(ascb, t, timer); struct asd_seq_data *seq = &ascb->ha->seq; unsigned long flags; diff --git a/drivers/scsi/aic94xx/aic94xx_tmf.c b/drivers/scsi/aic94xx/aic94xx_tmf.c index d4c35df3d4ae..4637119c09d8 100644 --- a/drivers/scsi/aic94xx/aic94xx_tmf.c +++ b/drivers/scsi/aic94xx/aic94xx_tmf.c @@ -35,15 +35,14 @@ static int asd_enqueue_internal(struct asd_ascb *ascb, void (*tasklet_complete)(struct asd_ascb *, struct done_list_struct *), - void (*timed_out)(unsigned long)) + void (*timed_out)(struct timer_list *t)) { int res; ascb->tasklet_complete = tasklet_complete; ascb->uldd_timer = 1; - ascb->timer.data = (unsigned long) ascb; - ascb->timer.function = timed_out; + ascb->timer.function = (TIMER_FUNC_TYPE)timed_out; ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT; add_timer(&ascb->timer); @@ -87,9 +86,9 @@ static void asd_clear_nexus_tasklet_complete(struct asd_ascb *ascb, asd_ascb_free(ascb); } -static void asd_clear_nexus_timedout(unsigned long data) +static void asd_clear_nexus_timedout(struct timer_list *t) { - struct asd_ascb *ascb = (void *)data; + struct asd_ascb *ascb = from_timer(ascb, t, timer); struct tasklet_completion_status *tcs = ascb->uldd_task; ASD_DPRINTK("%s: here\n", __func__); @@ -261,9 +260,9 @@ static int asd_clear_nexus_index(struct sas_task *task) /* ---------- TMFs ---------- */ -static void asd_tmf_timedout(unsigned long data) +static void asd_tmf_timedout(struct timer_list *t) { - struct asd_ascb *ascb = (void *) data; + struct asd_ascb *ascb = from_timer(ascb, t, timer); struct tasklet_completion_status *tcs = ascb->uldd_task; ASD_DPRINTK("tmf timed out\n"); From b386eec6b0af30c947bf439bfabffb8fd6ab5fc0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2017 16:15:26 -0700 Subject: [PATCH 0627/1085] scsi: be2iscsi: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Subbu Seetharaman Cc: Ketan Mukadam Cc: Jitendra Bhivare Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. 
Petersen --- drivers/scsi/be2iscsi/be_main.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index b4542e7e2ad5..d8bd6f2c9c83 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -5230,12 +5230,11 @@ static void beiscsi_eqd_update_work(struct work_struct *work) msecs_to_jiffies(BEISCSI_EQD_UPDATE_INTERVAL)); } -static void beiscsi_hw_tpe_check(unsigned long ptr) +static void beiscsi_hw_tpe_check(struct timer_list *t) { - struct beiscsi_hba *phba; + struct beiscsi_hba *phba = from_timer(phba, t, hw_check); u32 wait; - phba = (struct beiscsi_hba *)ptr; /* if not TPE, do nothing */ if (!beiscsi_detect_tpe(phba)) return; @@ -5248,11 +5247,10 @@ static void beiscsi_hw_tpe_check(unsigned long ptr) msecs_to_jiffies(wait)); } -static void beiscsi_hw_health_check(unsigned long ptr) +static void beiscsi_hw_health_check(struct timer_list *t) { - struct beiscsi_hba *phba; + struct beiscsi_hba *phba = from_timer(phba, t, hw_check); - phba = (struct beiscsi_hba *)ptr; beiscsi_detect_ue(phba); if (beiscsi_detect_ue(phba)) { __beiscsi_log(phba, KERN_ERR, @@ -5264,7 +5262,7 @@ static void beiscsi_hw_health_check(unsigned long ptr) if (!test_bit(BEISCSI_HBA_UER_SUPP, &phba->state)) return; /* modify this timer to check TPE */ - phba->hw_check.function = beiscsi_hw_tpe_check; + phba->hw_check.function = (TIMER_FUNC_TYPE)beiscsi_hw_tpe_check; } mod_timer(&phba->hw_check, @@ -5351,7 +5349,7 @@ static int beiscsi_enable_port(struct beiscsi_hba *phba) * Timer function gets modified for TPE detection. * Always reinit to do health check first. */ - phba->hw_check.function = beiscsi_hw_health_check; + phba->hw_check.function = (TIMER_FUNC_TYPE)beiscsi_hw_health_check; mod_timer(&phba->hw_check, jiffies + msecs_to_jiffies(BEISCSI_UE_DETECT_INTERVAL)); return 0; @@ -5708,9 +5706,7 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev, * Start UE detection here. UE before this will cause stall in probe * and eventually fail the probe. */ - init_timer(&phba->hw_check); - phba->hw_check.function = beiscsi_hw_health_check; - phba->hw_check.data = (unsigned long)phba; + timer_setup(&phba->hw_check, beiscsi_hw_health_check, 0); mod_timer(&phba->hw_check, jiffies + msecs_to_jiffies(BEISCSI_UE_DETECT_INTERVAL)); beiscsi_log(phba, KERN_INFO, BEISCSI_LOG_INIT, From abef751056130c90199e3bfc32916715795cde39 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Aug 2017 16:38:10 -0700 Subject: [PATCH 0628/1085] scsi: bnx2i: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. There was a seemingly missing call to initialize the timer in one handler, so this was added to remove the open-coded initialization. Cc: QLogic-Storage-Upstream@qlogic.com Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. 
Petersen --- drivers/scsi/bnx2i/bnx2i.h | 2 +- drivers/scsi/bnx2i/bnx2i_hwi.c | 4 ++-- drivers/scsi/bnx2i/bnx2i_iscsi.c | 15 ++++----------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/scsi/bnx2i/bnx2i.h b/drivers/scsi/bnx2i/bnx2i.h index 89ef1a1678d1..663a63d4dae4 100644 --- a/drivers/scsi/bnx2i/bnx2i.h +++ b/drivers/scsi/bnx2i/bnx2i.h @@ -858,7 +858,7 @@ extern int bnx2i_alloc_qp_resc(struct bnx2i_hba *hba, struct bnx2i_endpoint *ep); extern void bnx2i_free_qp_resc(struct bnx2i_hba *hba, struct bnx2i_endpoint *ep); -extern void bnx2i_ep_ofld_timer(unsigned long data); +extern void bnx2i_ep_ofld_timer(struct timer_list *t); extern struct bnx2i_endpoint *bnx2i_find_ep_in_ofld_list( struct bnx2i_hba *hba, u32 iscsi_cid); extern struct bnx2i_endpoint *bnx2i_find_ep_in_destroy_list( diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index 42921dbba927..61a93994d5ed 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -698,9 +698,9 @@ void bnx2i_update_iscsi_conn(struct iscsi_conn *conn) * * routine to handle connection offload/destroy request timeout */ -void bnx2i_ep_ofld_timer(unsigned long data) +void bnx2i_ep_ofld_timer(struct timer_list *t) { - struct bnx2i_endpoint *ep = (struct bnx2i_endpoint *) data; + struct bnx2i_endpoint *ep = from_timer(ep, t, ofld_timer); if (ep->state == EP_STATE_OFLD_START) { printk(KERN_ALERT "ofld_timer: CONN_OFLD timeout\n"); diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c index 03c104b47f31..de0a507577ef 100644 --- a/drivers/scsi/bnx2i/bnx2i_iscsi.c +++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c @@ -1611,9 +1611,8 @@ static int bnx2i_conn_start(struct iscsi_cls_conn *cls_conn) * this should normally not sleep for a long time so it should * not disrupt the caller. 
*/ + timer_setup(&bnx2i_conn->ep->ofld_timer, bnx2i_ep_ofld_timer, 0); bnx2i_conn->ep->ofld_timer.expires = 1 * HZ + jiffies; - bnx2i_conn->ep->ofld_timer.function = bnx2i_ep_ofld_timer; - bnx2i_conn->ep->ofld_timer.data = (unsigned long) bnx2i_conn->ep; add_timer(&bnx2i_conn->ep->ofld_timer); /* update iSCSI context for this conn, wait for CNIC to complete */ wait_event_interruptible(bnx2i_conn->ep->ofld_wait, @@ -1729,10 +1728,8 @@ static int bnx2i_tear_down_conn(struct bnx2i_hba *hba, } ep->state = EP_STATE_CLEANUP_START; - init_timer(&ep->ofld_timer); + timer_setup(&ep->ofld_timer, bnx2i_ep_ofld_timer, 0); ep->ofld_timer.expires = hba->conn_ctx_destroy_tmo + jiffies; - ep->ofld_timer.function = bnx2i_ep_ofld_timer; - ep->ofld_timer.data = (unsigned long) ep; add_timer(&ep->ofld_timer); bnx2i_ep_destroy_list_add(hba, ep); @@ -1835,10 +1832,8 @@ static struct iscsi_endpoint *bnx2i_ep_connect(struct Scsi_Host *shost, bnx2i_ep->state = EP_STATE_OFLD_START; bnx2i_ep_ofld_list_add(hba, bnx2i_ep); - init_timer(&bnx2i_ep->ofld_timer); + timer_setup(&bnx2i_ep->ofld_timer, bnx2i_ep_ofld_timer, 0); bnx2i_ep->ofld_timer.expires = 2 * HZ + jiffies; - bnx2i_ep->ofld_timer.function = bnx2i_ep_ofld_timer; - bnx2i_ep->ofld_timer.data = (unsigned long) bnx2i_ep; add_timer(&bnx2i_ep->ofld_timer); if (bnx2i_send_conn_ofld_req(hba, bnx2i_ep)) { @@ -2054,10 +2049,8 @@ int bnx2i_hw_ep_disconnect(struct bnx2i_endpoint *bnx2i_ep) session = conn->session; } - init_timer(&bnx2i_ep->ofld_timer); + timer_setup(&bnx2i_ep->ofld_timer, bnx2i_ep_ofld_timer, 0); bnx2i_ep->ofld_timer.expires = hba->conn_teardown_tmo + jiffies; - bnx2i_ep->ofld_timer.function = bnx2i_ep_ofld_timer; - bnx2i_ep->ofld_timer.data = (unsigned long) bnx2i_ep; add_timer(&bnx2i_ep->ofld_timer); if (!test_bit(BNX2I_CNIC_REGISTERED, &hba->reg_with_cnic)) From af53b89becdcee2f15c60e6a9e951bd96d382e1c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2017 21:51:09 -0700 Subject: [PATCH 0629/1085] scsi: dc395x: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Oliver Neukum Cc: Ali Akcaagac Cc: Jamie Lenehan Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. 
Petersen --- drivers/scsi/dc395x.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c index 5ee7f44cf869..60ef8df42b95 100644 --- a/drivers/scsi/dc395x.c +++ b/drivers/scsi/dc395x.c @@ -395,7 +395,7 @@ static void request_sense(struct AdapterCtlBlk *acb, struct DeviceCtlBlk *dcb, struct ScsiReqBlk *srb); static void set_xfer_rate(struct AdapterCtlBlk *acb, struct DeviceCtlBlk *dcb); -static void waiting_timeout(unsigned long ptr); +static void waiting_timeout(struct timer_list *t); /*--------------------------------------------------------------------------- @@ -857,9 +857,6 @@ static void waiting_set_timer(struct AdapterCtlBlk *acb, unsigned long to) { if (timer_pending(&acb->waiting_timer)) return; - init_timer(&acb->waiting_timer); - acb->waiting_timer.function = waiting_timeout; - acb->waiting_timer.data = (unsigned long) acb; if (time_before(jiffies + to, acb->last_reset - HZ / 2)) acb->waiting_timer.expires = acb->last_reset - HZ / 2 + 1; @@ -936,10 +933,10 @@ static void waiting_process_next(struct AdapterCtlBlk *acb) /* Wake up waiting queue */ -static void waiting_timeout(unsigned long ptr) +static void waiting_timeout(struct timer_list *t) { unsigned long flags; - struct AdapterCtlBlk *acb = (struct AdapterCtlBlk *)ptr; + struct AdapterCtlBlk *acb = from_timer(acb, t, waiting_timer); dprintkdbg(DBG_1, "waiting_timeout: Queue woken up by timer. acb=%p\n", acb); DC395x_LOCK_IO(acb->scsi_host, flags); @@ -4366,8 +4363,8 @@ static void adapter_init_params(struct AdapterCtlBlk *acb) INIT_LIST_HEAD(&acb->srb_free_list); /* temp SRB for Q tag used or abort command used */ acb->tmp_srb = &acb->srb; - init_timer(&acb->waiting_timer); - init_timer(&acb->selto_timer); + timer_setup(&acb->waiting_timer, waiting_timeout, 0); + timer_setup(&acb->selto_timer, NULL, 0); acb->srb_count = DC395x_MAX_SRB_CNT; From 13059106242bc96f8f5ab79c0f6cb15c7b756f40 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 21 Sep 2017 13:12:15 -0700 Subject: [PATCH 0630/1085] scsi: fcoe: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: QLogic-Storage-Upstream@qlogic.com Cc: "James E.J. Bottomley" Cc: "Martin K. 
Petersen" Cc: Johannes Thumshirn Cc: linux-scsi@vger.kernel.org Cc: fcoe-devel@open-fcoe.org Signed-off-by: Kees Cook Acked-by: Johannes Thumshirn --- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 11 +++++------ drivers/scsi/fcoe/fcoe.c | 2 +- drivers/scsi/fcoe/fcoe_transport.c | 6 ++++-- include/scsi/libfcoe.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 6844ba361616..e6b9de7d41ac 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -823,7 +823,7 @@ static int bnx2fc_net_config(struct fc_lport *lport, struct net_device *netdev) skb_queue_head_init(&port->fcoe_pending_queue); port->fcoe_pending_queue_active = 0; - setup_timer(&port->timer, fcoe_queue_timer, (unsigned long) lport); + timer_setup(&port->timer, fcoe_queue_timer, 0); fcoe_link_speed_update(lport); @@ -845,9 +845,9 @@ static int bnx2fc_net_config(struct fc_lport *lport, struct net_device *netdev) return 0; } -static void bnx2fc_destroy_timer(unsigned long data) +static void bnx2fc_destroy_timer(struct timer_list *t) { - struct bnx2fc_hba *hba = (struct bnx2fc_hba *)data; + struct bnx2fc_hba *hba = from_timer(hba, t, destroy_timer); printk(KERN_ERR PFX "ERROR:bnx2fc_destroy_timer - " "Destroy compl not received!!\n"); @@ -1946,11 +1946,10 @@ static void bnx2fc_fw_destroy(struct bnx2fc_hba *hba) { if (test_and_clear_bit(BNX2FC_FLAG_FW_INIT_DONE, &hba->flags)) { if (bnx2fc_send_fw_fcoe_destroy_msg(hba) == 0) { - init_timer(&hba->destroy_timer); + timer_setup(&hba->destroy_timer, bnx2fc_destroy_timer, + 0); hba->destroy_timer.expires = BNX2FC_FW_TIMEOUT + jiffies; - hba->destroy_timer.function = bnx2fc_destroy_timer; - hba->destroy_timer.data = (unsigned long)hba; add_timer(&hba->destroy_timer); wait_event_interruptible(hba->destroy_wait, test_bit(BNX2FC_FLAG_DESTROY_CMPL, diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 85f9a3eba387..5cc09dce4d25 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -754,7 +754,7 @@ static int fcoe_netdev_config(struct fc_lport *lport, struct net_device *netdev) skb_queue_head_init(&port->fcoe_pending_queue); port->fcoe_pending_queue_active = 0; - setup_timer(&port->timer, fcoe_queue_timer, (unsigned long)lport); + timer_setup(&port->timer, fcoe_queue_timer, 0); fcoe_link_speed_update(lport); diff --git a/drivers/scsi/fcoe/fcoe_transport.c b/drivers/scsi/fcoe/fcoe_transport.c index 375c536cbc68..1ba5f51713a3 100644 --- a/drivers/scsi/fcoe/fcoe_transport.c +++ b/drivers/scsi/fcoe/fcoe_transport.c @@ -455,9 +455,11 @@ EXPORT_SYMBOL_GPL(fcoe_check_wait_queue); * * Calls fcoe_check_wait_queue on timeout */ -void fcoe_queue_timer(ulong lport) +void fcoe_queue_timer(struct timer_list *t) { - fcoe_check_wait_queue((struct fc_lport *)lport, NULL); + struct fcoe_port *port = from_timer(port, t, timer); + + fcoe_check_wait_queue(port->lport, NULL); } EXPORT_SYMBOL_GPL(fcoe_queue_timer); diff --git a/include/scsi/libfcoe.h b/include/scsi/libfcoe.h index 722d3264d3bf..cb8a273732cf 100644 --- a/include/scsi/libfcoe.h +++ b/include/scsi/libfcoe.h @@ -382,7 +382,7 @@ static inline struct net_device *fcoe_get_netdev(const struct fc_lport *lport) void fcoe_clean_pending_queue(struct fc_lport *); void fcoe_check_wait_queue(struct fc_lport *lport, struct sk_buff *skb); -void fcoe_queue_timer(ulong lport); +void fcoe_queue_timer(struct timer_list *t); int fcoe_get_paged_crc_eof(struct sk_buff *skb, int tlen, struct fcoe_percpu_s *fps); From 
7932589f47d5d5eaa9948bd479b42819b6b10509 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2017 17:17:42 -0700 Subject: [PATCH 0631/1085] scsi: gdth: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen --- drivers/scsi/gdth.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c index a4473356a9dc..c35f05c4c6bb 100644 --- a/drivers/scsi/gdth.c +++ b/drivers/scsi/gdth.c @@ -3705,7 +3705,7 @@ static void gdth_log_event(gdth_evt_data *dvr, char *buffer) #ifdef GDTH_STATISTICS static u8 gdth_timer_running; -static void gdth_timeout(unsigned long data) +static void gdth_timeout(struct timer_list *unused) { u32 i; Scsi_Cmnd *nscp; @@ -3743,8 +3743,6 @@ static void gdth_timer_init(void) gdth_timer_running = 1; TRACE2(("gdth_detect(): Initializing timer !\n")); gdth_timer.expires = jiffies + HZ; - gdth_timer.data = 0L; - gdth_timer.function = gdth_timeout; add_timer(&gdth_timer); } #else @@ -5165,7 +5163,7 @@ static int __init gdth_init(void) /* initializations */ gdth_polling = TRUE; gdth_clear_events(); - init_timer(&gdth_timer); + timer_setup(&gdth_timer, gdth_timeout, 0); /* As default we do not probe for EISA or ISA controllers */ if (probe_eisa_isa) { From b0a2dc66152aa409860e02804169b0007244ebd7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 1 Sep 2017 23:21:24 -0700 Subject: [PATCH 0632/1085] scsi: isci: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Intel SCU Linux support Cc: Artur Paszkiewicz Cc: "James E.J. Bottomley" Cc: "Martin K. 
Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Artur Paszkiewicz --- drivers/scsi/isci/host.c | 12 ++++++------ drivers/scsi/isci/isci.h | 6 ++---- drivers/scsi/isci/phy.c | 4 ++-- drivers/scsi/isci/port.c | 4 ++-- drivers/scsi/isci/port_config.c | 8 ++++---- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/scsi/isci/host.c b/drivers/scsi/isci/host.c index 609dafd661d1..13b37cdffa8e 100644 --- a/drivers/scsi/isci/host.c +++ b/drivers/scsi/isci/host.c @@ -958,9 +958,9 @@ static enum sci_status sci_controller_start_next_phy(struct isci_host *ihost) return status; } -static void phy_startup_timeout(unsigned long data) +static void phy_startup_timeout(struct timer_list *t) { - struct sci_timer *tmr = (struct sci_timer *)data; + struct sci_timer *tmr = from_timer(tmr, t, timer); struct isci_host *ihost = container_of(tmr, typeof(*ihost), phy_timer); unsigned long flags; enum sci_status status; @@ -1592,9 +1592,9 @@ static const struct sci_base_state sci_controller_state_table[] = { [SCIC_FAILED] = {} }; -static void controller_timeout(unsigned long data) +static void controller_timeout(struct timer_list *t) { - struct sci_timer *tmr = (struct sci_timer *)data; + struct sci_timer *tmr = from_timer(tmr, t, timer); struct isci_host *ihost = container_of(tmr, typeof(*ihost), timer); struct sci_base_state_machine *sm = &ihost->sm; unsigned long flags; @@ -1737,9 +1737,9 @@ static u8 max_spin_up(struct isci_host *ihost) MAX_CONCURRENT_DEVICE_SPIN_UP_COUNT); } -static void power_control_timeout(unsigned long data) +static void power_control_timeout(struct timer_list *t) { - struct sci_timer *tmr = (struct sci_timer *)data; + struct sci_timer *tmr = from_timer(tmr, t, timer); struct isci_host *ihost = container_of(tmr, typeof(*ihost), power_control.timer); struct isci_phy *iphy; unsigned long flags; diff --git a/drivers/scsi/isci/isci.h b/drivers/scsi/isci/isci.h index 234ab46fce33..680e30947671 100644 --- a/drivers/scsi/isci/isci.h +++ b/drivers/scsi/isci/isci.h @@ -498,12 +498,10 @@ struct sci_timer { }; static inline -void sci_init_timer(struct sci_timer *tmr, void (*fn)(unsigned long)) +void sci_init_timer(struct sci_timer *tmr, void (*fn)(struct timer_list *t)) { - tmr->timer.function = fn; - tmr->timer.data = (unsigned long) tmr; tmr->cancel = 0; - init_timer(&tmr->timer); + timer_setup(&tmr->timer, fn, 0); } static inline void sci_mod_timer(struct sci_timer *tmr, unsigned long msec) diff --git a/drivers/scsi/isci/phy.c b/drivers/scsi/isci/phy.c index cb87b2ef7c92..1deca8c5a94f 100644 --- a/drivers/scsi/isci/phy.c +++ b/drivers/scsi/isci/phy.c @@ -315,9 +315,9 @@ sci_phy_link_layer_initialization(struct isci_phy *iphy, return SCI_SUCCESS; } -static void phy_sata_timeout(unsigned long data) +static void phy_sata_timeout(struct timer_list *t) { - struct sci_timer *tmr = (struct sci_timer *)data; + struct sci_timer *tmr = from_timer(tmr, t, timer); struct isci_phy *iphy = container_of(tmr, typeof(*iphy), sata_timer); struct isci_host *ihost = iphy->owning_port->owning_controller; unsigned long flags; diff --git a/drivers/scsi/isci/port.c b/drivers/scsi/isci/port.c index a4dd5c91508c..1df45f028ea7 100644 --- a/drivers/scsi/isci/port.c +++ b/drivers/scsi/isci/port.c @@ -769,9 +769,9 @@ bool sci_port_link_detected(struct isci_port *iport, struct isci_phy *iphy) return true; } -static void port_timeout(unsigned long data) +static void port_timeout(struct timer_list *t) { - struct sci_timer *tmr = (struct sci_timer *)data; + struct sci_timer *tmr = 
from_timer(tmr, t, timer); struct isci_port *iport = container_of(tmr, typeof(*iport), timer); struct isci_host *ihost = iport->owning_controller; unsigned long flags; diff --git a/drivers/scsi/isci/port_config.c b/drivers/scsi/isci/port_config.c index ac879745ef80..edb7be786c65 100644 --- a/drivers/scsi/isci/port_config.c +++ b/drivers/scsi/isci/port_config.c @@ -319,10 +319,10 @@ sci_mpc_agent_validate_phy_configuration(struct isci_host *ihost, return sci_port_configuration_agent_validate_ports(ihost, port_agent); } -static void mpc_agent_timeout(unsigned long data) +static void mpc_agent_timeout(struct timer_list *t) { u8 index; - struct sci_timer *tmr = (struct sci_timer *)data; + struct sci_timer *tmr = from_timer(tmr, t, timer); struct sci_port_configuration_agent *port_agent; struct isci_host *ihost; unsigned long flags; @@ -654,10 +654,10 @@ static void sci_apc_agent_link_down( } /* configure the phys into ports when the timer fires */ -static void apc_agent_timeout(unsigned long data) +static void apc_agent_timeout(struct timer_list *t) { u32 index; - struct sci_timer *tmr = (struct sci_timer *)data; + struct sci_timer *tmr = from_timer(tmr, t, timer); struct sci_port_configuration_agent *port_agent; struct isci_host *ihost; unsigned long flags; From 1f0849dac19ba35087e7a2056dde8206aa23c5e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2017 12:51:53 -0700 Subject: [PATCH 0633/1085] scsi: libfc: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This removes several redundant setup calls in favor of just changing the timer function directly. Cc: Johannes Thumshirn Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: fcoe-devel@open-fcoe.org Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. 
Petersen Acked-by: Johannes Thumshirn --- drivers/scsi/libfc/fc_fcp.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c index 772c35a5c49e..1a4e701a8449 100644 --- a/drivers/scsi/libfc/fc_fcp.c +++ b/drivers/scsi/libfc/fc_fcp.c @@ -97,7 +97,7 @@ static void fc_fcp_complete_locked(struct fc_fcp_pkt *); static void fc_tm_done(struct fc_seq *, struct fc_frame *, void *); static void fc_fcp_error(struct fc_fcp_pkt *, struct fc_frame *); static void fc_fcp_recovery(struct fc_fcp_pkt *, u8 code); -static void fc_fcp_timeout(unsigned long); +static void fc_fcp_timeout(struct timer_list *); static void fc_fcp_rec(struct fc_fcp_pkt *); static void fc_fcp_rec_error(struct fc_fcp_pkt *, struct fc_frame *); static void fc_fcp_rec_resp(struct fc_seq *, struct fc_frame *, void *); @@ -155,8 +155,7 @@ static struct fc_fcp_pkt *fc_fcp_pkt_alloc(struct fc_lport *lport, gfp_t gfp) fsp->lp = lport; fsp->xfer_ddp = FC_XID_UNKNOWN; refcount_set(&fsp->ref_cnt, 1); - init_timer(&fsp->timer); - fsp->timer.data = (unsigned long)fsp; + timer_setup(&fsp->timer, NULL, 0); INIT_LIST_HEAD(&fsp->list); spin_lock_init(&fsp->scsi_pkt_lock); } else { @@ -1215,7 +1214,7 @@ static int fc_fcp_cmd_send(struct fc_lport *lport, struct fc_fcp_pkt *fsp, fsp->seq_ptr = seq; fc_fcp_pkt_hold(fsp); /* hold for fc_fcp_pkt_destroy */ - setup_timer(&fsp->timer, fc_fcp_timeout, (unsigned long)fsp); + fsp->timer.function = (TIMER_FUNC_TYPE)fc_fcp_timeout; if (rpriv->flags & FC_RP_FLAGS_REC_SUPPORTED) fc_fcp_timer_set(fsp, get_fsp_rec_tov(fsp)); @@ -1298,9 +1297,9 @@ static int fc_fcp_pkt_abort(struct fc_fcp_pkt *fsp) * fc_lun_reset_send() - Send LUN reset command * @data: The FCP packet that identifies the LUN to be reset */ -static void fc_lun_reset_send(unsigned long data) +static void fc_lun_reset_send(struct timer_list *t) { - struct fc_fcp_pkt *fsp = (struct fc_fcp_pkt *)data; + struct fc_fcp_pkt *fsp = from_timer(fsp, t, timer); struct fc_lport *lport = fsp->lp; if (lport->tt.fcp_cmd_send(lport, fsp, fc_tm_done)) { @@ -1308,7 +1307,7 @@ static void fc_lun_reset_send(unsigned long data) return; if (fc_fcp_lock_pkt(fsp)) return; - setup_timer(&fsp->timer, fc_lun_reset_send, (unsigned long)fsp); + fsp->timer.function = (TIMER_FUNC_TYPE)fc_lun_reset_send; fc_fcp_timer_set(fsp, get_fsp_rec_tov(fsp)); fc_fcp_unlock_pkt(fsp); } @@ -1334,7 +1333,7 @@ static int fc_lun_reset(struct fc_lport *lport, struct fc_fcp_pkt *fsp, fsp->wait_for_comp = 1; init_completion(&fsp->tm_done); - fc_lun_reset_send((unsigned long)fsp); + fc_lun_reset_send(&fsp->timer); /* * wait for completion of reset @@ -1431,9 +1430,9 @@ static void fc_fcp_cleanup(struct fc_lport *lport) * received we see if data was received recently. If it has been then we * continue waiting, otherwise, we abort the command. 
*/ -static void fc_fcp_timeout(unsigned long data) +static void fc_fcp_timeout(struct timer_list *t) { - struct fc_fcp_pkt *fsp = (struct fc_fcp_pkt *)data; + struct fc_fcp_pkt *fsp = from_timer(fsp, t, timer); struct fc_rport *rport = fsp->rport; struct fc_rport_libfc_priv *rpriv = rport->dd_data; @@ -1446,7 +1445,7 @@ static void fc_fcp_timeout(unsigned long data) if (fsp->lp->qfull) { FC_FCP_DBG(fsp, "fcp timeout, resetting timer delay %d\n", fsp->timer_delay); - setup_timer(&fsp->timer, fc_fcp_timeout, (unsigned long)fsp); + fsp->timer.function = (TIMER_FUNC_TYPE)fc_fcp_timeout; fc_fcp_timer_set(fsp, fsp->timer_delay); goto unlock; } From 2c4b9637b69cab846d8c443bf6775356321950c4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2017 16:25:40 -0700 Subject: [PATCH 0634/1085] scsi: libiscsi: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Lee Duncan Cc: Chris Leech Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: open-iscsi@googlegroups.com Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen Reviewed-by: Chris Leech --- drivers/scsi/libiscsi.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index bd4605a34f54..af59192cc578 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1805,9 +1805,9 @@ int iscsi_target_alloc(struct scsi_target *starget) } EXPORT_SYMBOL_GPL(iscsi_target_alloc); -static void iscsi_tmf_timedout(unsigned long data) +static void iscsi_tmf_timedout(struct timer_list *t) { - struct iscsi_conn *conn = (struct iscsi_conn *)data; + struct iscsi_conn *conn = from_timer(conn, t, tmf_timer); struct iscsi_session *session = conn->session; spin_lock(&session->frwd_lock); @@ -1838,8 +1838,6 @@ static int iscsi_exec_task_mgmt_fn(struct iscsi_conn *conn, } conn->tmfcmd_pdus_cnt++; conn->tmf_timer.expires = timeout * HZ + jiffies; - conn->tmf_timer.function = iscsi_tmf_timedout; - conn->tmf_timer.data = (unsigned long)conn; add_timer(&conn->tmf_timer); ISCSI_DBG_EH(session, "tmf set timeout\n"); @@ -2089,9 +2087,9 @@ done: } EXPORT_SYMBOL_GPL(iscsi_eh_cmd_timed_out); -static void iscsi_check_transport_timeouts(unsigned long data) +static void iscsi_check_transport_timeouts(struct timer_list *t) { - struct iscsi_conn *conn = (struct iscsi_conn *)data; + struct iscsi_conn *conn = from_timer(conn, t, transport_timer); struct iscsi_session *session = conn->session; unsigned long recv_timeout, next_timeout = 0, last_recv; @@ -2913,9 +2911,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, conn->exp_statsn = 0; conn->tmf_state = TMF_INITIAL; - init_timer(&conn->transport_timer); - conn->transport_timer.data = (unsigned long)conn; - conn->transport_timer.function = iscsi_check_transport_timeouts; + timer_setup(&conn->transport_timer, iscsi_check_transport_timeouts, 0); INIT_LIST_HEAD(&conn->mgmtqueue); INIT_LIST_HEAD(&conn->cmdqueue); @@ -2939,7 +2935,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, goto login_task_data_alloc_fail; conn->login_task->data = conn->data = data; - init_timer(&conn->tmf_timer); + timer_setup(&conn->tmf_timer, iscsi_tmf_timedout, 0); init_waitqueue_head(&conn->ehwait); return cls_conn; From 74a0f5739292a636d337ad6f8d7cd0061636e0eb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 
11 Oct 2017 16:27:10 -0700 Subject: [PATCH 0635/1085] scsi: smartpqi: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "Martin K. Petersen" Cc: Don Brace Cc: "James E.J. Bottomley" Cc: esc.storagedev@microsemi.com Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen Acked-by: Don Brace --- drivers/scsi/smartpqi/smartpqi_init.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 83bdbd84eb01..90f6effc32b4 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -2860,11 +2860,12 @@ out: #define PQI_HEARTBEAT_TIMER_INTERVAL (10 * HZ) -static void pqi_heartbeat_timer_handler(unsigned long data) +static void pqi_heartbeat_timer_handler(struct timer_list *t) { int num_interrupts; u32 heartbeat_count; - struct pqi_ctrl_info *ctrl_info = (struct pqi_ctrl_info *)data; + struct pqi_ctrl_info *ctrl_info = from_timer(ctrl_info, t, + heartbeat_timer); pqi_check_ctrl_health(ctrl_info); if (pqi_ctrl_offline(ctrl_info)) @@ -2902,8 +2903,6 @@ static void pqi_start_heartbeat_timer(struct pqi_ctrl_info *ctrl_info) ctrl_info->heartbeat_timer.expires = jiffies + PQI_HEARTBEAT_TIMER_INTERVAL; - ctrl_info->heartbeat_timer.data = (unsigned long)ctrl_info; - ctrl_info->heartbeat_timer.function = pqi_heartbeat_timer_handler; add_timer(&ctrl_info->heartbeat_timer); } @@ -6465,7 +6464,7 @@ static struct pqi_ctrl_info *pqi_alloc_ctrl_info(int numa_node) INIT_DELAYED_WORK(&ctrl_info->rescan_work, pqi_rescan_worker); INIT_DELAYED_WORK(&ctrl_info->update_time_work, pqi_update_time_worker); - init_timer(&ctrl_info->heartbeat_timer); + timer_setup(&ctrl_info->heartbeat_timer, pqi_heartbeat_timer_handler, 0); INIT_WORK(&ctrl_info->ctrl_offline_work, pqi_ctrl_offline_worker); sema_init(&ctrl_info->sync_request_sem, From 69c71252298acad7a25e9499929f723790f48e26 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Oct 2017 09:51:13 -0300 Subject: [PATCH 0636/1085] perf script: Add a few missing conversions to fprintf style In a1a587073ccd ("perf script: Use fprintf like printing uniformly") there were a few cases that were missed, fix it. Reported-by: yuzhoujian Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-sq9hvfk5mkjdqzlpyiq7jkos@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index a3add2cd7856..db4ff1510dc5 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1447,7 +1447,7 @@ static void process_event(struct perf_script *script, perf_sample__fprintf_start(sample, thread, evsel, fp); if (PRINT_FIELD(PERIOD)) - printf("%10" PRIu64 " ", sample->period); + fprintf(fp, "%10" PRIu64 " ", sample->period); if (PRINT_FIELD(EVNAME)) { const char *evname = perf_evsel__name(evsel); @@ -1455,8 +1455,7 @@ static void process_event(struct perf_script *script, if (!script->name_width) script->name_width = perf_evlist__max_name_len(script->session->evlist); - printf("%*s: ", script->name_width, - evname ? 
evname : "[unknown]"); + fprintf(fp, "%*s: ", script->name_width, evname ?: "[unknown]"); } if (print_flags) @@ -1513,8 +1512,8 @@ static void process_event(struct perf_script *script, perf_sample__fprintf_insn(sample, attr, thread, machine, fp); if (PRINT_FIELD(PHYS_ADDR)) - printf("%16" PRIx64, sample->phys_addr); - printf("\n"); + fprintf(fp, "%16" PRIx64, sample->phys_addr); + fprintf(fp, "\n"); } static struct scripting_ops *scripting_ops; From 5ce2c5b4e484a87a8af48649775796fb349684db Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Oct 2017 09:55:22 -0300 Subject: [PATCH 0637/1085] perf script: Use pr_debug where appropriate We have facilities for reporting unexpected, unlikely errors, use them. Cc: Adrian Hunter Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Michael Ellerman Cc: Namhyung Kim Cc: Wang Nan Cc: yuzhoujian Link: http://lkml.kernel.org/n/tip-c7j22xfjf1j773g7ufp607q0@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index db4ff1510dc5..af0267072d43 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -769,27 +769,26 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, * but the exit is not. Let the caller patch it up. */ if (kernel != machine__kernel_ip(machine, end)) { - printf("\tblock %" PRIx64 "-%" PRIx64 " transfers between kernel and user\n", - start, end); + pr_debug("\tblock %" PRIx64 "-%" PRIx64 " transfers between kernel and user\n", start, end); return -ENXIO; } memset(&al, 0, sizeof(al)); if (end - start > MAXBB - MAXINSN) { if (last) - printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n", start, end); + pr_debug("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n", start, end); else - printf("\tblock %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n", start, end, end - start); + pr_debug("\tblock %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n", start, end, end - start); return 0; } thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al); if (!al.map || !al.map->dso) { - printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); + pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); return 0; } if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) { - printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); + pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); return 0; } @@ -802,7 +801,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, *is64bit = al.map->dso->is_64_bit; if (len <= 0) - printf("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n", + pr_debug("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n", start, end); return len; } From 894f3f1732cb4ae543d406e4fca1585e8c303a1c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Oct 2017 10:26:52 -0300 Subject: [PATCH 0638/1085] perf script: Use event_format__fprintf() Another case where we a1a587073ccd ("perf script: Use fprintf like printing uniformly") forgot to redirect output to the FILE descriptor, fix this too. 
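As with the two conversions before it, the point is that every printing helper takes an explicit FILE * instead of writing to stdout implicitly, which is what later lets the output stream be swapped per event. A minimal, self-contained sketch of the pattern (illustrative names only, not the perf sources; the GNU ?: shorthand is the same one the diff above uses):

	#include <stdio.h>

	/* Sketch: route all sample printing through a caller-chosen stream. */
	static void print_sample(FILE *fp, const char *evname, unsigned long period)
	{
		fprintf(fp, "%10lu ", period);
		fprintf(fp, "%s: ", evname ?: "[unknown]");	/* GNU ?: extension */
		fputc('\n', fp);
	}

	int main(void)
	{
		print_sample(stdout, "cycles", 12345UL);	/* today stdout; later a dump file */
		return 0;
	}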
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Cc: yuzhoujian Link: http://lkml.kernel.org/n/tip-jmwx4pgfezw98ezfoj9t957s@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index af0267072d43..b27f21638207 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1465,9 +1465,10 @@ static void process_event(struct perf_script *script, return; } - if (PRINT_FIELD(TRACE)) - event_format__print(evsel->tp_format, sample->cpu, - sample->raw_data, sample->raw_size); + if (PRINT_FIELD(TRACE)) { + event_format__fprintf(evsel->tp_format, sample->cpu, + sample->raw_data, sample->raw_size, fp); + } if (attr->type == PERF_TYPE_SYNTH && PRINT_FIELD(SYNTH)) perf_sample__fprintf_synth(sample, evsel, fp); From e669e833da8d71a02e175dfffceaabc3eb010207 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Oct 2017 14:22:34 -0300 Subject: [PATCH 0639/1085] perf evsel: Restore evsel->priv as a tool private area When we started using it for stats and did it not just in builtin-stat.c, but also for builtin-script.c, then it stopped being a tool private area, so introduce a new pointer for these stats and leave ->priv to its original purpose. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Cc: yuzhoujian Fixes: cfc8874a4859 ("perf script: Process cpu/threads maps") Link: http://lkml.kernel.org/n/tip-jtpzx3rjqo78snmmsdzwb2eb@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++-- tools/perf/util/evsel.h | 3 +++ tools/perf/util/stat.c | 16 ++++++++-------- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index dd525417880a..988bdfa5d832 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -845,7 +845,7 @@ static void print_noise(struct perf_evsel *evsel, double avg) if (run_count == 1) return; - ps = evsel->priv; + ps = evsel->stats; print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); } @@ -1432,7 +1432,7 @@ static void counter_aggr_cb(struct perf_evsel *counter, void *data, bool first __maybe_unused) { struct caggr_data *cd = data; - struct perf_stat_evsel *ps = counter->priv; + struct perf_stat_evsel *ps = counter->stats; cd->avg += avg_stats(&ps->res_stats[0]); cd->avg_enabled += avg_stats(&ps->res_stats[1]); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index db658785d828..64782b19089d 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -68,6 +68,8 @@ struct perf_evsel_config_term { } val; }; +struct perf_stat_evsel; + /** struct perf_evsel - event selector * * @evlist - evlist this evsel is in, if it is in one. 
@@ -101,6 +103,7 @@ struct perf_evsel { const char *unit; struct event_format *tp_format; off_t id_offset; + struct perf_stat_evsel *stats; void *priv; u64 db_id; struct cgroup_sel *cgrp; diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 35e9848734d6..933de91831fa 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -69,7 +69,7 @@ double rel_stddev_stats(double stddev, double avg) bool __perf_evsel_stat__is(struct perf_evsel *evsel, enum perf_stat_evsel_id id) { - struct perf_stat_evsel *ps = evsel->priv; + struct perf_stat_evsel *ps = evsel->stats; return ps->id == id; } @@ -93,7 +93,7 @@ static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = { void perf_stat_evsel_id_init(struct perf_evsel *evsel) { - struct perf_stat_evsel *ps = evsel->priv; + struct perf_stat_evsel *ps = evsel->stats; int i; /* ps->id is 0 hence PERF_STAT_EVSEL_ID__NONE by default */ @@ -109,7 +109,7 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel) static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) { int i; - struct perf_stat_evsel *ps = evsel->priv; + struct perf_stat_evsel *ps = evsel->stats; for (i = 0; i < 3; i++) init_stats(&ps->res_stats[i]); @@ -119,8 +119,8 @@ static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) { - evsel->priv = zalloc(sizeof(struct perf_stat_evsel)); - if (evsel->priv == NULL) + evsel->stats = zalloc(sizeof(struct perf_stat_evsel)); + if (evsel->stats == NULL) return -ENOMEM; perf_evsel__reset_stat_priv(evsel); return 0; @@ -128,11 +128,11 @@ static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) { - struct perf_stat_evsel *ps = evsel->priv; + struct perf_stat_evsel *ps = evsel->stats; if (ps) free(ps->group_data); - zfree(&evsel->priv); + zfree(&evsel->stats); } static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel, @@ -318,7 +318,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, struct perf_evsel *counter) { struct perf_counts_values *aggr = &counter->counts->aggr; - struct perf_stat_evsel *ps = counter->priv; + struct perf_stat_evsel *ps = counter->stats; u64 *count = counter->counts->aggr.values; u64 val; int i, ret; From a14390fde64e862c4fe8b92ecb293ee4b12d5bfe Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Oct 2017 10:30:20 -0300 Subject: [PATCH 0640/1085] perf script: Allow creating per-event dump files Introduce a new option to dump trace output to files named by the monitored events and update perf-script documentation accordingly. Shown below is output of perf script command with the newly introduced option. $ perf record -e cycles -e cs -ag -- sleep 1 $ perf script --per-event-dump $ ls perf.data.cycles.dump perf.data.cs.dump Without per-event-dump support, drawing flamegraphs for different events would require post processing to separate events. You can monitor only one event at a time if you want to get flamegraphs for different events. Using this option, you can get the trace output files named by the monitored events, and could draw flamegraphs according to the event's name. 
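Reduced to a hedged sketch, the mechanics are: one stream per event, named after the session file plus the event name, with stdout as the fallback when the option is off. This is simplified; the real code below iterates the evlist, parks each stream in evsel->priv and closes them all on teardown:

	#include <limits.h>	/* PATH_MAX */
	#include <stdio.h>

	static FILE *event_dump_stream(const char *session_path, const char *evname,
				       int per_event_dump)
	{
		char filename[PATH_MAX];

		if (!per_event_dump)
			return stdout;	/* traditional single-stream behaviour */

		/* e.g. "perf.data.cycles.dump" */
		snprintf(filename, sizeof(filename), "%s.%s.dump", session_path, evname);
		return fopen(filename, "w");	/* caller must fclose() non-stdout streams */
	}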
Based-on-a-patch-by: yuzhoujian Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/r/1508921599-10832-3-git-send-email-yuzhoujian@didichuxing.com Link: http://lkml.kernel.org/n/tip-8ngzsjdhgiovkupl3r5yy570@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 4 ++ tools/perf/builtin-script.c | 59 +++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 25e677344728..2811fcf684cb 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -325,6 +325,10 @@ include::itrace.txt[] Set the maximum number of program blocks to print with brstackasm for each sample. +--per-event-dump:: + Create per event files with a "perf.data.EVENT.dump" name instead of + printing to stdout, useful, for instance, for generating flamegraphs. + --inline:: If a callgraph address belongs to an inlined function, the inline stack will be printed. Each entry has function name and file/line. Enabled by diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b27f21638207..fb5e49b3bc44 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1392,6 +1392,7 @@ struct perf_script { bool show_switch_events; bool show_namespace_events; bool allocated; + bool per_event_dump; struct cpu_map *cpus; struct thread_map *threads; int name_width; @@ -1438,7 +1439,7 @@ static void process_event(struct perf_script *script, struct thread *thread = al->thread; struct perf_event_attr *attr = &evsel->attr; unsigned int type = output_type(attr->type); - FILE *fp = stdout; + FILE *fp = evsel->priv; if (output[type].fields == 0) return; @@ -1887,6 +1888,52 @@ static void sig_handler(int sig __maybe_unused) session_done = 1; } +static void perf_script__fclose_per_event_dump(struct perf_script *script) +{ + struct perf_evlist *evlist = script->session->evlist; + struct perf_evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (!evsel->priv) + break; + fclose(evsel->priv); + evsel->priv = NULL; + } +} + +static int perf_script__fopen_per_event_dump(struct perf_script *script) +{ + struct perf_evsel *evsel; + + evlist__for_each_entry(script->session->evlist, evsel) { + char filename[PATH_MAX]; + snprintf(filename, sizeof(filename), "%s.%s.dump", + script->session->file->path, perf_evsel__name(evsel)); + evsel->priv = fopen(filename, "w"); + if (evsel->priv == NULL) + goto out_err_fclose; + } + + return 0; + +out_err_fclose: + perf_script__fclose_per_event_dump(script); + return -1; +} + +static int perf_script__setup_per_event_dump(struct perf_script *script) +{ + struct perf_evsel *evsel; + + if (script->per_event_dump) + return perf_script__fopen_per_event_dump(script); + + evlist__for_each_entry(script->session->evlist, evsel) + evsel->priv = stdout; + + return 0; +} + static int __cmd_script(struct perf_script *script) { int ret; @@ -1908,8 +1955,16 @@ static int __cmd_script(struct perf_script *script) if (script->show_namespace_events) script->tool.namespaces = process_namespaces_event; + if (perf_script__setup_per_event_dump(script)) { + pr_err("Couldn't create the per event dump files\n"); + return -1; + } + ret = perf_session__process_events(script->session); + if (script->per_event_dump) + perf_script__fclose_per_event_dump(script); + if (debug_mode) pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered); @@ -2827,6 
+2882,8 @@ int cmd_script(int argc, const char **argv) "Show context switch events (if recorded)"), OPT_BOOLEAN('\0', "show-namespace-events", &script.show_namespace_events, "Show namespace events (if recorded)"), + OPT_BOOLEAN('\0', "per-event-dump", &script.per_event_dump, + "Dump trace output to files named by the monitored events"), OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"), OPT_INTEGER(0, "max-blocks", &max_blocks, "Maximum number of code blocks to dump with brstackinsn"), From 433727948904301b01f1d5ebf39893c96cd4bab7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Oct 2017 14:41:01 -0300 Subject: [PATCH 0641/1085] tools include uapi: Grab a copy of linux/prctl.h We will use it to generate tables for beautifying prctl's 'option' arg and some of the others eventually. Cc: Andy Lutomirski Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-cg8mpmz4hk9nfih685emnbk9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/prctl.h | 200 +++++++++++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + 2 files changed, 201 insertions(+) create mode 100644 tools/include/uapi/linux/prctl.h diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h new file mode 100644 index 000000000000..a8d0759a9e40 --- /dev/null +++ b/tools/include/uapi/linux/prctl.h @@ -0,0 +1,200 @@ +#ifndef _LINUX_PRCTL_H +#define _LINUX_PRCTL_H + +#include + +/* Values to pass as first argument to prctl() */ + +#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ +#define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */ + +/* Get/set current->mm->dumpable */ +#define PR_GET_DUMPABLE 3 +#define PR_SET_DUMPABLE 4 + +/* Get/set unaligned access control bits (if meaningful) */ +#define PR_GET_UNALIGN 5 +#define PR_SET_UNALIGN 6 +# define PR_UNALIGN_NOPRINT 1 /* silently fix up unaligned user accesses */ +# define PR_UNALIGN_SIGBUS 2 /* generate SIGBUS on unaligned user access */ + +/* Get/set whether or not to drop capabilities on setuid() away from + * uid 0 (as per security/commoncap.c) */ +#define PR_GET_KEEPCAPS 7 +#define PR_SET_KEEPCAPS 8 + +/* Get/set floating-point emulation control bits (if meaningful) */ +#define PR_GET_FPEMU 9 +#define PR_SET_FPEMU 10 +# define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */ +# define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */ + +/* Get/set floating-point exception mode (if meaningful) */ +#define PR_GET_FPEXC 11 +#define PR_SET_FPEXC 12 +# define PR_FP_EXC_SW_ENABLE 0x80 /* Use FPEXC for FP exception enables */ +# define PR_FP_EXC_DIV 0x010000 /* floating point divide by zero */ +# define PR_FP_EXC_OVF 0x020000 /* floating point overflow */ +# define PR_FP_EXC_UND 0x040000 /* floating point underflow */ +# define PR_FP_EXC_RES 0x080000 /* floating point inexact result */ +# define PR_FP_EXC_INV 0x100000 /* floating point invalid operation */ +# define PR_FP_EXC_DISABLED 0 /* FP exceptions disabled */ +# define PR_FP_EXC_NONRECOV 1 /* async non-recoverable exc. 
mode */ +# define PR_FP_EXC_ASYNC 2 /* async recoverable exception mode */ +# define PR_FP_EXC_PRECISE 3 /* precise exception mode */ + +/* Get/set whether we use statistical process timing or accurate timestamp + * based process timing */ +#define PR_GET_TIMING 13 +#define PR_SET_TIMING 14 +# define PR_TIMING_STATISTICAL 0 /* Normal, traditional, + statistical process timing */ +# define PR_TIMING_TIMESTAMP 1 /* Accurate timestamp based + process timing */ + +#define PR_SET_NAME 15 /* Set process name */ +#define PR_GET_NAME 16 /* Get process name */ + +/* Get/set process endian */ +#define PR_GET_ENDIAN 19 +#define PR_SET_ENDIAN 20 +# define PR_ENDIAN_BIG 0 +# define PR_ENDIAN_LITTLE 1 /* True little endian mode */ +# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */ + +/* Get/set process seccomp mode */ +#define PR_GET_SECCOMP 21 +#define PR_SET_SECCOMP 22 + +/* Get/set the capability bounding set (as per security/commoncap.c) */ +#define PR_CAPBSET_READ 23 +#define PR_CAPBSET_DROP 24 + +/* Get/set the process' ability to use the timestamp counter instruction */ +#define PR_GET_TSC 25 +#define PR_SET_TSC 26 +# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */ +# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */ + +/* Get/set securebits (as per security/commoncap.c) */ +#define PR_GET_SECUREBITS 27 +#define PR_SET_SECUREBITS 28 + +/* + * Get/set the timerslack as used by poll/select/nanosleep + * A value of 0 means "use default" + */ +#define PR_SET_TIMERSLACK 29 +#define PR_GET_TIMERSLACK 30 + +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 + +/* + * Set early/late kill mode for hwpoison memory corruption. + * This influences when the process gets killed on a memory corruption. + */ +#define PR_MCE_KILL 33 +# define PR_MCE_KILL_CLEAR 0 +# define PR_MCE_KILL_SET 1 + +# define PR_MCE_KILL_LATE 0 +# define PR_MCE_KILL_EARLY 1 +# define PR_MCE_KILL_DEFAULT 2 + +#define PR_MCE_KILL_GET 34 + +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 +# define PR_SET_MM_ARG_START 8 +# define PR_SET_MM_ARG_END 9 +# define PR_SET_MM_ENV_START 10 +# define PR_SET_MM_ENV_END 11 +# define PR_SET_MM_AUXV 12 +# define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. + */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; + +/* + * Set specific pid that is allowed to ptrace the current task. + * A value of 0 mean "no process". 
+ */ +#define PR_SET_PTRACER 0x59616d61 +# define PR_SET_PTRACER_ANY ((unsigned long)-1) + +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 + +/* + * If no_new_privs is set, then operations that grant new privileges (i.e. + * execve) will either fail or not grant them. This affects suid/sgid, + * file capabilities, and LSMs. + * + * Operations that merely manipulate or drop existing privileges (setresuid, + * capset, etc.) will still work. Drop those privileges if you want them gone. + * + * Changing LSM security domain is considered a new privilege. So, for example, + * asking selinux for a specific new context (e.g. with runcon) will result + * in execve returning -EPERM. + * + * See Documentation/prctl/no_new_privs.txt for more details. + */ +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 + +#define PR_GET_TID_ADDRESS 40 + +#define PR_SET_THP_DISABLE 41 +#define PR_GET_THP_DISABLE 42 + +/* + * Tell the kernel to start/stop helping userspace manage bounds tables. + */ +#define PR_MPX_ENABLE_MANAGEMENT 43 +#define PR_MPX_DISABLE_MANAGEMENT 44 + +#define PR_SET_FP_MODE 45 +#define PR_GET_FP_MODE 46 +# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ +# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ + +/* Control the ambient capability set */ +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 + +#endif /* _LINUX_PRCTL_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 322629423b49..8d8b37198666 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -6,6 +6,7 @@ include/uapi/drm/i915_drm.h include/uapi/linux/fcntl.h include/uapi/linux/kvm.h include/uapi/linux/perf_event.h +include/uapi/linux/prctl.h include/uapi/linux/sched.h include/uapi/linux/stat.h include/uapi/linux/vhost.h From d688d0376c6eb452565c16c95b26cd2c95aa8a82 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Oct 2017 15:19:35 -0300 Subject: [PATCH 0642/1085] perf trace beauty prctl: Generate 'option' string table from kernel headers This is one more case where the way that syscall parameter values are defined in kernel headers are easy to parse using a shell script that will then generate the string table that gets used by the prctl 'option' argument beautifier. This way as soon as the header syncronization mechanism in perf's build system detects a change in a copy of a kernel ABI header and that file is syncronized, we get 'perf trace' updated automagically. Further work needed for the PR_SET_ values, as well for using eBPF to copy the non-integer arguments to/from the kernel. 
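Conceptually, the generated file is just a string table indexed by the option value, which the beautifier consults before falling back to printing the raw number. A hedged sketch with a handful of entries (the real table is emitted by prctl_option.sh from the copied prctl.h and covers all PR_GET_*/PR_SET_* values):

	#include <stdio.h>

	/* Abbreviated stand-in for the generated prctl_option_array.c table. */
	static const char *prctl_options[] = {
		[1]  = "SET_PDEATHSIG",
		[2]  = "GET_PDEATHSIG",
		[15] = "SET_NAME",
		[22] = "SET_SECCOMP",
	};

	static int scnprintf_prctl_option(char *bf, size_t size, unsigned long option)
	{
		const unsigned long nr = sizeof(prctl_options) / sizeof(prctl_options[0]);

		if (option < nr && prctl_options[option])
			return snprintf(bf, size, "%s", prctl_options[option]);

		return snprintf(bf, size, "%lu", option);	/* unknown option: raw value */
	}

With such a table in place, option values decode to the names seen in the session below.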
E.g.: System wide prctl tracing: # perf trace -e prctl 1668.028 ( 0.025 ms): TaskSchedulerR/10649 prctl(option: SET_NAME, arg2: 0x2b61d5db15d0) = 0 3365.663 ( 0.018 ms): chrome/10650 prctl(option: SET_SECCOMP, arg2: 2, arg4: 8 ) = -1 EFAULT Bad address 3366.585 ( 0.010 ms): chrome/10650 prctl(option: SET_NO_NEW_PRIVS, arg2: 1 ) = 0 3367.173 ( 0.009 ms): TaskSchedulerR/10652 prctl(option: SET_NAME, arg2: 0x2b61d2aaa300) = 0 3367.222 ( 0.003 ms): TaskSchedulerR/10653 prctl(option: SET_NAME, arg2: 0x2b61d2aaa1e0) = 0 3367.244 ( 0.002 ms): TaskSchedulerR/10654 prctl(option: SET_NAME, arg2: 0x2b61d2aaa0c0) = 0 3367.265 ( 0.002 ms): TaskSchedulerR/10655 prctl(option: SET_NAME, arg2: 0x2b61d2ac7f90) = 0 3367.281 ( 0.002 ms): Chrome_ChildIO/10656 prctl(option: SET_NAME, arg2: 0x7efbe406bb11) = 0 3367.220 ( 0.004 ms): TaskSchedulerS/10651 prctl(option: SET_NAME, arg2: 0x2b61d2ac1be0) = 0 3370.906 ( 0.010 ms): GpuMemoryThrea/10657 prctl(option: SET_NAME, arg2: 0x7efbe386ab11) = 0 3370.983 ( 0.003 ms): File/10658 prctl(option: SET_NAME, arg2: 0x7efbe3069b11 ) = 0 3384.272 ( 0.020 ms): Compositor/10659 prctl(option: SET_NAME, arg2: 0x7efbe2868b11 ) = 0 3612.091 ( 0.012 ms): DOM Worker/11489 prctl(option: SET_NAME, arg2: 0x7f49ab97ebf2 ) = 0 4512.437 ( 0.004 ms): (sa1)/11490 prctl(option: SET_NAME, arg2: 0x7ffca15af844 ) = 0 4512.468 ( 0.002 ms): (sa1)/11490 prctl(option: SET_MM, arg2: ARG_START, arg3: 0x7f5cb7c81000) = 0 4512.472 ( 0.001 ms): (sa1)/11490 prctl(option: SET_MM, arg2: ARG_END, arg3: 0x7f5cb7c81006) = 0 4514.667 ( 0.002 ms): (sa1)/11490 prctl(option: GET_SECUREBITS ) = 0 Cc: Adrian Hunter Cc: Andy Lutomirski Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-q0s2uw579o5ei6xlh2zjirgz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 13 +++- tools/perf/builtin-trace.c | 5 +- tools/perf/trace/beauty/Build | 1 + tools/perf/trace/beauty/beauty.h | 9 +++ tools/perf/trace/beauty/prctl.c | 82 +++++++++++++++++++++++++ tools/perf/trace/beauty/prctl_option.sh | 17 +++++ 6 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 tools/perf/trace/beauty/prctl.c create mode 100755 tools/perf/trace/beauty/prctl_option.sh diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 5f7408118a2d..c872ca607b39 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -448,6 +448,13 @@ madvise_behavior_tbl := $(srctree)/tools/perf/trace/beauty/madvise_behavior.sh $(madvise_behavior_array): $(madvise_hdr_dir)/mman-common.h $(madvise_behavior_tbl) $(Q)$(SHELL) '$(madvise_behavior_tbl)' $(madvise_hdr_dir) > $@ +prctl_option_array := $(beauty_outdir)/prctl_option_array.c +prctl_hdr_dir := $(srctree)/tools/include/uapi/linux/ +prctl_option_tbl := $(srctree)/tools/perf/trace/beauty/prctl_option.sh + +$(prctl_option_array): $(prctl_hdr_dir)/prctl.h $(prctl_option_tbl) + $(Q)$(SHELL) '$(prctl_option_tbl)' $(prctl_hdr_dir) > $@ + all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) $(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST) @@ -549,7 +556,8 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioc $(kvm_ioctl_array) \ $(vhost_virtio_ioctl_array) \ $(madvise_behavior_array) \ - $(perf_ioctl_array) + $(perf_ioctl_array) \ + $(prctl_option_array) $(OUTPUT)%.o: %.c prepare FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ @@ -829,7 +837,8 @@ clean:: $(LIBTRACEEVENT)-clean 
$(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea $(OUTPUT)$(sndrv_pcm_ioctl_array) \ $(OUTPUT)$(kvm_ioctl_array) \ $(OUTPUT)$(vhost_virtio_ioctl_array) \ - $(OUTPUT)$(perf_ioctl_array) + $(OUTPUT)$(perf_ioctl_array) \ + $(OUTPUT)$(prctl_option_array) $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean # diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8b23982dd9f2..78855916f4b0 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -578,7 +578,6 @@ static struct syscall_fmt { } syscall_fmts[] = { { .name = "access", .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, - { .name = "arch_prctl", .alias = "prctl", }, { .name = "bpf", .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, { .name = "brk", .hexret = true, @@ -703,6 +702,10 @@ static struct syscall_fmt { [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, }, { .name = "poll", .timeout = true, }, { .name = "ppoll", .timeout = true, }, + { .name = "prctl", .alias = "arch_prctl", + .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ }, + [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ }, + [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, }, { .name = "pread", .alias = "pread64", }, { .name = "preadv", .alias = "pread", }, { .name = "prlimit64", diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build index 175d633c6b49..2f68b76ec39b 100644 --- a/tools/perf/trace/beauty/Build +++ b/tools/perf/trace/beauty/Build @@ -4,4 +4,5 @@ ifeq ($(SRCARCH),$(filter $(SRCARCH),x86)) libperf-y += ioctl.o endif libperf-y += pkey_alloc.o +libperf-y += prctl.o libperf-y += statx.o diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 4b58581a6053..b29f94ef5d6a 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -84,6 +84,15 @@ size_t syscall_arg__scnprintf_pkey_alloc_access_rights(char *bf, size_t size, st size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, struct syscall_arg *arg); #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags +size_t syscall_arg__scnprintf_prctl_option(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_PRCTL_OPTION syscall_arg__scnprintf_prctl_option + +size_t syscall_arg__scnprintf_prctl_arg2(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_PRCTL_ARG2 syscall_arg__scnprintf_prctl_arg2 + +size_t syscall_arg__scnprintf_prctl_arg3(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_PRCTL_ARG3 syscall_arg__scnprintf_prctl_arg3 + size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg); #define SCA_STATX_FLAGS syscall_arg__scnprintf_statx_flags diff --git a/tools/perf/trace/beauty/prctl.c b/tools/perf/trace/beauty/prctl.c new file mode 100644 index 000000000000..246130dad6c4 --- /dev/null +++ b/tools/perf/trace/beauty/prctl.c @@ -0,0 +1,82 @@ +/* + * trace/beauty/prctl.c + * + * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo + * + * Released under the GPL v2. 
(and only v2, not any later version) + */ + +#include "trace/beauty/beauty.h" +#include +#include + +#include "trace/beauty/generated/prctl_option_array.c" + +static size_t prctl__scnprintf_option(int option, char *bf, size_t size) +{ + static DEFINE_STRARRAY(prctl_options); + return strarray__scnprintf(&strarray__prctl_options, bf, size, "%d", option); +} + +static size_t prctl__scnprintf_set_mm(int option, char *bf, size_t size) +{ + static DEFINE_STRARRAY(prctl_set_mm_options); + return strarray__scnprintf(&strarray__prctl_set_mm_options, bf, size, "%d", option); +} + +size_t syscall_arg__scnprintf_prctl_arg2(char *bf, size_t size, struct syscall_arg *arg) +{ + int option = syscall_arg__val(arg, 0); + + if (option == PR_SET_MM) + return prctl__scnprintf_set_mm(arg->val, bf, size); + /* + * We still don't grab the contents of pointers on entry or exit, + * so just print them as hex numbers + */ + if (option == PR_SET_NAME) + return syscall_arg__scnprintf_hex(bf, size, arg); + + return syscall_arg__scnprintf_long(bf, size, arg); +} + +size_t syscall_arg__scnprintf_prctl_arg3(char *bf, size_t size, struct syscall_arg *arg) +{ + int option = syscall_arg__val(arg, 0); + + if (option == PR_SET_MM) + return syscall_arg__scnprintf_hex(bf, size, arg); + + return syscall_arg__scnprintf_long(bf, size, arg); +} + +size_t syscall_arg__scnprintf_prctl_option(char *bf, size_t size, struct syscall_arg *arg) +{ + unsigned long option = arg->val; + enum { + SPO_ARG2 = (1 << 1), + SPO_ARG3 = (1 << 2), + SPO_ARG4 = (1 << 3), + SPO_ARG5 = (1 << 4), + SPO_ARG6 = (1 << 5), + }; + const u8 all_but2 = SPO_ARG3 | SPO_ARG4 | SPO_ARG5 | SPO_ARG6; + const u8 all = SPO_ARG2 | all_but2; + const u8 masks[] = { + [PR_GET_DUMPABLE] = all, + [PR_SET_DUMPABLE] = all_but2, + [PR_SET_NAME] = all_but2, + [PR_GET_CHILD_SUBREAPER] = all_but2, + [PR_SET_CHILD_SUBREAPER] = all_but2, + [PR_GET_SECUREBITS] = all, + [PR_SET_SECUREBITS] = all_but2, + [PR_SET_MM] = SPO_ARG4 | SPO_ARG5 | SPO_ARG6, + [PR_GET_PDEATHSIG] = all, + [PR_SET_PDEATHSIG] = all_but2, + }; + + if (option < ARRAY_SIZE(masks)) + arg->mask |= masks[option]; + + return prctl__scnprintf_option(option, bf, size); +} diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh new file mode 100755 index 000000000000..0be4138fbe71 --- /dev/null +++ b/tools/perf/trace/beauty/prctl_option.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +header_dir=$1 + +printf "static const char *prctl_options[] = {\n" +regex='^#define[[:space:]]+PR_([GS]ET\w+)[[:space:]]*([[:xdigit:]]+).*' +egrep $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \ + sed -r "s/$regex/\2 \1/g" | \ + sort -n | xargs printf "\t[%s] = \"%s\",\n" +printf "};\n" + +printf "static const char *prctl_set_mm_options[] = {\n" +regex='^#[[:space:]]+define[[:space:]]+PR_SET_MM_(\w+)[[:space:]]*([[:digit:]]+).*' +egrep $regex ${header_dir}/prctl.h | \ + sed -r "s/$regex/\2 \1/g" | \ + sort -n | xargs printf "\t[%s] = \"%s\",\n" +printf "};\n" From 250a53d6fcd86012935d1cf71eb2e3d6e88c412c Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 27 Oct 2017 10:34:33 +0200 Subject: [PATCH 0643/1085] genirq: Document vcpu_info usage for percpu_devid interrupts It is currently unclear how to set the VCPU affinity for a percpu_devid interrupt , since the Linux irq_data structure describes the state for multiple interrupts, one for each physical CPU on the system. 
Since each such interrupt can be associated with different VCPUs or none at all, associating a single VCPU state with such an interrupt does not capture the necessary semantics. The implementers of irq_set_affinity are the Intel and AMD IOMMUs, and the ARM GIC irqchip. The Intel and AMD callers do not appear to use percpu_devid interrupts, and the ARM GIC implementation only checks the pointer against NULL vs. non-NULL. Therefore, simply update the function documentation to explain the expected use in the context of percpu_devid interrupts, allowing future changes or additions to irqchip implementers to do the right thing. Signed-off-by: Christoffer Dall Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: kvm@vger.kernel.org Cc: Catalin Marinas Cc: Will Deacon Cc: Eric Auger Cc: kvmarm@lists.cs.columbia.edu Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r/1509093281-15225-13-git-send-email-cdall@linaro.org --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e667912d0e9c..c65f282cbc9a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -381,7 +381,8 @@ int irq_select_affinity_usr(unsigned int irq) /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity - * @vcpu_info: vCPU specific data + * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU + * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU * affinity for an irq. The vCPU specific data is passed from From f48729a999ee57b9e831245779e68200dd2bde09 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 23 Oct 2017 11:58:37 +0200 Subject: [PATCH 0644/1085] clocksource/drivers/timer-of: Add timer_of_exit function The timer-of API does not provide a function to undo what has been done by the timer_of_init() function. Add a timer_of_exit() function. Signed-off-by: Benjamin Gaignard Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-of.c | 12 ++++++++++++ drivers/clocksource/timer-of.h | 3 +++ 2 files changed, 15 insertions(+) diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c index c79122d8e10d..7c64a5c1bfc1 100644 --- a/drivers/clocksource/timer-of.c +++ b/drivers/clocksource/timer-of.c @@ -176,3 +176,15 @@ out_fail: timer_base_exit(&to->of_base); return ret; } + +void timer_of_exit(struct timer_of *to) +{ + if (to->flags & TIMER_OF_IRQ) + timer_irq_exit(&to->of_irq); + + if (to->flags & TIMER_OF_CLOCK) + timer_clk_exit(&to->of_clk); + + if (to->flags & TIMER_OF_BASE) + timer_base_exit(&to->of_base); +} diff --git a/drivers/clocksource/timer-of.h b/drivers/clocksource/timer-of.h index e0d727255f72..44f57e02725d 100644 --- a/drivers/clocksource/timer-of.h +++ b/drivers/clocksource/timer-of.h @@ -66,4 +66,7 @@ static inline unsigned long timer_of_period(struct timer_of *to) extern int __init timer_of_init(struct device_node *np, struct timer_of *to); + +extern void timer_of_exit(struct timer_of *to); + #endif From 6dcf2fb5e8db3704f50af1f198256cb4e2453f8b Mon Sep 17 00:00:00 2001 From: "Edward A. James" Date: Fri, 27 Oct 2017 11:55:05 -0500 Subject: [PATCH 0645/1085] hwmon: (pmbus/core) Prevent unintentional setting of page to 0xFF The pmbus core may call read/write word data functions with a page value of -1, intending to perform the operation without setting the page. 
However, the read/write word data functions accept only unsigned 8-bit page numbers, and therefore cannot check for negative page number to avoid setting the page. This results in setting the page number to 0xFF. This may result in errors or undefined behavior of some devices (specifically the ir35221, which allows the page to be set to 0xFF, but some subsequent operations to read registers may fail). Switch the pmbus_set_page page parameter to an integer and perform the check for negative page there. Make read/write functions consistent in accepting an integer page number parameter. Signed-off-by: Edward A. James Fixes: cbcdec6202c9 ("hwmon: (pmbus): Access word data for STATUS_WORD") Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/pmbus.h | 6 +++--- drivers/hwmon/pmbus/pmbus_core.c | 25 +++++++++++-------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/hwmon/pmbus/pmbus.h b/drivers/hwmon/pmbus/pmbus.h index 4efa2bd4f6d8..fa613bd209e3 100644 --- a/drivers/hwmon/pmbus/pmbus.h +++ b/drivers/hwmon/pmbus/pmbus.h @@ -404,9 +404,9 @@ extern const struct regulator_ops pmbus_regulator_ops; /* Function declarations */ void pmbus_clear_cache(struct i2c_client *client); -int pmbus_set_page(struct i2c_client *client, u8 page); -int pmbus_read_word_data(struct i2c_client *client, u8 page, u8 reg); -int pmbus_write_word_data(struct i2c_client *client, u8 page, u8 reg, u16 word); +int pmbus_set_page(struct i2c_client *client, int page); +int pmbus_read_word_data(struct i2c_client *client, int page, u8 reg); +int pmbus_write_word_data(struct i2c_client *client, int page, u8 reg, u16 word); int pmbus_read_byte_data(struct i2c_client *client, int page, u8 reg); int pmbus_write_byte(struct i2c_client *client, int page, u8 value); int pmbus_write_byte_data(struct i2c_client *client, int page, u8 reg, diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c index 302f0aef59de..52a58b8b6e1b 100644 --- a/drivers/hwmon/pmbus/pmbus_core.c +++ b/drivers/hwmon/pmbus/pmbus_core.c @@ -136,13 +136,13 @@ void pmbus_clear_cache(struct i2c_client *client) } EXPORT_SYMBOL_GPL(pmbus_clear_cache); -int pmbus_set_page(struct i2c_client *client, u8 page) +int pmbus_set_page(struct i2c_client *client, int page) { struct pmbus_data *data = i2c_get_clientdata(client); int rv = 0; int newpage; - if (page != data->currpage) { + if (page >= 0 && page != data->currpage) { rv = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); newpage = i2c_smbus_read_byte_data(client, PMBUS_PAGE); if (newpage != page) @@ -158,11 +158,9 @@ int pmbus_write_byte(struct i2c_client *client, int page, u8 value) { int rv; - if (page >= 0) { - rv = pmbus_set_page(client, page); - if (rv < 0) - return rv; - } + rv = pmbus_set_page(client, page); + if (rv < 0) + return rv; return i2c_smbus_write_byte(client, value); } @@ -186,7 +184,8 @@ static int _pmbus_write_byte(struct i2c_client *client, int page, u8 value) return pmbus_write_byte(client, page, value); } -int pmbus_write_word_data(struct i2c_client *client, u8 page, u8 reg, u16 word) +int pmbus_write_word_data(struct i2c_client *client, int page, u8 reg, + u16 word) { int rv; @@ -219,7 +218,7 @@ static int _pmbus_write_word_data(struct i2c_client *client, int page, int reg, return pmbus_write_word_data(client, page, reg, word); } -int pmbus_read_word_data(struct i2c_client *client, u8 page, u8 reg) +int pmbus_read_word_data(struct i2c_client *client, int page, u8 reg) { int rv; @@ -255,11 +254,9 @@ int pmbus_read_byte_data(struct i2c_client 
*client, int page, u8 reg) { int rv; - if (page >= 0) { - rv = pmbus_set_page(client, page); - if (rv < 0) - return rv; - } + rv = pmbus_set_page(client, page); + if (rv < 0) + return rv; return i2c_smbus_read_byte_data(client, reg); } From fc483a9bf795ebf49bf0c872d125391d8fee2f3f Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 19 Sep 2017 14:59:34 +0200 Subject: [PATCH 0646/1085] hwmon: (stts751) Fix buffer size passed to snprintf Function snprintf already cares for the terminating NUL at the end of the string, the caller doesn't need to do it. Signed-off-by: Jean Delvare Cc: Andrea Merello Cc: Guenter Roeck Signed-off-by: Guenter Roeck --- drivers/hwmon/stts751.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/hwmon/stts751.c b/drivers/hwmon/stts751.c index 3f940fb67dc6..7fe152d92350 100644 --- a/drivers/hwmon/stts751.c +++ b/drivers/hwmon/stts751.c @@ -396,7 +396,7 @@ static ssize_t show_max_alarm(struct device *dev, struct device_attribute *attr, if (ret < 0) return ret; - return snprintf(buf, PAGE_SIZE - 1, "%d\n", priv->max_alert); + return snprintf(buf, PAGE_SIZE, "%d\n", priv->max_alert); } static ssize_t show_min_alarm(struct device *dev, struct device_attribute *attr, @@ -413,7 +413,7 @@ static ssize_t show_min_alarm(struct device *dev, struct device_attribute *attr, if (ret < 0) return ret; - return snprintf(buf, PAGE_SIZE - 1, "%d\n", priv->min_alert); + return snprintf(buf, PAGE_SIZE, "%d\n", priv->min_alert); } static ssize_t show_input(struct device *dev, struct device_attribute *attr, @@ -428,7 +428,7 @@ static ssize_t show_input(struct device *dev, struct device_attribute *attr, if (ret < 0) return ret; - return snprintf(buf, PAGE_SIZE - 1, "%d\n", priv->temp); + return snprintf(buf, PAGE_SIZE, "%d\n", priv->temp); } static ssize_t show_therm(struct device *dev, struct device_attribute *attr, @@ -436,7 +436,7 @@ static ssize_t show_therm(struct device *dev, struct device_attribute *attr, { struct stts751_priv *priv = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE - 1, "%d\n", priv->therm); + return snprintf(buf, PAGE_SIZE, "%d\n", priv->therm); } static ssize_t set_therm(struct device *dev, struct device_attribute *attr, @@ -478,7 +478,7 @@ static ssize_t show_hyst(struct device *dev, struct device_attribute *attr, { struct stts751_priv *priv = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE - 1, "%d\n", priv->hyst); + return snprintf(buf, PAGE_SIZE, "%d\n", priv->hyst); } static ssize_t set_hyst(struct device *dev, struct device_attribute *attr, @@ -518,7 +518,7 @@ static ssize_t show_therm_trip(struct device *dev, if (ret < 0) return ret; - return snprintf(buf, PAGE_SIZE - 1, "%d\n", priv->therm_trip); + return snprintf(buf, PAGE_SIZE, "%d\n", priv->therm_trip); } static ssize_t show_max(struct device *dev, struct device_attribute *attr, @@ -526,7 +526,7 @@ static ssize_t show_max(struct device *dev, struct device_attribute *attr, { struct stts751_priv *priv = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE - 1, "%d\n", priv->event_max); + return snprintf(buf, PAGE_SIZE, "%d\n", priv->event_max); } static ssize_t set_max(struct device *dev, struct device_attribute *attr, @@ -560,7 +560,7 @@ static ssize_t show_min(struct device *dev, struct device_attribute *attr, { struct stts751_priv *priv = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE - 1, "%d\n", priv->event_min); + return snprintf(buf, PAGE_SIZE, "%d\n", priv->event_min); } static ssize_t set_min(struct device *dev, struct 
device_attribute *attr, @@ -594,7 +594,7 @@ static ssize_t show_interval(struct device *dev, struct device_attribute *attr, { struct stts751_priv *priv = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE - 1, "%d\n", + return snprintf(buf, PAGE_SIZE, "%d\n", stts751_intervals[priv->interval]); } From 68546abf7a3a63f199e53d6dcaa7375df37a6aaa Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 4 Sep 2017 18:33:53 -0700 Subject: [PATCH 0647/1085] hwmon: (k10temp) Move chip specific code into probe function Introduce a local data structure and determine the temperature read function at probe time to reduce runtime complexity. Signed-off-by: Guenter Roeck --- drivers/hwmon/k10temp.c | 60 ++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index ce3b91f22e30..fc8076c7e1a1 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -61,31 +61,44 @@ static DEFINE_MUTEX(nb_smu_ind_mutex); */ #define F15H_M60H_REPORTED_TEMP_CTRL_OFFSET 0xd8200ca4 -static void amd_nb_smu_index_read(struct pci_dev *pdev, unsigned int devfn, - int offset, u32 *val) +struct k10temp_data { + struct pci_dev *pdev; + void (*read_tempreg)(struct pci_dev *pdev, u32 *regval); +}; + +static void read_tempreg_pci(struct pci_dev *pdev, u32 *regval) +{ + pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, regval); +} + +static void amd_nb_index_read(struct pci_dev *pdev, unsigned int devfn, + unsigned int base, int offset, u32 *val) { mutex_lock(&nb_smu_ind_mutex); pci_bus_write_config_dword(pdev->bus, devfn, - 0xb8, offset); + base, offset); pci_bus_read_config_dword(pdev->bus, devfn, - 0xbc, val); + base + 4, val); mutex_unlock(&nb_smu_ind_mutex); } +static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval) +{ + amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8, + F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval); +} + static ssize_t temp1_input_show(struct device *dev, struct device_attribute *attr, char *buf) { + struct k10temp_data *data = dev_get_drvdata(dev); u32 regval; - struct pci_dev *pdev = dev_get_drvdata(dev); + unsigned int temp; - if (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model == 0x60) { - amd_nb_smu_index_read(pdev, PCI_DEVFN(0, 0), - F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, - ®val); - } else { - pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, ®val); - } - return sprintf(buf, "%u\n", (regval >> 21) * 125); + data->read_tempreg(data->pdev, ®val); + temp = (regval >> 21) * 125; + + return sprintf(buf, "%u\n", temp); } static ssize_t temp1_max_show(struct device *dev, @@ -98,11 +111,12 @@ static ssize_t show_temp_crit(struct device *dev, struct device_attribute *devattr, char *buf) { struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); + struct k10temp_data *data = dev_get_drvdata(dev); int show_hyst = attr->index; u32 regval; int value; - pci_read_config_dword(dev_get_drvdata(dev), + pci_read_config_dword(data->pdev, REG_HARDWARE_THERMAL_CONTROL, ®val); value = ((regval >> 16) & 0x7f) * 500 + 52000; if (show_hyst) @@ -119,7 +133,8 @@ static umode_t k10temp_is_visible(struct kobject *kobj, struct attribute *attr, int index) { struct device *dev = container_of(kobj, struct device, kobj); - struct pci_dev *pdev = dev_get_drvdata(dev); + struct k10temp_data *data = dev_get_drvdata(dev); + struct pci_dev *pdev = data->pdev; if (index >= 2) { u32 reg_caps, reg_htc; @@ -187,6 +202,7 @@ static int k10temp_probe(struct pci_dev *pdev, { int unreliable = has_erratum_319(pdev); 
struct device *dev = &pdev->dev; + struct k10temp_data *data; struct device *hwmon_dev; if (unreliable) { @@ -199,7 +215,19 @@ static int k10temp_probe(struct pci_dev *pdev, "unreliable CPU thermal sensor; check erratum 319\n"); } - hwmon_dev = devm_hwmon_device_register_with_groups(dev, "k10temp", pdev, + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->pdev = pdev; + + if (boot_cpu_data.x86 == 0x15 && (boot_cpu_data.x86_model == 0x60 || + boot_cpu_data.x86_model == 0x70)) + data->read_tempreg = read_tempreg_nb_f15; + else + data->read_tempreg = read_tempreg_pci; + + hwmon_dev = devm_hwmon_device_register_with_groups(dev, "k10temp", data, k10temp_groups); return PTR_ERR_OR_ZERO(hwmon_dev); } From 9af0a9aecdb945cd5513941ffdcbcc031009b402 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 4 Sep 2017 18:33:53 -0700 Subject: [PATCH 0648/1085] hwmon: (k10temp) Add support for family 17h Add support for temperature sensors on Family 17h (Ryzen) processors. Signed-off-by: Guenter Roeck --- drivers/hwmon/k10temp.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index fc8076c7e1a1..c4dac53206c3 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -36,6 +36,10 @@ MODULE_PARM_DESC(force, "force loading on processors with erratum 319"); /* Provide lock for writing to NB_SMU_IND_ADDR */ static DEFINE_MUTEX(nb_smu_ind_mutex); +#ifndef PCI_DEVICE_ID_AMD_17H_DF_F3 +#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 +#endif + /* CPUID function 0x80000001, ebx */ #define CPUID_PKGTYPE_MASK 0xf0000000 #define CPUID_PKGTYPE_F 0x00000000 @@ -61,6 +65,9 @@ static DEFINE_MUTEX(nb_smu_ind_mutex); */ #define F15H_M60H_REPORTED_TEMP_CTRL_OFFSET 0xd8200ca4 +/* F17h M01h Access through SMN */ +#define F17H_M01H_REPORTED_TEMP_CTRL_OFFSET 0x00059800 + struct k10temp_data { struct pci_dev *pdev; void (*read_tempreg)(struct pci_dev *pdev, u32 *regval); @@ -88,6 +95,12 @@ static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval) F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval); } +static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval) +{ + amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60, + F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval); +} + static ssize_t temp1_input_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -224,6 +237,8 @@ static int k10temp_probe(struct pci_dev *pdev, if (boot_cpu_data.x86 == 0x15 && (boot_cpu_data.x86_model == 0x60 || boot_cpu_data.x86_model == 0x70)) data->read_tempreg = read_tempreg_nb_f15; + else if (boot_cpu_data.x86 == 0x17) + data->read_tempreg = read_tempreg_nb_f17; else data->read_tempreg = read_tempreg_pci; @@ -242,6 +257,7 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; MODULE_DEVICE_TABLE(pci, k10temp_id_table); From 1b50b776355fa6c6d7b3281a63c275d5c18d629d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 4 Sep 2017 18:33:53 -0700 Subject: [PATCH 0649/1085] hwmon: (k10temp) Add support for temperature offsets Add support for handling temperature offset values for various AMD CPUs, similar to the code used in the coretemp driver for Intel CPUs. This is primarily for Ryzen CPUs (which have documented temperature offsets), but the code is kept generic to simplify adding additional CPUs.
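To make the table-driven approach concrete, the lookup performed at probe time boils down to the following standalone sketch; the helper name lookup_temp_offset and the single table entry are illustrative only, the authoritative table and matching loop are in the diff below:

	struct tctl_offset {
		u8 model;	/* CPU family, matched against boot_cpu_data.x86 */
		char const *id;	/* substring matched against x86_model_id */
		int offset;	/* Tctl offset in millidegrees Celsius */
	};

	static const struct tctl_offset tctl_offset_table[] = {
		{ 0x17, "AMD Ryzen 7 1800X", 20000 },	/* example entry */
	};

	static int lookup_temp_offset(u8 family, const char *model_id)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(tctl_offset_table); i++) {
			const struct tctl_offset *entry = &tctl_offset_table[i];

			if (family == entry->model &&
			    strstr(model_id, entry->id))
				return entry->offset;
		}

		return 0;	/* no match: report the temperature unmodified */
	}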
Signed-off-by: Guenter Roeck --- drivers/hwmon/k10temp.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index c4dac53206c3..46a54ed23410 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -71,6 +71,24 @@ static DEFINE_MUTEX(nb_smu_ind_mutex); struct k10temp_data { struct pci_dev *pdev; void (*read_tempreg)(struct pci_dev *pdev, u32 *regval); + int temp_offset; +}; + +struct tctl_offset { + u8 model; + char const *id; + int offset; +}; + +static const struct tctl_offset tctl_offset_table[] = { + { 0x17, "AMD Ryzen 7 1600X", 20000 }, + { 0x17, "AMD Ryzen 7 1700X", 20000 }, + { 0x17, "AMD Ryzen 7 1800X", 20000 }, + { 0x17, "AMD Ryzen Threadripper 1950X", 27000 }, + { 0x17, "AMD Ryzen Threadripper 1920X", 27000 }, + { 0x17, "AMD Ryzen Threadripper 1950", 10000 }, + { 0x17, "AMD Ryzen Threadripper 1920", 10000 }, + { 0x17, "AMD Ryzen Threadripper 1910", 10000 }, }; static void read_tempreg_pci(struct pci_dev *pdev, u32 *regval) @@ -110,6 +128,7 @@ static ssize_t temp1_input_show(struct device *dev, data->read_tempreg(data->pdev, ®val); temp = (regval >> 21) * 125; + temp -= data->temp_offset; return sprintf(buf, "%u\n", temp); } @@ -217,6 +236,7 @@ static int k10temp_probe(struct pci_dev *pdev, struct device *dev = &pdev->dev; struct k10temp_data *data; struct device *hwmon_dev; + int i; if (unreliable) { if (!force) { @@ -242,6 +262,16 @@ static int k10temp_probe(struct pci_dev *pdev, else data->read_tempreg = read_tempreg_pci; + for (i = 0; i < ARRAY_SIZE(tctl_offset_table); i++) { + const struct tctl_offset *entry = &tctl_offset_table[i]; + + if (boot_cpu_data.x86 == entry->model && + strstr(boot_cpu_data.x86_model_id, entry->id)) { + data->temp_offset = entry->offset; + break; + } + } + hwmon_dev = devm_hwmon_device_register_with_groups(dev, "k10temp", data, k10temp_groups); return PTR_ERR_OR_ZERO(hwmon_dev); From 186731145f920fb1514200043bcaf9c689693857 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 10 Sep 2017 11:44:46 +0200 Subject: [PATCH 0650/1085] hwmon: (sht15) Root out platform data After finding out there are active users of this sensor I noticed: - It has a single PXA27x board file using the platform data - The platform data is only used to carry two GPIO pins, all other fields are unused - The driver does not use GPIO descriptors but the legacy GPIO API I saw we can swiftly fix this by: - Killing off the platform data entirely - Define a GPIO descriptor lookup table in the board file - Use the standard devm_gpiod_get() to grab the GPIO descriptors from either the device tree or the board file table. This compiles, but needs testing. Cc: arm@kernel.org Cc: Marco Franchi Cc: Davide Hug Cc: Jonathan Cameron Signed-off-by: Linus Walleij Acked-by: Arnd Bergmann Tested-by: Marco Franchi Acked-by: Arnd Bergmann Signed-off-by: Guenter Roeck --- Documentation/hwmon/sht15 | 3 +- arch/arm/mach-pxa/stargate2.c | 17 +-- drivers/hwmon/sht15.c | 167 +++++++++------------------- include/linux/platform_data/sht15.h | 38 ------- 4 files changed, 65 insertions(+), 160 deletions(-) delete mode 100644 include/linux/platform_data/sht15.h diff --git a/Documentation/hwmon/sht15 b/Documentation/hwmon/sht15 index 778987d1856f..5e3207c3b177 100644 --- a/Documentation/hwmon/sht15 +++ b/Documentation/hwmon/sht15 @@ -42,8 +42,7 @@ chip. These coefficients are used to internally calibrate the signals from the sensors. 
Disabling the reload of those coefficients allows saving 10ms for each measurement and decrease power consumption, while losing on precision. -Some options may be set directly in the sht15_platform_data structure -or via sysfs attributes. +Some options may be set via sysfs attributes. Notes: * The regulator supply name is set to "vcc". diff --git a/arch/arm/mach-pxa/stargate2.c b/arch/arm/mach-pxa/stargate2.c index 2d45d18b1a5e..6b7df6fd2448 100644 --- a/arch/arm/mach-pxa/stargate2.c +++ b/arch/arm/mach-pxa/stargate2.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -52,7 +53,6 @@ #include #include #include -#include #include "devices.h" #include "generic.h" @@ -137,17 +137,18 @@ static unsigned long sg2_im2_unified_pin_config[] __initdata = { GPIO10_GPIO, /* large basic connector pin 23 */ }; -static struct sht15_platform_data platform_data_sht15 = { - .gpio_data = 100, - .gpio_sck = 98, +static struct gpiod_lookup_table sht15_gpiod_table = { + .dev_id = "sht15", + .table = { + /* FIXME: should this have |GPIO_OPEN_DRAIN set? */ + GPIO_LOOKUP("gpio-pxa", 100, "data", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-pxa", 98, "clk", GPIO_ACTIVE_HIGH), + }, }; static struct platform_device sht15 = { .name = "sht15", .id = -1, - .dev = { - .platform_data = &platform_data_sht15, - }, }; static struct regulator_consumer_supply stargate2_sensor_3_con[] = { @@ -608,6 +609,7 @@ static void __init imote2_init(void) imote2_stargate2_init(); + gpiod_add_lookup_table(&sht15_gpiod_table); platform_add_devices(imote2_devices, ARRAY_SIZE(imote2_devices)); i2c_register_board_info(0, imote2_i2c_board_info, @@ -988,6 +990,7 @@ static void __init stargate2_init(void) imote2_stargate2_init(); + gpiod_add_lookup_table(&sht15_gpiod_table); platform_add_devices(ARRAY_AND_SIZE(stargate2_devices)); i2c_register_board_info(0, ARRAY_AND_SIZE(stargate2_i2c_board_info)); diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c index e4d642b673c6..0e3e5f83f5cf 100644 --- a/drivers/hwmon/sht15.c +++ b/drivers/hwmon/sht15.c @@ -18,13 +18,11 @@ #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -34,7 +32,8 @@ #include #include #include -#include +#include +#include /* Commands */ #define SHT15_MEASURE_TEMP 0x03 @@ -122,7 +121,8 @@ static const u8 sht15_crc8_table[] = { /** * struct sht15_data - device instance specific data - * @pdata: platform data (gpio's etc). + * @sck: clock GPIO line + * @data: data GPIO line * @read_work: bh of interrupt handler. * @wait_queue: wait queue for getting values from device. * @val_temp: last temperature value read from device. @@ -150,7 +150,8 @@ static const u8 sht15_crc8_table[] = { * @interrupt_handled: flag used to indicate a handler has been scheduled. 
*/ struct sht15_data { - struct sht15_platform_data *pdata; + struct gpio_desc *sck; + struct gpio_desc *data; struct work_struct read_work; wait_queue_head_t wait_queue; uint16_t val_temp; @@ -205,16 +206,16 @@ static int sht15_connection_reset(struct sht15_data *data) { int i, err; - err = gpio_direction_output(data->pdata->gpio_data, 1); + err = gpiod_direction_output(data->data, 1); if (err) return err; ndelay(SHT15_TSCKL); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); for (i = 0; i < 9; ++i) { - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); } return 0; @@ -227,11 +228,11 @@ static int sht15_connection_reset(struct sht15_data *data) */ static inline void sht15_send_bit(struct sht15_data *data, int val) { - gpio_set_value(data->pdata->gpio_data, val); + gpiod_set_value(data->data, val); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); /* clock low time */ } @@ -248,23 +249,23 @@ static int sht15_transmission_start(struct sht15_data *data) int err; /* ensure data is high and output */ - err = gpio_direction_output(data->pdata->gpio_data, 1); + err = gpiod_direction_output(data->data, 1); if (err) return err; ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_data, 0); + gpiod_set_value(data->data, 0); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_data, 1); + gpiod_set_value(data->data, 1); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); return 0; } @@ -292,20 +293,20 @@ static int sht15_wait_for_response(struct sht15_data *data) { int err; - err = gpio_direction_input(data->pdata->gpio_data); + err = gpiod_direction_input(data->data); if (err) return err; - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - if (gpio_get_value(data->pdata->gpio_data)) { - gpio_set_value(data->pdata->gpio_sck, 0); + if (gpiod_get_value(data->data)) { + gpiod_set_value(data->sck, 0); dev_err(data->dev, "Command not acknowledged\n"); err = sht15_connection_reset(data); if (err) return err; return -EIO; } - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); return 0; } @@ -360,17 +361,17 @@ static int sht15_ack(struct sht15_data *data) { int err; - err = gpio_direction_output(data->pdata->gpio_data, 0); + err = gpiod_direction_output(data->data, 0); if (err) return err; ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_data, 1); + gpiod_set_value(data->data, 1); - return gpio_direction_input(data->pdata->gpio_data); + return gpiod_direction_input(data->data); } /** @@ -383,13 +384,13 @@ static int 
sht15_end_transmission(struct sht15_data *data) { int err; - err = gpio_direction_output(data->pdata->gpio_data, 1); + err = gpiod_direction_output(data->data, 1); if (err) return err; ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); return 0; } @@ -405,10 +406,10 @@ static u8 sht15_read_byte(struct sht15_data *data) for (i = 0; i < 8; ++i) { byte <<= 1; - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - byte |= !!gpio_get_value(data->pdata->gpio_data); - gpio_set_value(data->pdata->gpio_sck, 0); + byte |= !!gpiod_get_value(data->data); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); } return byte; @@ -428,7 +429,7 @@ static int sht15_send_status(struct sht15_data *data, u8 status) err = sht15_send_cmd(data, SHT15_WRITE_STATUS); if (err) return err; - err = gpio_direction_output(data->pdata->gpio_data, 1); + err = gpiod_direction_output(data->data, 1); if (err) return err; ndelay(SHT15_TSU); @@ -528,14 +529,14 @@ static int sht15_measurement(struct sht15_data *data, if (ret) return ret; - ret = gpio_direction_input(data->pdata->gpio_data); + ret = gpiod_direction_input(data->data); if (ret) return ret; atomic_set(&data->interrupt_handled, 0); - enable_irq(gpio_to_irq(data->pdata->gpio_data)); - if (gpio_get_value(data->pdata->gpio_data) == 0) { - disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data)); + enable_irq(gpiod_to_irq(data->data)); + if (gpiod_get_value(data->data) == 0) { + disable_irq_nosync(gpiod_to_irq(data->data)); /* Only relevant if the interrupt hasn't occurred. */ if (!atomic_read(&data->interrupt_handled)) schedule_work(&data->read_work); @@ -547,7 +548,7 @@ static int sht15_measurement(struct sht15_data *data, data->state = SHT15_READING_NOTHING; return -EIO; } else if (ret == 0) { /* timeout occurred */ - disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data)); + disable_irq_nosync(gpiod_to_irq(data->data)); ret = sht15_connection_reset(data); if (ret) return ret; @@ -826,15 +827,15 @@ static void sht15_bh_read_data(struct work_struct *work_s) read_work); /* Firstly, verify the line is low */ - if (gpio_get_value(data->pdata->gpio_data)) { + if (gpiod_get_value(data->data)) { /* * If not, then start the interrupt again - care here as could * have gone low in meantime so verify it hasn't! */ atomic_set(&data->interrupt_handled, 0); - enable_irq(gpio_to_irq(data->pdata->gpio_data)); + enable_irq(gpiod_to_irq(data->data)); /* If still not occurred or another handler was scheduled */ - if (gpio_get_value(data->pdata->gpio_data) + if (gpiod_get_value(data->data) || atomic_read(&data->interrupt_handled)) return; } @@ -918,46 +919,6 @@ static const struct of_device_id sht15_dt_match[] = { { }, }; MODULE_DEVICE_TABLE(of, sht15_dt_match); - -/* - * This function returns NULL if pdev isn't a device instatiated by dt, - * a pointer to pdata if it could successfully get all information - * from dt or a negative ERR_PTR() on error. 
- */ -static struct sht15_platform_data *sht15_probe_dt(struct device *dev) -{ - struct device_node *np = dev->of_node; - struct sht15_platform_data *pdata; - - /* no device tree device */ - if (!np) - return NULL; - - pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - return ERR_PTR(-ENOMEM); - - pdata->gpio_data = of_get_named_gpio(np, "data-gpios", 0); - if (pdata->gpio_data < 0) { - if (pdata->gpio_data != -EPROBE_DEFER) - dev_err(dev, "data-gpios not found\n"); - return ERR_PTR(pdata->gpio_data); - } - - pdata->gpio_sck = of_get_named_gpio(np, "clk-gpios", 0); - if (pdata->gpio_sck < 0) { - if (pdata->gpio_sck != -EPROBE_DEFER) - dev_err(dev, "clk-gpios not found\n"); - return ERR_PTR(pdata->gpio_sck); - } - - return pdata; -} -#else -static inline struct sht15_platform_data *sht15_probe_dt(struct device *dev) -{ - return NULL; -} #endif static int sht15_probe(struct platform_device *pdev) @@ -977,25 +938,6 @@ static int sht15_probe(struct platform_device *pdev) data->dev = &pdev->dev; init_waitqueue_head(&data->wait_queue); - data->pdata = sht15_probe_dt(&pdev->dev); - if (IS_ERR(data->pdata)) - return PTR_ERR(data->pdata); - if (data->pdata == NULL) { - data->pdata = dev_get_platdata(&pdev->dev); - if (data->pdata == NULL) { - dev_err(&pdev->dev, "no platform data supplied\n"); - return -EINVAL; - } - } - - data->supply_uv = data->pdata->supply_mv * 1000; - if (data->pdata->checksum) - data->checksumming = true; - if (data->pdata->no_otp_reload) - status |= SHT15_STATUS_NO_OTP_RELOAD; - if (data->pdata->low_resolution) - status |= SHT15_STATUS_LOW_RESOLUTION; - /* * If a regulator is available, * query what the supply voltage actually is! @@ -1030,21 +972,20 @@ static int sht15_probe(struct platform_device *pdev) } /* Try requesting the GPIOs */ - ret = devm_gpio_request_one(&pdev->dev, data->pdata->gpio_sck, - GPIOF_OUT_INIT_LOW, "SHT15 sck"); - if (ret) { + data->sck = devm_gpiod_get(&pdev->dev, "clk", GPIOD_OUT_LOW); + if (IS_ERR(data->sck)) { + ret = PTR_ERR(data->sck); dev_err(&pdev->dev, "clock line GPIO request failed\n"); goto err_release_reg; } - - ret = devm_gpio_request(&pdev->dev, data->pdata->gpio_data, - "SHT15 data"); - if (ret) { + data->data = devm_gpiod_get(&pdev->dev, "data", GPIOD_IN); + if (IS_ERR(data->data)) { + ret = PTR_ERR(data->data); dev_err(&pdev->dev, "data line GPIO request failed\n"); goto err_release_reg; } - ret = devm_request_irq(&pdev->dev, gpio_to_irq(data->pdata->gpio_data), + ret = devm_request_irq(&pdev->dev, gpiod_to_irq(data->data), sht15_interrupt_fired, IRQF_TRIGGER_FALLING, "sht15 data", @@ -1053,7 +994,7 @@ static int sht15_probe(struct platform_device *pdev) dev_err(&pdev->dev, "failed to get irq for data line\n"); goto err_release_reg; } - disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data)); + disable_irq_nosync(gpiod_to_irq(data->data)); ret = sht15_connection_reset(data); if (ret) goto err_release_reg; diff --git a/include/linux/platform_data/sht15.h b/include/linux/platform_data/sht15.h deleted file mode 100644 index 12289c1e9413..000000000000 --- a/include/linux/platform_data/sht15.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * sht15.h - support for the SHT15 Temperature and Humidity Sensor - * - * Copyright (c) 2009 Jonathan Cameron - * - * Copyright (c) 2007 Wouter Horre - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * For further information, see the Documentation/hwmon/sht15 file. - */ - -#ifndef _PDATA_SHT15_H -#define _PDATA_SHT15_H - -/** - * struct sht15_platform_data - sht15 connectivity info - * @gpio_data: no. of gpio to which bidirectional data line is - * connected. - * @gpio_sck: no. of gpio to which the data clock is connected. - * @supply_mv: supply voltage in mv. Overridden by regulator if - * available. - * @checksum: flag to indicate the checksum should be validated. - * @no_otp_reload: flag to indicate no reload from OTP. - * @low_resolution: flag to indicate the temp/humidity resolution to use. - */ -struct sht15_platform_data { - int gpio_data; - int gpio_sck; - int supply_mv; - bool checksum; - bool no_otp_reload; - bool low_resolution; -}; - -#endif /* _PDATA_SHT15_H */ From 90b863dd63d63f4d97c3f5b6d6ed5bd239500f7b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 15:45:48 +0100 Subject: [PATCH 0651/1085] hwmon: (sht15) remove redundant check on status and send of status value A previous commit removed bit or'ing into to the integer status so now status is now always zero. This means that the non-zero check on status and the sht15_send_status call will never occur; it is deadcode. Clean this up by removing the dead code. Detected by: CoverityScan CID#1456835 ("Logically dead code") Fixes: aa7ab80c578c ("hwmon: (sht15) Root out platform data") Signed-off-by: Colin Ian King Signed-off-by: Guenter Roeck --- drivers/hwmon/sht15.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c index 0e3e5f83f5cf..25d28343ba93 100644 --- a/drivers/hwmon/sht15.c +++ b/drivers/hwmon/sht15.c @@ -925,7 +925,6 @@ static int sht15_probe(struct platform_device *pdev) { int ret; struct sht15_data *data; - u8 status = 0; data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); if (!data) @@ -1002,13 +1001,6 @@ static int sht15_probe(struct platform_device *pdev) if (ret) goto err_release_reg; - /* write status with platform data options */ - if (status) { - ret = sht15_send_status(data, status); - if (ret) - goto err_release_reg; - } - ret = sysfs_create_group(&pdev->dev.kobj, &sht15_attr_group); if (ret) { dev_err(&pdev->dev, "sysfs create failed\n"); From 0ec54a2e8ee22b13e9a89dfd76e66e5b75ca9885 Mon Sep 17 00:00:00 2001 From: Alan Tull Date: Mon, 11 Sep 2017 14:16:48 -0500 Subject: [PATCH 0652/1085] dt-bindings: hwmon: add compatible for max1619 Add new device tree bindings document for max1619 device including a new compatible string. 
Signed-off-by: Alan Tull Cc: Jean Delvare Cc: Guenter Roeck Cc: Rob Herring Cc: Mark Rutland Acked-by: Rob Herring Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/max1619.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 Documentation/devicetree/bindings/hwmon/max1619.txt diff --git a/Documentation/devicetree/bindings/hwmon/max1619.txt b/Documentation/devicetree/bindings/hwmon/max1619.txt new file mode 100644 index 000000000000..c70dbbe1e56f --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/max1619.txt @@ -0,0 +1,12 @@ +Bindings for MAX1619 Temperature Sensor + +Required properties: +- compatible : "maxim,max1619" +- reg : I2C address, one of 0x18, 0x19, 0x1a, 0x29, 0x2a, 0x2b, 0x4c, or + 0x4d, 0x4e + +Example: + temp@4c { + compatible = "maxim,max1619"; + reg = <0x4c>; + }; From fd53f62160e5fcea18f6adcddd40ba429dc127b6 Mon Sep 17 00:00:00 2001 From: Alan Tull Date: Mon, 11 Sep 2017 14:16:49 -0500 Subject: [PATCH 0653/1085] hwmon: (max1619) Add dt binding Add new device tree binding for max1619. Signed-off-by: Alan Tull Cc: Jean Delvare Cc: Guenter Roeck Cc: Rob Herring Cc: Mark Rutland Signed-off-by: Guenter Roeck --- drivers/hwmon/max1619.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/hwmon/max1619.c b/drivers/hwmon/max1619.c index a18278938494..76d966932941 100644 --- a/drivers/hwmon/max1619.c +++ b/drivers/hwmon/max1619.c @@ -303,10 +303,20 @@ static const struct i2c_device_id max1619_id[] = { }; MODULE_DEVICE_TABLE(i2c, max1619_id); +#ifdef CONFIG_OF +static const struct of_device_id max1619_of_match[] = { + { .compatible = "maxim,max1619", }, + {}, +}; + +MODULE_DEVICE_TABLE(of, max1619_of_match); +#endif + static struct i2c_driver max1619_driver = { .class = I2C_CLASS_HWMON, .driver = { .name = "max1619", + .of_match_table = of_match_ptr(max1619_of_match), }, .probe = max1619_probe, .id_table = max1619_id, From 762b1e88801357770889d013c5d20fe110d1f456 Mon Sep 17 00:00:00 2001 From: Patrick Venture Date: Mon, 11 Sep 2017 15:41:55 -0700 Subject: [PATCH 0654/1085] hwmon: (aspeed-pwm-tacho) increase fan tach period The previous value reduced the time required to determine the fan value, however, it's also used as the final timeout mechanism. The prevous value would work for any fan speed greater than around 3k RPM. This increased value, lets the fan speeds return quickly but will wait longer to handle speeds below 3k RPM. Testing: this value was determined through experimentation on the ast2400 on the Quanta-q71l. This configurations runs 8 fans attached to the controller. Signed-off-by: Patrick Venture Reviewed-by: Guenter Roeck Signed-off-by: Guenter Roeck --- drivers/hwmon/aspeed-pwm-tacho.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/aspeed-pwm-tacho.c b/drivers/hwmon/aspeed-pwm-tacho.c index 69b97d45e3cb..f914e5f41048 100644 --- a/drivers/hwmon/aspeed-pwm-tacho.c +++ b/drivers/hwmon/aspeed-pwm-tacho.c @@ -161,7 +161,7 @@ * 11: reserved. */ #define M_TACH_MODE 0x02 /* 10b */ -#define M_TACH_UNIT 0x00c0 +#define M_TACH_UNIT 0x0210 #define INIT_FAN_CTRL 0xFF /* How long we sleep in us while waiting for an RPM result. */ From 7a76a7f34afdfb080ec8e51ed18891b4f72ec907 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 22 Sep 2017 15:36:20 +0100 Subject: [PATCH 0655/1085] hwmon: (w83793) make const array watchdog_minors static, reduces object code size Don't populate const array watchdog_minors on the stack, instead make it static. 
Makes the object code smaller by over 350 bytes: Before: text data bss dec hex filename 48019 38144 256 86419 15193 drivers/hwmon/w83793.o After: text data bss dec hex filename 47574 38232 256 86062 1502e drivers/hwmon/w83793.o (gcc 6.3.0, x86-64) Signed-off-by: Colin Ian King Signed-off-by: Guenter Roeck --- drivers/hwmon/w83793.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/w83793.c b/drivers/hwmon/w83793.c index dab5c515d5a3..5ba9d9f1daa1 100644 --- a/drivers/hwmon/w83793.c +++ b/drivers/hwmon/w83793.c @@ -1676,7 +1676,9 @@ static int w83793_probe(struct i2c_client *client, const struct i2c_device_id *id) { struct device *dev = &client->dev; - const int watchdog_minors[] = { WATCHDOG_MINOR, 212, 213, 214, 215 }; + static const int watchdog_minors[] = { + WATCHDOG_MINOR, 212, 213, 214, 215 + }; struct w83793_data *data; int i, tmp, val, err; int files_fan = ARRAY_SIZE(w83793_left_fan) / 7; From 92b64580f14b24a3d5cfd1e9dff0b745826a824b Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Tue, 3 Oct 2017 18:08:27 +0000 Subject: [PATCH 0656/1085] hwmon: (max6621) Add support for Maxim MAX6621 temperature sensor MAX6621 is a PECI-to-I2C translator provides an efficient, low-cost solution for PECI-to-SMBus/I2C protocol conversion. It allows reading the temperature from the PECI-compliant host directly from up to four PECI-enabled CPUs. Signed-off-by: Vadim Pasternak Signed-off-by: Guenter Roeck --- drivers/hwmon/Kconfig | 14 + drivers/hwmon/Makefile | 1 + drivers/hwmon/max6621.c | 593 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 608 insertions(+) create mode 100644 drivers/hwmon/max6621.c diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index d65431417b17..fae8a8904c10 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -862,6 +862,20 @@ tristate "MAX31722 temperature sensor" This driver can also be built as a module. If so, the module will be called max31722. +config SENSORS_MAX6621 + tristate "Maxim MAX6621 sensor chip" + depends on I2C + select REGMAP_I2C + help + If you say yes here you get support for MAX6621 sensor chip. + MAX6621 is a PECI-to-I2C translator provides an efficient, + low-cost solution for PECI-to-SMBus/I2C protocol conversion. + It allows reading the temperature from the PECI-compliant + host directly from up to four PECI-enabled CPUs. + + This driver can also be built as a module. If so, the module + will be called max6621. + config SENSORS_MAX6639 tristate "Maxim MAX6639 sensor chip" depends on I2C diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index c84d9784be98..8941a478d2a7 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -117,6 +117,7 @@ obj-$(CONFIG_SENSORS_MAX1619) += max1619.o obj-$(CONFIG_SENSORS_MAX1668) += max1668.o obj-$(CONFIG_SENSORS_MAX197) += max197.o obj-$(CONFIG_SENSORS_MAX31722) += max31722.o +obj-$(CONFIG_SENSORS_MAX6621) += max6621.o obj-$(CONFIG_SENSORS_MAX6639) += max6639.o obj-$(CONFIG_SENSORS_MAX6642) += max6642.o obj-$(CONFIG_SENSORS_MAX6650) += max6650.o diff --git a/drivers/hwmon/max6621.c b/drivers/hwmon/max6621.c new file mode 100644 index 000000000000..22079ec29660 --- /dev/null +++ b/drivers/hwmon/max6621.c @@ -0,0 +1,593 @@ +/* + * Hardware monitoring driver for Maxim MAX6621 + * + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2017 Vadim Pasternak + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX6621_DRV_NAME "max6621" +#define MAX6621_TEMP_INPUT_REG_NUM 9 +#define MAX6621_TEMP_INPUT_MIN -127000 +#define MAX6621_TEMP_INPUT_MAX 128000 +#define MAX6621_TEMP_ALERT_CHAN_SHIFT 1 + +#define MAX6621_TEMP_S0D0_REG 0x00 +#define MAX6621_TEMP_S0D1_REG 0x01 +#define MAX6621_TEMP_S1D0_REG 0x02 +#define MAX6621_TEMP_S1D1_REG 0x03 +#define MAX6621_TEMP_S2D0_REG 0x04 +#define MAX6621_TEMP_S2D1_REG 0x05 +#define MAX6621_TEMP_S3D0_REG 0x06 +#define MAX6621_TEMP_S3D1_REG 0x07 +#define MAX6621_TEMP_MAX_REG 0x08 +#define MAX6621_TEMP_MAX_ADDR_REG 0x0a +#define MAX6621_TEMP_ALERT_CAUSE_REG 0x0b +#define MAX6621_CONFIG0_REG 0x0c +#define MAX6621_CONFIG1_REG 0x0d +#define MAX6621_CONFIG2_REG 0x0e +#define MAX6621_CONFIG3_REG 0x0f +#define MAX6621_TEMP_S0_ALERT_REG 0x10 +#define MAX6621_TEMP_S1_ALERT_REG 0x11 +#define MAX6621_TEMP_S2_ALERT_REG 0x12 +#define MAX6621_TEMP_S3_ALERT_REG 0x13 +#define MAX6621_CLEAR_ALERT_REG 0x15 +#define MAX6621_REG_MAX (MAX6621_CLEAR_ALERT_REG + 1) +#define MAX6621_REG_TEMP_SHIFT 0x06 + +#define MAX6621_ENABLE_TEMP_ALERTS_BIT 4 +#define MAX6621_ENABLE_I2C_CRC_BIT 5 +#define MAX6621_ENABLE_ALTERNATE_DATA 6 +#define MAX6621_ENABLE_LOCKUP_TO 7 +#define MAX6621_ENABLE_S0D0_BIT 8 +#define MAX6621_ENABLE_S3D1_BIT 15 +#define MAX6621_ENABLE_TEMP_ALL GENMASK(MAX6621_ENABLE_S3D1_BIT, \ + MAX6621_ENABLE_S0D0_BIT) +#define MAX6621_POLL_DELAY_MASK 0x5 +#define MAX6621_CONFIG0_INIT (MAX6621_ENABLE_TEMP_ALL | \ + BIT(MAX6621_ENABLE_LOCKUP_TO) | \ + BIT(MAX6621_ENABLE_I2C_CRC_BIT) | \ + MAX6621_POLL_DELAY_MASK) +#define MAX6621_PECI_BIT_TIME 0x2 +#define MAX6621_PECI_RETRY_NUM 0x3 +#define MAX6621_CONFIG1_INIT ((MAX6621_PECI_BIT_TIME << 8) | \ + MAX6621_PECI_RETRY_NUM) + +/* Error codes */ +#define MAX6621_TRAN_FAILED 0x8100 /* + * PECI transaction failed for more + * than the configured number of + * consecutive retries. + */ +#define MAX6621_POOL_DIS 0x8101 /* + * Polling disabled for requested + * socket/domain. + */ +#define MAX6621_POOL_UNCOMPLETE 0x8102 /* + * First poll not yet completed for + * requested socket/domain (on + * startup). + */ +#define MAX6621_SD_DIS 0x8103 /* + * Read maximum temperature requested, + * but no sockets/domains enabled or + * all enabled sockets/domains have + * errors; or read maximum temperature + * address requested, but read maximum + * temperature was not called. + */ +#define MAX6621_ALERT_DIS 0x8104 /* + * Get alert socket/domain requested, + * but no alert active. + */ +#define MAX6621_PECI_ERR_MIN 0x8000 /* Intel spec PECI error min value. */ +#define MAX6621_PECI_ERR_MAX 0x80ff /* Intel spec PECI error max value. 
*/ + +static const u32 max6621_temp_regs[] = { + MAX6621_TEMP_MAX_REG, MAX6621_TEMP_S0D0_REG, MAX6621_TEMP_S1D0_REG, + MAX6621_TEMP_S2D0_REG, MAX6621_TEMP_S3D0_REG, MAX6621_TEMP_S0D1_REG, + MAX6621_TEMP_S1D1_REG, MAX6621_TEMP_S2D1_REG, MAX6621_TEMP_S3D1_REG, +}; + +static const char *const max6621_temp_labels[] = { + "maximum", + "socket0_0", + "socket1_0", + "socket2_0", + "socket3_0", + "socket0_1", + "socket1_1", + "socket2_1", + "socket3_1", +}; + +static const int max6621_temp_alert_chan2reg[] = { + MAX6621_TEMP_S0_ALERT_REG, + MAX6621_TEMP_S1_ALERT_REG, + MAX6621_TEMP_S2_ALERT_REG, + MAX6621_TEMP_S3_ALERT_REG, +}; + +/** + * struct max6621_data - private data: + * + * @client: I2C client; + * @regmap: register map handle; + * @input_chan2reg: mapping from channel to register; + */ +struct max6621_data { + struct i2c_client *client; + struct regmap *regmap; + int input_chan2reg[MAX6621_TEMP_INPUT_REG_NUM + 1]; +}; + +static long max6621_temp_mc2reg(long val) +{ + return (val / 1000L) << MAX6621_REG_TEMP_SHIFT; +} + +static umode_t +max6621_is_visible(const void *data, enum hwmon_sensor_types type, u32 attr, + int channel) +{ + /* Skip channels which are not physically conncted. */ + if (((struct max6621_data *)data)->input_chan2reg[channel] < 0) + return 0; + + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + case hwmon_temp_label: + case hwmon_temp_crit_alarm: + return 0444; + case hwmon_temp_offset: + case hwmon_temp_crit: + return 0644; + default: + break; + } + + default: + break; + } + + return 0; +} + +static int max6621_verify_reg_data(struct device *dev, int regval) +{ + if (regval >= MAX6621_PECI_ERR_MIN && + regval <= MAX6621_PECI_ERR_MAX) { + dev_dbg(dev, "PECI error code - err 0x%04x.\n", + regval); + + return -EIO; + } + + switch (regval) { + case MAX6621_TRAN_FAILED: + dev_dbg(dev, "PECI transaction failed - err 0x%04x.\n", + regval); + return -EIO; + case MAX6621_POOL_DIS: + dev_dbg(dev, "Polling disabled - err 0x%04x.\n", regval); + return -EOPNOTSUPP; + case MAX6621_POOL_UNCOMPLETE: + dev_dbg(dev, "First poll not completed on startup - err 0x%04x.\n", + regval); + return -EIO; + case MAX6621_SD_DIS: + dev_dbg(dev, "Resource is disabled - err 0x%04x.\n", regval); + return -EOPNOTSUPP; + case MAX6621_ALERT_DIS: + dev_dbg(dev, "No alert active - err 0x%04x.\n", regval); + return -EOPNOTSUPP; + default: + return 0; + } +} + +static int +max6621_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long *val) +{ + struct max6621_data *data = dev_get_drvdata(dev); + u32 regval; + int reg; + s8 temp; + int ret; + + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + reg = data->input_chan2reg[channel]; + ret = regmap_read(data->regmap, reg, ®val); + if (ret) + return ret; + + ret = max6621_verify_reg_data(dev, regval); + if (ret) + return ret; + + /* + * Bit MAX6621_REG_TEMP_SHIFT represents 1 degree step. + * The temperature is given in two's complement and 8 + * bits is used for the register conversion. 
+ */ + temp = (regval >> MAX6621_REG_TEMP_SHIFT); + *val = temp * 1000L; + + break; + case hwmon_temp_offset: + ret = regmap_read(data->regmap, MAX6621_CONFIG2_REG, + ®val); + if (ret) + return ret; + + ret = max6621_verify_reg_data(dev, regval); + if (ret) + return ret; + + *val = (regval >> MAX6621_REG_TEMP_SHIFT) * + 1000L; + + break; + case hwmon_temp_crit: + channel -= MAX6621_TEMP_ALERT_CHAN_SHIFT; + reg = max6621_temp_alert_chan2reg[channel]; + ret = regmap_read(data->regmap, reg, ®val); + if (ret) + return ret; + + ret = max6621_verify_reg_data(dev, regval); + if (ret) + return ret; + + *val = regval * 1000L; + + break; + case hwmon_temp_crit_alarm: + /* + * Set val to zero to recover the case, when reading + * MAX6621_TEMP_ALERT_CAUSE_REG results in for example + * MAX6621_ALERT_DIS. Reading will return with error, + * but in such case alarm should be returned as 0. + */ + *val = 0; + ret = regmap_read(data->regmap, + MAX6621_TEMP_ALERT_CAUSE_REG, + ®val); + if (ret) + return ret; + + ret = max6621_verify_reg_data(dev, regval); + if (ret) { + /* Do not report error if alert is disabled. */ + if (regval == MAX6621_ALERT_DIS) + return 0; + else + return ret; + } + + /* + * Clear the alert automatically, using send-byte + * smbus protocol for clearing alert. + */ + if (regval) { + ret = i2c_smbus_write_byte(data->client, + MAX6621_CLEAR_ALERT_REG); + if (!ret) + return ret; + } + + *val = !!regval; + + break; + default: + return -EOPNOTSUPP; + } + break; + + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int +max6621_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long val) +{ + struct max6621_data *data = dev_get_drvdata(dev); + u32 reg; + + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_offset: + /* Clamp to allowed range to prevent overflow. */ + val = clamp_val(val, MAX6621_TEMP_INPUT_MIN, + MAX6621_TEMP_INPUT_MAX); + val = max6621_temp_mc2reg(val); + + return regmap_write(data->regmap, + MAX6621_CONFIG2_REG, val); + case hwmon_temp_crit: + channel -= MAX6621_TEMP_ALERT_CHAN_SHIFT; + reg = max6621_temp_alert_chan2reg[channel]; + /* Clamp to allowed range to prevent overflow. 
*/ + val = clamp_val(val, MAX6621_TEMP_INPUT_MIN, + MAX6621_TEMP_INPUT_MAX); + val = val / 1000L; + + return regmap_write(data->regmap, reg, val); + default: + return -EOPNOTSUPP; + } + break; + + default: + return -EOPNOTSUPP; + } + + return -EOPNOTSUPP; +} + +static int +max6621_read_string(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, const char **str) +{ + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_label: + *str = max6621_temp_labels[channel]; + return 0; + default: + return -EOPNOTSUPP; + } + break; + default: + return -EOPNOTSUPP; + } + + return -EOPNOTSUPP; +} + +static bool max6621_writeable_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case MAX6621_CONFIG0_REG: + case MAX6621_CONFIG1_REG: + case MAX6621_CONFIG2_REG: + case MAX6621_CONFIG3_REG: + case MAX6621_TEMP_S0_ALERT_REG: + case MAX6621_TEMP_S1_ALERT_REG: + case MAX6621_TEMP_S2_ALERT_REG: + case MAX6621_TEMP_S3_ALERT_REG: + case MAX6621_TEMP_ALERT_CAUSE_REG: + return true; + } + return false; +} + +static bool max6621_readable_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case MAX6621_TEMP_S0D0_REG: + case MAX6621_TEMP_S0D1_REG: + case MAX6621_TEMP_S1D0_REG: + case MAX6621_TEMP_S1D1_REG: + case MAX6621_TEMP_S2D0_REG: + case MAX6621_TEMP_S2D1_REG: + case MAX6621_TEMP_S3D0_REG: + case MAX6621_TEMP_S3D1_REG: + case MAX6621_TEMP_MAX_REG: + case MAX6621_TEMP_MAX_ADDR_REG: + case MAX6621_CONFIG0_REG: + case MAX6621_CONFIG1_REG: + case MAX6621_CONFIG2_REG: + case MAX6621_CONFIG3_REG: + case MAX6621_TEMP_S0_ALERT_REG: + case MAX6621_TEMP_S1_ALERT_REG: + case MAX6621_TEMP_S2_ALERT_REG: + case MAX6621_TEMP_S3_ALERT_REG: + return true; + } + return false; +} + +static bool max6621_volatile_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case MAX6621_TEMP_S0D0_REG: + case MAX6621_TEMP_S0D1_REG: + case MAX6621_TEMP_S1D0_REG: + case MAX6621_TEMP_S1D1_REG: + case MAX6621_TEMP_S2D0_REG: + case MAX6621_TEMP_S2D1_REG: + case MAX6621_TEMP_S3D0_REG: + case MAX6621_TEMP_S3D1_REG: + case MAX6621_TEMP_MAX_REG: + case MAX6621_TEMP_S0_ALERT_REG: + case MAX6621_TEMP_S1_ALERT_REG: + case MAX6621_TEMP_S2_ALERT_REG: + case MAX6621_TEMP_S3_ALERT_REG: + case MAX6621_TEMP_ALERT_CAUSE_REG: + return true; + } + return false; +} + +static const struct reg_default max6621_regmap_default[] = { + { MAX6621_CONFIG0_REG, MAX6621_CONFIG0_INIT }, + { MAX6621_CONFIG1_REG, MAX6621_CONFIG1_INIT }, +}; + +static const struct regmap_config max6621_regmap_config = { + .reg_bits = 8, + .val_bits = 16, + .max_register = MAX6621_REG_MAX, + .val_format_endian = REGMAP_ENDIAN_LITTLE, + .cache_type = REGCACHE_FLAT, + .writeable_reg = max6621_writeable_reg, + .readable_reg = max6621_readable_reg, + .volatile_reg = max6621_volatile_reg, + .reg_defaults = max6621_regmap_default, + .num_reg_defaults = ARRAY_SIZE(max6621_regmap_default), +}; + +static u32 max6621_chip_config[] = { + HWMON_C_REGISTER_TZ, + 0 +}; + +static const struct hwmon_channel_info max6621_chip = { + .type = hwmon_chip, + .config = max6621_chip_config, +}; + +static const u32 max6621_temp_config[] = { + HWMON_T_INPUT | HWMON_T_LABEL | HWMON_T_OFFSET, + HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_CRIT_ALARM | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_CRIT_ALARM | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_CRIT_ALARM | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_CRIT_ALARM | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_LABEL, + HWMON_T_INPUT | 
HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_LABEL, + 0 +}; + +static const struct hwmon_channel_info max6621_temp = { + .type = hwmon_temp, + .config = max6621_temp_config, +}; + +static const struct hwmon_channel_info *max6621_info[] = { + &max6621_chip, + &max6621_temp, + NULL +}; + +static const struct hwmon_ops max6621_hwmon_ops = { + .read = max6621_read, + .write = max6621_write, + .read_string = max6621_read_string, + .is_visible = max6621_is_visible, +}; + +static const struct hwmon_chip_info max6621_chip_info = { + .ops = &max6621_hwmon_ops, + .info = max6621_info, +}; + +static int max6621_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct device *dev = &client->dev; + struct max6621_data *data; + struct device *hwmon_dev; + int i; + int ret; + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->regmap = devm_regmap_init_i2c(client, &max6621_regmap_config); + if (IS_ERR(data->regmap)) + return PTR_ERR(data->regmap); + + i2c_set_clientdata(client, data); + data->client = client; + + /* Set CONFIG0 register masking temperature alerts and PEC. */ + ret = regmap_write(data->regmap, MAX6621_CONFIG0_REG, + MAX6621_CONFIG0_INIT); + if (ret) + return ret; + + /* Set CONFIG1 register for PEC access retry number. */ + ret = regmap_write(data->regmap, MAX6621_CONFIG1_REG, + MAX6621_CONFIG1_INIT); + if (ret) + return ret; + + /* Sync registers with hardware. */ + regcache_mark_dirty(data->regmap); + ret = regcache_sync(data->regmap); + if (ret) + return ret; + + /* Verify which temperature input registers are enabled. */ + for (i = 0; i < MAX6621_TEMP_INPUT_REG_NUM; i++) { + ret = i2c_smbus_read_word_data(client, max6621_temp_regs[i]); + if (ret < 0) + return ret; + ret = max6621_verify_reg_data(dev, ret); + if (ret) { + data->input_chan2reg[i] = -1; + continue; + } + + data->input_chan2reg[i] = max6621_temp_regs[i]; + } + + hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, + data, + &max6621_chip_info, + NULL); + + return PTR_ERR_OR_ZERO(hwmon_dev); +} + +static const struct i2c_device_id max6621_id[] = { + { MAX6621_DRV_NAME, 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, max6621_id); + +static const struct of_device_id max6621_of_match[] = { + { .compatible = "maxim,max6621" }, + { } +}; +MODULE_DEVICE_TABLE(of, max6621_of_match); + +static struct i2c_driver max6621_driver = { + .class = I2C_CLASS_HWMON, + .driver = { + .name = MAX6621_DRV_NAME, + .of_match_table = of_match_ptr(max6621_of_match), + }, + .probe = max6621_probe, + .id_table = max6621_id, +}; + +module_i2c_driver(max6621_driver); + +MODULE_AUTHOR("Vadim Pasternak "); +MODULE_DESCRIPTION("Driver for Maxim MAX6621"); +MODULE_LICENSE("GPL"); From 24fae2b5b61da0cec6b94059552ae97ccc498ccc Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Tue, 3 Oct 2017 18:08:28 +0000 Subject: [PATCH 0657/1085] Documentation: devicetree: add max6621 device Add device record for Maxim MAX6621 temperature sensor device. 
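For reference, a board would describe the device as an ordinary I2C child node. A minimal sketch follows; the unit address 0x1a and the node name are placeholders, not datasheet-confirmed values:

	i2c {
		#address-cells = <1>;
		#size-cells = <0>;

		temp-sensor@1a {
			compatible = "maxim,max6621";
			reg = <0x1a>;
		};
	};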
Signed-off-by: Vadim Pasternak Acked-by: Rob Herring Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/trivial-devices.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/trivial-devices.txt b/Documentation/devicetree/bindings/trivial-devices.txt index af284fbd4d23..8bcac6ee73da 100644 --- a/Documentation/devicetree/bindings/trivial-devices.txt +++ b/Documentation/devicetree/bindings/trivial-devices.txt @@ -71,6 +71,7 @@ isil,isl29028 Intersil ISL29028 Ambient Light and Proximity Sensor isil,isl29030 Intersil ISL29030 Ambient Light and Proximity Sensor maxim,ds1050 5 Bit Programmable, Pulse-Width Modulator maxim,max1237 Low-Power, 4-/12-Channel, 2-Wire Serial, 12-Bit ADCs +maxim,max6621 PECI-to-I2C translator for PECI-to-SMBus/I2C protocol conversion maxim,max6625 9-Bit/12-Bit Temperature Sensors with I²C-Compatible Serial Interface mc,rv3029c2 Real Time Clock Module with I2C-Bus mcube,mc3230 mCube 3-axis 8-bit digital accelerometer From 9dfe310ed5f0802cba5ec29f2b5f5f92ba8ad9e9 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:03 +0200 Subject: [PATCH 0658/1085] hwmon: (gpio-fan) Move DT bindings to the right place This moves the GPIO fan bindings to the hwmon bindings. The GPIO fan is hwmon class hardware, not related to GPIO other than being a consumer of GPIOs. Cc: devicetree@vger.kernel.org Signed-off-by: Linus Walleij Acked-by: Rob Herring Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/{gpio => hwmon}/gpio-fan.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Documentation/devicetree/bindings/{gpio => hwmon}/gpio-fan.txt (100%) diff --git a/Documentation/devicetree/bindings/gpio/gpio-fan.txt b/Documentation/devicetree/bindings/hwmon/gpio-fan.txt similarity index 100% rename from Documentation/devicetree/bindings/gpio/gpio-fan.txt rename to Documentation/devicetree/bindings/hwmon/gpio-fan.txt From f9013c1677426df09022fe6fa0121e6fe9e1a0fa Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:04 +0200 Subject: [PATCH 0659/1085] hwmon: (gpio-fan) Use local variable pointers Create local struct device *dev and device_node *np pointers to make the code easier to read. Signed-off-by: Linus Walleij Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index 9c355b9d31c5..f29cee9398ef 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -541,22 +541,24 @@ static int gpio_fan_probe(struct platform_device *pdev) { int err; struct gpio_fan_data *fan_data; - struct gpio_fan_platform_data *pdata = dev_get_platdata(&pdev->dev); + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct gpio_fan_platform_data *pdata = dev_get_platdata(dev); - fan_data = devm_kzalloc(&pdev->dev, sizeof(struct gpio_fan_data), + fan_data = devm_kzalloc(dev, sizeof(struct gpio_fan_data), GFP_KERNEL); if (!fan_data) return -ENOMEM; #ifdef CONFIG_OF_GPIO if (!pdata) { - pdata = devm_kzalloc(&pdev->dev, + pdata = devm_kzalloc(dev, sizeof(struct gpio_fan_platform_data), GFP_KERNEL); if (!pdata) return -ENOMEM; - err = gpio_fan_get_of_pdata(&pdev->dev, pdata); + err = gpio_fan_get_of_pdata(dev, pdata); if (err) return err; } @@ -587,14 +589,14 @@ static int gpio_fan_probe(struct platform_device *pdev) /* Make this driver part of hwmon class.
*/ fan_data->hwmon_dev = - devm_hwmon_device_register_with_groups(&pdev->dev, + devm_hwmon_device_register_with_groups(dev, "gpio_fan", fan_data, gpio_fan_groups); if (IS_ERR(fan_data->hwmon_dev)) return PTR_ERR(fan_data->hwmon_dev); #ifdef CONFIG_OF_GPIO /* Optional cooling device register for Device tree platforms */ - fan_data->cdev = thermal_of_cooling_device_register(pdev->dev.of_node, + fan_data->cdev = thermal_of_cooling_device_register(np, "gpio-fan", fan_data, &gpio_fan_cool_ops); @@ -604,7 +606,7 @@ static int gpio_fan_probe(struct platform_device *pdev) &gpio_fan_cool_ops); #endif /* CONFIG_OF_GPIO */ - dev_info(&pdev->dev, "GPIO fan initialized\n"); + dev_info(dev, "GPIO fan initialized\n"); return 0; } From ef7a612415958de1f9afd86235d38b14975d0b7c Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:05 +0200 Subject: [PATCH 0660/1085] hwmon: (gpio-fan) Localize platform data There is not a single user of the platform data header in <linux/gpio-fan.h>. We can conclude that all current users are probing from the device tree, so start simplifying the code by pulling the header into the driver. Convert "unsigned" to "unsigned int" in the process to make checkpatch happy. Signed-off-by: Linus Walleij Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 23 ++++++++++++++++++++++- include/linux/gpio-fan.h | 36 ------------------------------------ 2 files changed, 22 insertions(+), 37 deletions(-) delete mode 100644 include/linux/gpio-fan.h diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index f29cee9398ef..cfa8d9b578dd 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -30,12 +30,33 @@ #include #include #include -#include <linux/gpio-fan.h> #include #include #include #include +struct gpio_fan_alarm { + unsigned int gpio; + unsigned int active_low; +}; + +struct gpio_fan_speed { + int rpm; + int ctrl_val; +}; + +struct gpio_fan_platform_data { + int num_ctrl; + unsigned int *ctrl; /* fan control GPIOs. */ + struct gpio_fan_alarm *alarm; /* fan alarm GPIO. */ + /* + * Speed conversion array: rpm from/to GPIO bit field. + * This array _must_ be sorted in ascending rpm order. + */ + int num_speed; + struct gpio_fan_speed *speed; +}; + struct gpio_fan_data { struct platform_device *pdev; struct device *hwmon_dev; diff --git a/include/linux/gpio-fan.h b/include/linux/gpio-fan.h deleted file mode 100644 index 096659169215..000000000000 --- a/include/linux/gpio-fan.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * include/linux/gpio-fan.h - * - * Platform data structure for GPIO fan driver - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - */ - -#ifndef __LINUX_GPIO_FAN_H -#define __LINUX_GPIO_FAN_H - -struct gpio_fan_alarm { - unsigned gpio; - unsigned active_low; -}; - -struct gpio_fan_speed { - int rpm; - int ctrl_val; -}; - -struct gpio_fan_platform_data { - int num_ctrl; - unsigned *ctrl; /* fan control GPIOs. */ - struct gpio_fan_alarm *alarm; /* fan alarm GPIO. */ - /* - * Speed conversion array: rpm from/to GPIO bit field. - * This array _must_ be sorted in ascending rpm order.
- */ - int num_speed; - struct gpio_fan_speed *speed; -}; - -#endif /* __LINUX_GPIO_FAN_H */ From 8c0eb9bc52fad2fec7a5ff40d5da85b74232f5de Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:06 +0200 Subject: [PATCH 0661/1085] hwmon: (gpio-fan) Send around device pointer The driver is storing the struct platform_device *pdev pointer but what it is really using and wants to pass around is a struct device *dev pointer. Signed-off-by: Linus Walleij Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index cfa8d9b578dd..ad7d8fdf4f81 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -58,7 +58,7 @@ struct gpio_fan_platform_data { }; struct gpio_fan_data { - struct platform_device *pdev; + struct device *dev; struct device *hwmon_dev; /* Cooling device if any */ struct thermal_cooling_device *cdev; @@ -85,8 +85,8 @@ static void fan_alarm_notify(struct work_struct *ws) struct gpio_fan_data *fan_data = container_of(ws, struct gpio_fan_data, alarm_work); - sysfs_notify(&fan_data->pdev->dev.kobj, NULL, "fan1_alarm"); - kobject_uevent(&fan_data->pdev->dev.kobj, KOBJ_CHANGE); + sysfs_notify(&fan_data->dev->kobj, NULL, "fan1_alarm"); + kobject_uevent(&fan_data->dev->kobj, KOBJ_CHANGE); } static irqreturn_t fan_alarm_irq_handler(int irq, void *dev_id) @@ -118,11 +118,11 @@ static int fan_alarm_init(struct gpio_fan_data *fan_data, { int err; int alarm_irq; - struct platform_device *pdev = fan_data->pdev; + struct device *dev = fan_data->dev; fan_data->alarm = alarm; - err = devm_gpio_request(&pdev->dev, alarm->gpio, "GPIO fan alarm"); + err = devm_gpio_request(dev, alarm->gpio, "GPIO fan alarm"); if (err) return err; @@ -140,7 +140,7 @@ static int fan_alarm_init(struct gpio_fan_data *fan_data, INIT_WORK(&fan_data->alarm_work, fan_alarm_notify); irq_set_irq_type(alarm_irq, IRQ_TYPE_EDGE_BOTH); - err = devm_request_irq(&pdev->dev, alarm_irq, fan_alarm_irq_handler, + err = devm_request_irq(dev, alarm_irq, fan_alarm_irq_handler, IRQF_SHARED, "GPIO fan alarm", fan_data); return err; } @@ -191,7 +191,7 @@ static int get_fan_speed_index(struct gpio_fan_data *fan_data) if (fan_data->speed[i].ctrl_val == ctrl_val) return i; - dev_warn(&fan_data->pdev->dev, + dev_warn(fan_data->dev, "missing speed array entry for GPIO value 0x%x\n", ctrl_val); return -ENODEV; @@ -382,13 +382,13 @@ static const struct attribute_group *gpio_fan_groups[] = { static int fan_ctrl_init(struct gpio_fan_data *fan_data, struct gpio_fan_platform_data *pdata) { - struct platform_device *pdev = fan_data->pdev; + struct device *dev = fan_data->dev; int num_ctrl = pdata->num_ctrl; unsigned *ctrl = pdata->ctrl; int i, err; for (i = 0; i < num_ctrl; i++) { - err = devm_gpio_request(&pdev->dev, ctrl[i], + err = devm_gpio_request(dev, ctrl[i], "GPIO fan control"); if (err) return err; @@ -588,7 +588,7 @@ static int gpio_fan_probe(struct platform_device *pdev) return -EINVAL; #endif /* CONFIG_OF_GPIO */ - fan_data->pdev = pdev; + fan_data->dev = dev; platform_set_drvdata(pdev, fan_data); mutex_init(&fan_data->lock); From a9b4c8afcd3d6c2b068e6ecf1a22ab26dd8c200e Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:07 +0200 Subject: [PATCH 0662/1085] hwmon: (gpio-fan) Mandate OF_GPIO and cut pdata path We have no users of platform data, and we made platform data driver-local, so cut all #ifdefs for the platform data case and depend on the Kconfig
CONFIG_OF_GPIO symbol. Signed-off-by: Linus Walleij Signed-off-by: Guenter Roeck --- drivers/hwmon/Kconfig | 1 + drivers/hwmon/gpio-fan.c | 36 ++++++++++-------------------------- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index fae8a8904c10..7ad017690e3a 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -552,6 +552,7 @@ config SENSORS_G762 config SENSORS_GPIO_FAN tristate "GPIO fan" + depends on OF_GPIO depends on GPIOLIB || COMPILE_TEST depends on THERMAL || THERMAL=n help diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index ad7d8fdf4f81..55dbdb223e02 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -453,7 +453,6 @@ static const struct thermal_cooling_device_ops gpio_fan_cool_ops = { .set_cur_state = gpio_fan_set_cur_state, }; -#ifdef CONFIG_OF_GPIO /* * Translate OpenFirmware node properties into platform_data */ @@ -556,7 +555,6 @@ static const struct of_device_id of_gpio_fan_match[] = { {}, }; MODULE_DEVICE_TABLE(of, of_gpio_fan_match); -#endif /* CONFIG_OF_GPIO */ static int gpio_fan_probe(struct platform_device *pdev) { @@ -564,29 +562,22 @@ static int gpio_fan_probe(struct platform_device *pdev) struct gpio_fan_data *fan_data; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; - struct gpio_fan_platform_data *pdata = dev_get_platdata(dev); + struct gpio_fan_platform_data *pdata; fan_data = devm_kzalloc(dev, sizeof(struct gpio_fan_data), GFP_KERNEL); if (!fan_data) return -ENOMEM; -#ifdef CONFIG_OF_GPIO - if (!pdata) { - pdata = devm_kzalloc(dev, - sizeof(struct gpio_fan_platform_data), - GFP_KERNEL); - if (!pdata) - return -ENOMEM; - - err = gpio_fan_get_of_pdata(dev, pdata); - if (err) - return err; - } -#else /* CONFIG_OF_GPIO */ + pdata = devm_kzalloc(dev, + sizeof(struct gpio_fan_platform_data), + GFP_KERNEL); if (!pdata) - return -EINVAL; -#endif /* CONFIG_OF_GPIO */ + return -ENOMEM; + + err = gpio_fan_get_of_pdata(dev, pdata); + if (err) + return err; fan_data->dev = dev; platform_set_drvdata(pdev, fan_data); mutex_init(&fan_data->lock); @@ -615,17 +606,12 @@ static int gpio_fan_probe(struct platform_device *pdev) gpio_fan_groups); if (IS_ERR(fan_data->hwmon_dev)) return PTR_ERR(fan_data->hwmon_dev); -#ifdef CONFIG_OF_GPIO + /* Optional cooling device register for Device tree platforms */ fan_data->cdev = thermal_of_cooling_device_register(np, "gpio-fan", fan_data, &gpio_fan_cool_ops); -#else /* CONFIG_OF_GPIO */ - /* Optional cooling device register for non Device tree platforms */ - fan_data->cdev = thermal_cooling_device_register("gpio-fan", fan_data, - &gpio_fan_cool_ops); -#endif /* CONFIG_OF_GPIO */ dev_info(dev, "GPIO fan initialized\n"); @@ -686,9 +672,7 @@ static struct platform_driver gpio_fan_driver = { .driver = { .name = "gpio-fan", .pm = GPIO_FAN_PM, -#ifdef CONFIG_OF_GPIO .of_match_table = of_match_ptr(of_gpio_fan_match), -#endif }, }; From b5482f7e6cc5639440bfa0b9bb4a3a9732883f53 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:08 +0200 Subject: [PATCH 0663/1085] hwmon: (gpio-fan) Get rid of platform data struct We are not passing the platform data struct into the driver from the outside, so there is no point in having it around separately. Instead of first populating the platform data struct and assigning the result into the same variables in the state container (struct gpio_fan_data), just assign the configuration from the device tree directly into the state container members.
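The shape of the change, as a simplified sketch (not the actual driver code; the parsed value stands in for the real DT parsing helpers):

static void probe_via_pdata(struct gpio_fan_data *fan_data,
			    struct gpio_fan_platform_data *pdata,
			    int parsed_num_speed)
{
	/* Before: parse into a temporary pdata struct, then copy over. */
	pdata->num_speed = parsed_num_speed;	/* intermediate step */
	fan_data->num_speed = pdata->num_speed;	/* redundant copy */
}

static void probe_direct(struct gpio_fan_data *fan_data,
			 int parsed_num_speed)
{
	/* After: parse straight into the state container. */
	fan_data->num_speed = parsed_num_speed;
}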
Signed-off-by: Linus Walleij Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 88 ++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 58 deletions(-) diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index 55dbdb223e02..000c8d2e0987 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -45,18 +45,6 @@ struct gpio_fan_speed { int ctrl_val; }; -struct gpio_fan_platform_data { - int num_ctrl; - unsigned int *ctrl; /* fan control GPIOs. */ - struct gpio_fan_alarm *alarm; /* fan alarm GPIO. */ - /* - * Speed conversion array: rpm from/to GPIO bit field. - * This array _must_ be sorted in ascending rpm order. - */ - int num_speed; - struct gpio_fan_speed *speed; -}; - struct gpio_fan_data { struct device *dev; struct device *hwmon_dev; @@ -113,14 +101,12 @@ static ssize_t fan1_alarm_show(struct device *dev, static DEVICE_ATTR_RO(fan1_alarm); -static int fan_alarm_init(struct gpio_fan_data *fan_data, - struct gpio_fan_alarm *alarm) +static int fan_alarm_init(struct gpio_fan_data *fan_data) { int err; int alarm_irq; struct device *dev = fan_data->dev; - - fan_data->alarm = alarm; + struct gpio_fan_alarm *alarm = fan_data->alarm; err = devm_gpio_request(dev, alarm->gpio, "GPIO fan alarm"); if (err) @@ -379,12 +365,11 @@ static const struct attribute_group *gpio_fan_groups[] = { NULL }; -static int fan_ctrl_init(struct gpio_fan_data *fan_data, - struct gpio_fan_platform_data *pdata) +static int fan_ctrl_init(struct gpio_fan_data *fan_data) { struct device *dev = fan_data->dev; - int num_ctrl = pdata->num_ctrl; - unsigned *ctrl = pdata->ctrl; + int num_ctrl = fan_data->num_ctrl; + unsigned int *ctrl = fan_data->ctrl; int i, err; for (i = 0; i < num_ctrl; i++) { @@ -399,10 +384,6 @@ static int fan_ctrl_init(struct gpio_fan_data *fan_data, return err; } - fan_data->num_ctrl = num_ctrl; - fan_data->ctrl = ctrl; - fan_data->num_speed = pdata->num_speed; - fan_data->speed = pdata->speed; fan_data->pwm_enable = true; /* Enable manual fan speed control. 
*/ fan_data->speed_index = get_fan_speed_index(fan_data); if (fan_data->speed_index < 0) @@ -456,21 +437,19 @@ static const struct thermal_cooling_device_ops gpio_fan_cool_ops = { /* * Translate OpenFirmware node properties into platform_data */ -static int gpio_fan_get_of_pdata(struct device *dev, - struct gpio_fan_platform_data *pdata) +static int gpio_fan_get_of_data(struct gpio_fan_data *fan_data) { - struct device_node *node; struct gpio_fan_speed *speed; + struct device *dev = fan_data->dev; + struct device_node *np = dev->of_node; unsigned *ctrl; unsigned i; u32 u; struct property *prop; const __be32 *p; - node = dev->of_node; - /* Alarm GPIO if one exists */ - if (of_gpio_named_count(node, "alarm-gpios") > 0) { + if (of_gpio_named_count(np, "alarm-gpios") > 0) { struct gpio_fan_alarm *alarm; int val; enum of_gpio_flags flags; @@ -480,39 +459,39 @@ static int gpio_fan_get_of_pdata(struct device *dev, if (!alarm) return -ENOMEM; - val = of_get_named_gpio_flags(node, "alarm-gpios", 0, &flags); + val = of_get_named_gpio_flags(np, "alarm-gpios", 0, &flags); if (val < 0) return val; alarm->gpio = val; alarm->active_low = flags & OF_GPIO_ACTIVE_LOW; - pdata->alarm = alarm; + fan_data->alarm = alarm; } /* Fill GPIO pin array */ - pdata->num_ctrl = of_gpio_count(node); - if (pdata->num_ctrl <= 0) { - if (pdata->alarm) + fan_data->num_ctrl = of_gpio_count(np); + if (fan_data->num_ctrl <= 0) { + if (fan_data->alarm) return 0; dev_err(dev, "DT properties empty / missing"); return -ENODEV; } - ctrl = devm_kzalloc(dev, pdata->num_ctrl * sizeof(unsigned), - GFP_KERNEL); + ctrl = devm_kzalloc(dev, fan_data->num_ctrl * sizeof(unsigned int), + GFP_KERNEL); if (!ctrl) return -ENOMEM; - for (i = 0; i < pdata->num_ctrl; i++) { + for (i = 0; i < fan_data->num_ctrl; i++) { int val; - val = of_get_gpio(node, i); + val = of_get_gpio(np, i); if (val < 0) return val; ctrl[i] = val; } - pdata->ctrl = ctrl; + fan_data->ctrl = ctrl; /* Get number of RPM/ctrl_val pairs in speed map */ - prop = of_find_property(node, "gpio-fan,speed-map", &i); + prop = of_find_property(np, "gpio-fan,speed-map", &i); if (!prop) { dev_err(dev, "gpio-fan,speed-map DT property missing"); return -ENODEV; @@ -522,7 +501,7 @@ static int gpio_fan_get_of_pdata(struct device *dev, dev_err(dev, "gpio-fan,speed-map contains zero/odd number of entries"); return -ENODEV; } - pdata->num_speed = i / 2; + fan_data->num_speed = i / 2; /* * Populate speed map @@ -530,12 +509,12 @@ static int gpio_fan_get_of_pdata(struct device *dev, * this needs splitting into pairs to create gpio_fan_speed structs */ speed = devm_kzalloc(dev, - pdata->num_speed * sizeof(struct gpio_fan_speed), + fan_data->num_speed * sizeof(struct gpio_fan_speed), GFP_KERNEL); if (!speed) return -ENOMEM; p = NULL; - for (i = 0; i < pdata->num_speed; i++) { + for (i = 0; i < fan_data->num_speed; i++) { p = of_prop_next_u32(prop, p, &u); if (!p) return -ENODEV; @@ -545,7 +524,7 @@ static int gpio_fan_get_of_pdata(struct device *dev, return -ENODEV; speed[i].ctrl_val = u; } - pdata->speed = speed; + fan_data->speed = speed; return 0; } @@ -562,20 +541,13 @@ static int gpio_fan_probe(struct platform_device *pdev) struct gpio_fan_data *fan_data; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; - struct gpio_fan_platform_data *pdata; fan_data = devm_kzalloc(dev, sizeof(struct gpio_fan_data), GFP_KERNEL); if (!fan_data) return -ENOMEM; - pdata = devm_kzalloc(dev, - sizeof(struct gpio_fan_platform_data), - GFP_KERNEL); - if (!pdata) - return -ENOMEM; - - err = 
gpio_fan_get_of_pdata(dev, pdata); + err = gpio_fan_get_of_data(fan_data); if (err) return err; @@ -584,17 +556,17 @@ static int gpio_fan_probe(struct platform_device *pdev) mutex_init(&fan_data->lock); /* Configure alarm GPIO if available. */ - if (pdata->alarm) { - err = fan_alarm_init(fan_data, pdata->alarm); + if (fan_data->alarm) { + err = fan_alarm_init(fan_data); if (err) return err; } /* Configure control GPIOs if available. */ - if (pdata->ctrl && pdata->num_ctrl > 0) { - if (!pdata->speed || pdata->num_speed <= 1) + if (fan_data->ctrl && fan_data->num_ctrl > 0) { + if (!fan_data->speed || fan_data->num_speed <= 1) return -EINVAL; - err = fan_ctrl_init(fan_data, pdata); + err = fan_ctrl_init(fan_data); if (err) return err; } From c9933cb16f166de57b0b8bce170c1b9476b89836 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:09 +0200 Subject: [PATCH 0664/1085] hwmon: (gpio-fan) Get rid of the gpio alarm struct There is no point in storing the GPIO alarm settings in their own struct so merge this into the main state container. Convert the variables from "unsigned" to "unsigned int" to make checkpatch happy. Signed-off-by: Linus Walleij Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index 000c8d2e0987..568ce4b25a9e 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -35,11 +35,6 @@ #include #include -struct gpio_fan_alarm { - unsigned int gpio; - unsigned int active_low; -}; - struct gpio_fan_speed { int rpm; int ctrl_val; @@ -60,7 +55,8 @@ struct gpio_fan_data { int resume_speed; #endif bool pwm_enable; - struct gpio_fan_alarm *alarm; + unsigned int alarm_gpio; + unsigned int alarm_gpio_active_low; struct work_struct alarm_work; }; @@ -90,10 +86,9 @@ static ssize_t fan1_alarm_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gpio_fan_data *fan_data = dev_get_drvdata(dev); - struct gpio_fan_alarm *alarm = fan_data->alarm; - int value = gpio_get_value_cansleep(alarm->gpio); + int value = gpio_get_value_cansleep(fan_data->alarm_gpio); - if (alarm->active_low) + if (fan_data->alarm_gpio_active_low) value = !value; return sprintf(buf, "%d\n", value); @@ -106,13 +101,12 @@ static int fan_alarm_init(struct gpio_fan_data *fan_data) int err; int alarm_irq; struct device *dev = fan_data->dev; - struct gpio_fan_alarm *alarm = fan_data->alarm; - err = devm_gpio_request(dev, alarm->gpio, "GPIO fan alarm"); + err = devm_gpio_request(dev, fan_data->alarm_gpio, "GPIO fan alarm"); if (err) return err; - err = gpio_direction_input(alarm->gpio); + err = gpio_direction_input(fan_data->alarm_gpio); if (err) return err; @@ -120,7 +114,7 @@ static int fan_alarm_init(struct gpio_fan_data *fan_data) * If the alarm GPIO don't support interrupts, just leave * without initializing the fail notification support. 
*/ - alarm_irq = gpio_to_irq(alarm->gpio); + alarm_irq = gpio_to_irq(fan_data->alarm_gpio); if (alarm_irq < 0) return 0; @@ -335,7 +329,7 @@ static umode_t gpio_fan_is_visible(struct kobject *kobj, struct device *dev = container_of(kobj, struct device, kobj); struct gpio_fan_data *data = dev_get_drvdata(dev); - if (index == 0 && !data->alarm) + if (index == 0 && !data->alarm_gpio) return 0; if (index > 0 && !data->ctrl) return 0; @@ -450,28 +444,20 @@ static int gpio_fan_get_of_data(struct gpio_fan_data *fan_data) /* Alarm GPIO if one exists */ if (of_gpio_named_count(np, "alarm-gpios") > 0) { - struct gpio_fan_alarm *alarm; int val; enum of_gpio_flags flags; - alarm = devm_kzalloc(dev, sizeof(struct gpio_fan_alarm), - GFP_KERNEL); - if (!alarm) - return -ENOMEM; - val = of_get_named_gpio_flags(np, "alarm-gpios", 0, &flags); if (val < 0) return val; - alarm->gpio = val; - alarm->active_low = flags & OF_GPIO_ACTIVE_LOW; - - fan_data->alarm = alarm; + fan_data->alarm_gpio = val; + fan_data->alarm_gpio_active_low = flags & OF_GPIO_ACTIVE_LOW; } /* Fill GPIO pin array */ fan_data->num_ctrl = of_gpio_count(np); if (fan_data->num_ctrl <= 0) { - if (fan_data->alarm) + if (fan_data->alarm_gpio) return 0; dev_err(dev, "DT properties empty / missing"); return -ENODEV; @@ -556,7 +542,7 @@ static int gpio_fan_probe(struct platform_device *pdev) mutex_init(&fan_data->lock); /* Configure alarm GPIO if available. */ - if (fan_data->alarm) { + if (fan_data->alarm_gpio) { err = fan_alarm_init(fan_data); if (err) return err; From e99c2e5d6cde7f06dac0444b5edd6ed0d1abc431 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:10 +0200 Subject: [PATCH 0665/1085] hwmon: (gpio-fan) Rename GPIO line state variables The "ctrl" and "num_ctrl" entries in the state container struct is ambiguously named "ctrl" and "num_ctrl" overlapping with some hwmon lingo and making it hard to understand. Since this array actually contains the GPIO line numbers, from the Linux global GPIO numberspace, used to control the different fan speeds. Rename these fields to "gpios" (pluralis) and "num_gpios" so as to make it unambiguous. Convert some instances of "unsigned" to "unsigned int" to keep checkpatch happy. Signed-off-by: Linus Walleij Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 51 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index 568ce4b25a9e..18b3c7c27d36 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -46,8 +46,8 @@ struct gpio_fan_data { /* Cooling device if any */ struct thermal_cooling_device *cdev; struct mutex lock; /* lock GPIOs operations. 
*/ - int num_ctrl; - unsigned *ctrl; + int num_gpios; + unsigned int *gpios; int num_speed; struct gpio_fan_speed *speed; int speed_index; @@ -134,8 +134,9 @@ static void __set_fan_ctrl(struct gpio_fan_data *fan_data, int ctrl_val) { int i; - for (i = 0; i < fan_data->num_ctrl; i++) - gpio_set_value_cansleep(fan_data->ctrl[i], (ctrl_val >> i) & 1); + for (i = 0; i < fan_data->num_gpios; i++) + gpio_set_value_cansleep(fan_data->gpios[i], + (ctrl_val >> i) & 1); } static int __get_fan_ctrl(struct gpio_fan_data *fan_data) @@ -143,10 +144,10 @@ static int __get_fan_ctrl(struct gpio_fan_data *fan_data) int i; int ctrl_val = 0; - for (i = 0; i < fan_data->num_ctrl; i++) { + for (i = 0; i < fan_data->num_gpios; i++) { int value; - value = gpio_get_value_cansleep(fan_data->ctrl[i]); + value = gpio_get_value_cansleep(fan_data->gpios[i]); ctrl_val |= (value << i); } return ctrl_val; @@ -331,7 +332,7 @@ static umode_t gpio_fan_is_visible(struct kobject *kobj, if (index == 0 && !data->alarm_gpio) return 0; - if (index > 0 && !data->ctrl) + if (index > 0 && !data->gpios) return 0; return attr->mode; @@ -362,18 +363,18 @@ static const struct attribute_group *gpio_fan_groups[] = { static int fan_ctrl_init(struct gpio_fan_data *fan_data) { struct device *dev = fan_data->dev; - int num_ctrl = fan_data->num_ctrl; - unsigned int *ctrl = fan_data->ctrl; + int num_gpios = fan_data->num_gpios; + unsigned int *gpios = fan_data->gpios; int i, err; - for (i = 0; i < num_ctrl; i++) { - err = devm_gpio_request(dev, ctrl[i], + for (i = 0; i < num_gpios; i++) { + err = devm_gpio_request(dev, gpios[i], "GPIO fan control"); if (err) return err; - err = gpio_direction_output(ctrl[i], - gpio_get_value_cansleep(ctrl[i])); + err = gpio_direction_output(gpios[i], + gpio_get_value_cansleep(gpios[i])); if (err) return err; } @@ -436,7 +437,7 @@ static int gpio_fan_get_of_data(struct gpio_fan_data *fan_data) struct gpio_fan_speed *speed; struct device *dev = fan_data->dev; struct device_node *np = dev->of_node; - unsigned *ctrl; + unsigned int *gpios; unsigned i; u32 u; struct property *prop; @@ -455,26 +456,26 @@ static int gpio_fan_get_of_data(struct gpio_fan_data *fan_data) } /* Fill GPIO pin array */ - fan_data->num_ctrl = of_gpio_count(np); - if (fan_data->num_ctrl <= 0) { + fan_data->num_gpios = of_gpio_count(np); + if (fan_data->num_gpios <= 0) { if (fan_data->alarm_gpio) return 0; dev_err(dev, "DT properties empty / missing"); return -ENODEV; } - ctrl = devm_kzalloc(dev, fan_data->num_ctrl * sizeof(unsigned int), + gpios = devm_kzalloc(dev, fan_data->num_gpios * sizeof(unsigned int), GFP_KERNEL); - if (!ctrl) + if (!gpios) return -ENOMEM; - for (i = 0; i < fan_data->num_ctrl; i++) { + for (i = 0; i < fan_data->num_gpios; i++) { int val; val = of_get_gpio(np, i); if (val < 0) return val; - ctrl[i] = val; + gpios[i] = val; } - fan_data->ctrl = ctrl; + fan_data->gpios = gpios; /* Get number of RPM/ctrl_val pairs in speed map */ prop = of_find_property(np, "gpio-fan,speed-map", &i); @@ -549,7 +550,7 @@ static int gpio_fan_probe(struct platform_device *pdev) } /* Configure control GPIOs if available. 
*/ - if (fan_data->ctrl && fan_data->num_ctrl > 0) { + if (fan_data->gpios && fan_data->num_gpios > 0) { if (!fan_data->speed || fan_data->num_speed <= 1) return -EINVAL; err = fan_ctrl_init(fan_data); @@ -583,7 +584,7 @@ static int gpio_fan_remove(struct platform_device *pdev) if (!IS_ERR(fan_data->cdev)) thermal_cooling_device_unregister(fan_data->cdev); - if (fan_data->ctrl) + if (fan_data->gpios) set_fan_speed(fan_data, 0); return 0; @@ -599,7 +600,7 @@ static int gpio_fan_suspend(struct device *dev) { struct gpio_fan_data *fan_data = dev_get_drvdata(dev); - if (fan_data->ctrl) { + if (fan_data->gpios) { fan_data->resume_speed = fan_data->speed_index; set_fan_speed(fan_data, 0); } @@ -611,7 +612,7 @@ static int gpio_fan_resume(struct device *dev) { struct gpio_fan_data *fan_data = dev_get_drvdata(dev); - if (fan_data->ctrl) + if (fan_data->gpios) set_fan_speed(fan_data, fan_data->resume_speed); return 0; From 9de382fddf18f673436f1058d822e1236a0b4c2a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 9 Oct 2017 01:14:32 +0200 Subject: [PATCH 0666/1085] hwmon: (gpio-fan) Convert to use GPIO descriptors This converts the GPIO fan driver to use GPIO descriptors. This way we avoid indirection since the gpiolib anyway just use descriptors inside, and we also get rid of explicit polarity handling: the descriptors internally knows if the line is active high or active low. Signed-off-by: Linus Walleij [groeck: Line length] Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 88 +++++++++++++++------------------------- 1 file changed, 32 insertions(+), 56 deletions(-) diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index 18b3c7c27d36..43b697380987 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -29,10 +29,9 @@ #include #include #include -#include +#include #include #include -#include #include struct gpio_fan_speed { @@ -47,7 +46,7 @@ struct gpio_fan_data { struct thermal_cooling_device *cdev; struct mutex lock; /* lock GPIOs operations. */ int num_gpios; - unsigned int *gpios; + struct gpio_desc **gpios; int num_speed; struct gpio_fan_speed *speed; int speed_index; @@ -55,8 +54,7 @@ struct gpio_fan_data { int resume_speed; #endif bool pwm_enable; - unsigned int alarm_gpio; - unsigned int alarm_gpio_active_low; + struct gpio_desc *alarm_gpio; struct work_struct alarm_work; }; @@ -86,43 +84,30 @@ static ssize_t fan1_alarm_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gpio_fan_data *fan_data = dev_get_drvdata(dev); - int value = gpio_get_value_cansleep(fan_data->alarm_gpio); - if (fan_data->alarm_gpio_active_low) - value = !value; - - return sprintf(buf, "%d\n", value); + return sprintf(buf, "%d\n", + gpiod_get_value_cansleep(fan_data->alarm_gpio)); } static DEVICE_ATTR_RO(fan1_alarm); static int fan_alarm_init(struct gpio_fan_data *fan_data) { - int err; int alarm_irq; struct device *dev = fan_data->dev; - err = devm_gpio_request(dev, fan_data->alarm_gpio, "GPIO fan alarm"); - if (err) - return err; - - err = gpio_direction_input(fan_data->alarm_gpio); - if (err) - return err; - /* * If the alarm GPIO don't support interrupts, just leave * without initializing the fail notification support. 
*/ - alarm_irq = gpio_to_irq(fan_data->alarm_gpio); - if (alarm_irq < 0) + alarm_irq = gpiod_to_irq(fan_data->alarm_gpio); + if (alarm_irq <= 0) return 0; INIT_WORK(&fan_data->alarm_work, fan_alarm_notify); irq_set_irq_type(alarm_irq, IRQ_TYPE_EDGE_BOTH); - err = devm_request_irq(dev, alarm_irq, fan_alarm_irq_handler, - IRQF_SHARED, "GPIO fan alarm", fan_data); - return err; + return devm_request_irq(dev, alarm_irq, fan_alarm_irq_handler, + IRQF_SHARED, "GPIO fan alarm", fan_data); } /* @@ -135,8 +120,8 @@ static void __set_fan_ctrl(struct gpio_fan_data *fan_data, int ctrl_val) int i; for (i = 0; i < fan_data->num_gpios; i++) - gpio_set_value_cansleep(fan_data->gpios[i], - (ctrl_val >> i) & 1); + gpiod_set_value_cansleep(fan_data->gpios[i], + (ctrl_val >> i) & 1); } static int __get_fan_ctrl(struct gpio_fan_data *fan_data) @@ -147,7 +132,7 @@ static int __get_fan_ctrl(struct gpio_fan_data *fan_data) for (i = 0; i < fan_data->num_gpios; i++) { int value; - value = gpio_get_value_cansleep(fan_data->gpios[i]); + value = gpiod_get_value_cansleep(fan_data->gpios[i]); ctrl_val |= (value << i); } return ctrl_val; @@ -362,19 +347,19 @@ static const struct attribute_group *gpio_fan_groups[] = { static int fan_ctrl_init(struct gpio_fan_data *fan_data) { - struct device *dev = fan_data->dev; int num_gpios = fan_data->num_gpios; - unsigned int *gpios = fan_data->gpios; + struct gpio_desc **gpios = fan_data->gpios; int i, err; for (i = 0; i < num_gpios; i++) { - err = devm_gpio_request(dev, gpios[i], - "GPIO fan control"); - if (err) - return err; - - err = gpio_direction_output(gpios[i], - gpio_get_value_cansleep(gpios[i])); + /* + * The GPIO descriptors were retrieved with GPIOD_ASIS so here + * we set the GPIO into output mode, carefully preserving the + * current value by setting it to whatever it is already set + * (no surprise changes in default fan speed). 
+ */ + err = gpiod_direction_output(gpios[i], + gpiod_get_value_cansleep(gpios[i])); if (err) return err; } @@ -437,43 +422,34 @@ static int gpio_fan_get_of_data(struct gpio_fan_data *fan_data) struct gpio_fan_speed *speed; struct device *dev = fan_data->dev; struct device_node *np = dev->of_node; - unsigned int *gpios; + struct gpio_desc **gpios; unsigned i; u32 u; struct property *prop; const __be32 *p; /* Alarm GPIO if one exists */ - if (of_gpio_named_count(np, "alarm-gpios") > 0) { - int val; - enum of_gpio_flags flags; - - val = of_get_named_gpio_flags(np, "alarm-gpios", 0, &flags); - if (val < 0) - return val; - fan_data->alarm_gpio = val; - fan_data->alarm_gpio_active_low = flags & OF_GPIO_ACTIVE_LOW; - } + fan_data->alarm_gpio = devm_gpiod_get_optional(dev, "alarm", GPIOD_IN); + if (IS_ERR(fan_data->alarm_gpio)) + return PTR_ERR(fan_data->alarm_gpio); /* Fill GPIO pin array */ - fan_data->num_gpios = of_gpio_count(np); + fan_data->num_gpios = gpiod_count(dev, NULL); if (fan_data->num_gpios <= 0) { if (fan_data->alarm_gpio) return 0; dev_err(dev, "DT properties empty / missing"); return -ENODEV; } - gpios = devm_kzalloc(dev, fan_data->num_gpios * sizeof(unsigned int), - GFP_KERNEL); + gpios = devm_kzalloc(dev, + fan_data->num_gpios * sizeof(struct gpio_desc *), + GFP_KERNEL); if (!gpios) return -ENOMEM; for (i = 0; i < fan_data->num_gpios; i++) { - int val; - - val = of_get_gpio(np, i); - if (val < 0) - return val; - gpios[i] = val; + gpios[i] = devm_gpiod_get_index(dev, NULL, i, GPIOD_ASIS); + if (IS_ERR(gpios[i])) + return PTR_ERR(gpios[i]); } fan_data->gpios = gpios; From 534e28d876926669bba0dc31519a0b0026f3dfcb Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 16 Oct 2017 14:12:10 +0200 Subject: [PATCH 0667/1085] hwmon: (gpio-fan) Fix null pointer dereference at probe A previous commit changed the argument list of gpio_fan_get_of_data(), removing the "struct *dev" argument and retrieving it instead from the gpio_fan_data structure. The "dev" entry of gpio_fan_data was then dereferenced to access the of_node field, leading to a kernel panic during the probe as the "dev" entry of the gpio_fan_data structure was not filled yet. Fix this by setting fan_data->dev before calling gpio_fan_get_of_data(). Fixes: 5859d8d30737 ("hwmon: (gpio-fan) Get rid of platform data struct") Signed-off-by: Miquel Raynal Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index 43b697380987..5c9a52599cf6 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -510,11 +510,11 @@ static int gpio_fan_probe(struct platform_device *pdev) if (!fan_data) return -ENOMEM; + fan_data->dev = dev; err = gpio_fan_get_of_data(fan_data); if (err) return err; - fan_data->dev = dev; platform_set_drvdata(pdev, fan_data); mutex_init(&fan_data->lock); From 749d782d80de6db454af4336a4f4c8dfe79a5fc4 Mon Sep 17 00:00:00 2001 From: hotran Date: Tue, 17 Oct 2017 11:28:34 -0700 Subject: [PATCH 0668/1085] hwmon: (xgene) Support hwmon v2 This patch supports xgene-hwmon v2 which uses the non-cachable memory as the PCC shared memory. 
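In short, the mapping choice now keys off the ACPI ID's driver data; the decision reduces to roughly this (a sketch with a made-up helper name; the calls mirror the hunk below):

static void *xgene_map_pcc_shmem(phys_addr_t base, size_t len, int version)
{
	if (version == XGENE_HWMON_V2)
		/* v2: the PCC shared memory must be mapped non-cacheable */
		return (void __force *)ioremap(base, len);

	/* v1: a cacheable write-back mapping is fine */
	return memremap(base, len, MEMREMAP_WB);
}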
Signed-off-by: Hoan Tran Signed-off-by: Guenter Roeck --- drivers/hwmon/xgene-hwmon.c | 40 ++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c index e1be61095532..57834361fb3c 100644 --- a/drivers/hwmon/xgene-hwmon.c +++ b/drivers/hwmon/xgene-hwmon.c @@ -91,6 +91,11 @@ #define to_xgene_hwmon_dev(cl) \ container_of(cl, struct xgene_hwmon_dev, mbox_client) +enum xgene_hwmon_version { + XGENE_HWMON_V1 = 0, + XGENE_HWMON_V2 = 1, +}; + struct slimpro_resp_msg { u32 msg; u32 param1; @@ -609,6 +614,15 @@ static void xgene_hwmon_tx_done(struct mbox_client *cl, void *msg, int ret) } } +#ifdef CONFIG_ACPI +static const struct acpi_device_id xgene_hwmon_acpi_match[] = { + {"APMC0D29", XGENE_HWMON_V1}, + {"APMC0D8A", XGENE_HWMON_V2}, + {}, +}; +MODULE_DEVICE_TABLE(acpi, xgene_hwmon_acpi_match); +#endif + static int xgene_hwmon_probe(struct platform_device *pdev) { struct xgene_hwmon_dev *ctx; @@ -651,6 +665,16 @@ static int xgene_hwmon_probe(struct platform_device *pdev) } } else { struct acpi_pcct_hw_reduced *cppc_ss; + int version = XGENE_HWMON_V1; +#ifdef CONFIG_ACPI + const struct acpi_device_id *acpi_id; + + acpi_id = acpi_match_device(xgene_hwmon_acpi_match, &pdev->dev); + if (!acpi_id) + return -EINVAL; + + version = (int)acpi_id->driver_data; +#endif if (device_property_read_u32(&pdev->dev, "pcc-channel", &ctx->mbox_idx)) { @@ -693,7 +717,13 @@ static int xgene_hwmon_probe(struct platform_device *pdev) */ ctx->comm_base_addr = cppc_ss->base_address; if (ctx->comm_base_addr) { - ctx->pcc_comm_addr = memremap(ctx->comm_base_addr, + if (version == XGENE_HWMON_V2) + ctx->pcc_comm_addr = (void __force *)ioremap( + ctx->comm_base_addr, + cppc_ss->length); + else + ctx->pcc_comm_addr = memremap( + ctx->comm_base_addr, cppc_ss->length, MEMREMAP_WB); } else { @@ -761,14 +791,6 @@ static int xgene_hwmon_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_ACPI -static const struct acpi_device_id xgene_hwmon_acpi_match[] = { - {"APMC0D29", 0}, - {}, -}; -MODULE_DEVICE_TABLE(acpi, xgene_hwmon_acpi_match); -#endif - static const struct of_device_id xgene_hwmon_of_match[] = { {.compatible = "apm,xgene-slimpro-hwmon"}, {} From a3bdc5b5bd369fcab8306b80d2c740e332183d20 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 18 Oct 2017 13:10:38 +0100 Subject: [PATCH 0669/1085] hwmon: (asc7621) remove redundant assignment to newval The setting of newval to zero is redundant as the following if/else stanzas will always update newval to a new value. Remove the redundant setting, cleans up clang build warning: drivers/hwmon/asc7621.c:582:2: warning: Value stored to 'newval' is never read Signed-off-by: Colin Ian King Signed-off-by: Guenter Roeck --- drivers/hwmon/asc7621.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/hwmon/asc7621.c b/drivers/hwmon/asc7621.c index 4875e99b59c9..6d34c05a4f83 100644 --- a/drivers/hwmon/asc7621.c +++ b/drivers/hwmon/asc7621.c @@ -579,7 +579,6 @@ static ssize_t show_pwm_enable(struct device *dev, mutex_unlock(&data->update_lock); val = config | (altbit << 3); - newval = 0; if (val == 3 || val >= 10) newval = 255; From 5813da157f7dbe03d268c8d0dd9425d0e3944910 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Oct 2017 12:47:42 +0300 Subject: [PATCH 0670/1085] hwmon: (max6621) Inverted if condition in max6621_read() We intended to test for failure here but accidentally tested for success. 
This means that we don't set "*val" to true, and that if i2c_smbus_write_byte() does fail we return success. Fixes: e7895864b0d7 ("hwmon: (max6621) Add support for Maxim MAX6621 temperature sensor") Signed-off-by: Dan Carpenter Acked-by: Vadim Pasternak Signed-off-by: Guenter Roeck --- drivers/hwmon/max6621.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/max6621.c b/drivers/hwmon/max6621.c index 22079ec29660..35555f0eefb9 100644 --- a/drivers/hwmon/max6621.c +++ b/drivers/hwmon/max6621.c @@ -296,7 +296,7 @@ max6621_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, if (regval) { ret = i2c_smbus_write_byte(data->client, MAX6621_CLEAR_ALERT_REG); - if (!ret) + if (ret) return ret; } From ca5d376e17072c1b60c3fee66f3be58ef018952d Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Sat, 28 Oct 2017 14:06:44 +0800 Subject: [PATCH 0671/1085] x86/paravirt: Set up the virt_spin_lock_key after static keys get initialized Commit: 9043442b43b1 ("locking/paravirt: Use new static key for controlling call of virt_spin_lock()") sets the static virt_spin_lock_key to a value before jump_label_init() has been called, which will result in a WARN(). Reorder the initialization sequence: - Move native_pv_lock_init() into native_smp_prepare_cpus() - Set the value in xen_init_lock_cpu() to avoid calling into the not yet initialized static keys subsystem. Suggested-by: Juergen Gross Reported-by: Juergen Gross
register_page_bootmem_memmap()'s 3rd 'size' parameter is named in a somewhat misleading fashion - rename it to 'nr_pages' which makes the units of it much clearer. Meanwhile rename the existing local variable 'nr_pages' to 'nr_pmd_pages', a more expressive name, to avoid conflict with new function parameter 'nr_pages'. (Also clean up the unnecessary parentheses in which get_order() is called.) Signed-off-by: Baoquan He Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akpm@linux-foundation.org Link: http://lkml.kernel.org/r/1509154238-23250-1-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 10 +++++----- include/linux/mm.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 048fbe8fc274..adcea90a2046 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1426,16 +1426,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) void register_page_bootmem_memmap(unsigned long section_nr, - struct page *start_page, unsigned long size) + struct page *start_page, unsigned long nr_pages) { unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long end = (unsigned long)(start_page + nr_pages); unsigned long next; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; - unsigned int nr_pages; + unsigned int nr_pmd_pages; struct page *page; for (; addr < end; addr = next) { @@ -1482,9 +1482,9 @@ void register_page_bootmem_memmap(unsigned long section_nr, if (pmd_none(*pmd)) continue; - nr_pages = 1 << (get_order(PMD_SIZE)); + nr_pmd_pages = 1 << get_order(PMD_SIZE); page = pmd_page(*pmd); - while (nr_pages--) + while (nr_pmd_pages--) get_page_bootmem(section_nr, page++, SECTION_INFO); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 065d99deb847..b2c7045e9604 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2495,7 +2495,7 @@ void vmemmap_populate_print_last(void); void vmemmap_free(unsigned long start, unsigned long end); #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, - unsigned long size); + unsigned long nr_pages); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, From 68481a7e1c84b652c03bd714c1028fb8ea680a9d Mon Sep 17 00:00:00 2001 From: Krishna Reddy Date: Fri, 8 Sep 2017 12:48:33 -0700 Subject: [PATCH 0673/1085] mmc: tegra: Mark 64 bit dma broken on Tegra186 SDHCI controllers on Tegra186 support 40 bit addressing. IOVA addresses are 48-bit wide on Tegra186. SDHCI host common code sets dma mask as either 32-bit or 64-bit. To avoid access issues when SMMU is enabled, disable 64-bit dma. Signed-off-by: Krishna Reddy Tested-by: Thierry Reding Acked-by: Thierry Reding Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 0cd6fa80db66..b877c13184c2 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -422,7 +422,15 @@ static const struct sdhci_pltfm_data sdhci_tegra186_pdata = { SDHCI_QUIRK_NO_HISPD_BIT | SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, - .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | + /* SDHCI controllers on Tegra186 support 40-bit addressing. 
+ * IOVA addresses are 48-bit wide on Tegra186. + * With 64-bit dma mask used for SDHCI, accesses can + * be broken. Disable 64-bit dma, which would fall back + * to 32-bit dma mask. Ideally 40-bit dma mask would work, + * But it is not supported as of now. + */ + SDHCI_QUIRK2_BROKEN_64_BIT_DMA, .ops = &tegra114_sdhci_ops, }; From e4bf91f6723e493175eedfd2760f0ffda19756fa Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 15 Sep 2017 16:35:23 -0700 Subject: [PATCH 0674/1085] mmc: sdhci-msm: Utilize bulk clock API By stuffing the runtime controlled clocks into a clk_bulk_data array we can utilize the newly introduced bulk clock operations and clean up the error paths. This allow us to handle additional clocks in subsequent patch, without the added complexity. Cc: Ritesh Harjani Signed-off-by: Bjorn Andersson Tested-by: Jeremy McNicoll Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 80 +++++++++++++++--------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index fc73e56eb1e2..526e19eced2f 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -127,10 +127,9 @@ struct sdhci_msm_host { struct platform_device *pdev; void __iomem *core_mem; /* MSM SDCC mapped address */ int pwr_irq; /* power irq */ - struct clk *clk; /* main SD/MMC bus clock */ - struct clk *pclk; /* SDHC peripheral bus clock */ struct clk *bus_clk; /* SDHC bus voter clock */ struct clk *xo_clk; /* TCXO clk needed for FLL feature of cm_dll*/ + struct clk_bulk_data bulk_clks[2]; /* core, iface clocks */ unsigned long clk_rate; struct mmc_host *mmc; bool use_14lpp_dll_reset; @@ -164,10 +163,11 @@ static void msm_set_clock_rate_for_bus_mode(struct sdhci_host *host, struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); struct mmc_ios curr_ios = host->mmc->ios; + struct clk *core_clk = msm_host->bulk_clks[0].clk; int rc; clock = msm_get_clock_rate_for_bus_mode(host, clock); - rc = clk_set_rate(msm_host->clk, clock); + rc = clk_set_rate(core_clk, clock); if (rc) { pr_err("%s: Failed to set clock at rate %u at timing %d\n", mmc_hostname(host->mmc), clock, @@ -176,7 +176,7 @@ static void msm_set_clock_rate_for_bus_mode(struct sdhci_host *host, } msm_host->clk_rate = clock; pr_debug("%s: Setting clock at rate %lu at timing %d\n", - mmc_hostname(host->mmc), clk_get_rate(msm_host->clk), + mmc_hostname(host->mmc), clk_get_rate(core_clk), curr_ios.timing); } @@ -1032,8 +1032,9 @@ static unsigned int sdhci_msm_get_max_clock(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); + struct clk *core_clk = msm_host->bulk_clks[0].clk; - return clk_round_rate(msm_host->clk, ULONG_MAX); + return clk_round_rate(core_clk, ULONG_MAX); } static unsigned int sdhci_msm_get_min_clock(struct sdhci_host *host) @@ -1124,6 +1125,7 @@ static int sdhci_msm_probe(struct platform_device *pdev) struct sdhci_pltfm_host *pltfm_host; struct sdhci_msm_host *msm_host; struct resource *core_memres; + struct clk *clk; int ret; u16 host_version, core_minor; u32 core_version, config; @@ -1160,24 +1162,32 @@ static int sdhci_msm_probe(struct platform_device *pdev) } /* Setup main peripheral bus clock */ - msm_host->pclk = devm_clk_get(&pdev->dev, "iface"); - if (IS_ERR(msm_host->pclk)) { - ret = PTR_ERR(msm_host->pclk); + clk = devm_clk_get(&pdev->dev, "iface"); + if (IS_ERR(clk)) { + ret 
= PTR_ERR(clk); dev_err(&pdev->dev, "Peripheral clk setup failed (%d)\n", ret); goto bus_clk_disable; } - - ret = clk_prepare_enable(msm_host->pclk); - if (ret) - goto bus_clk_disable; + msm_host->bulk_clks[1].clk = clk; /* Setup SDC MMC clock */ - msm_host->clk = devm_clk_get(&pdev->dev, "core"); - if (IS_ERR(msm_host->clk)) { - ret = PTR_ERR(msm_host->clk); + clk = devm_clk_get(&pdev->dev, "core"); + if (IS_ERR(clk)) { + ret = PTR_ERR(clk); dev_err(&pdev->dev, "SDC MMC clk setup failed (%d)\n", ret); - goto pclk_disable; + goto bus_clk_disable; } + msm_host->bulk_clks[0].clk = clk; + + /* Vote for maximum clock rate for maximum performance */ + ret = clk_set_rate(clk, INT_MAX); + if (ret) + dev_warn(&pdev->dev, "core clock boost failed\n"); + + ret = clk_bulk_prepare_enable(ARRAY_SIZE(msm_host->bulk_clks), + msm_host->bulk_clks); + if (ret) + goto bus_clk_disable; /* * xo clock is needed for FLL feature of cm_dll. @@ -1189,15 +1199,6 @@ static int sdhci_msm_probe(struct platform_device *pdev) dev_warn(&pdev->dev, "TCXO clk not present (%d)\n", ret); } - /* Vote for maximum clock rate for maximum performance */ - ret = clk_set_rate(msm_host->clk, INT_MAX); - if (ret) - dev_warn(&pdev->dev, "core clock boost failed\n"); - - ret = clk_prepare_enable(msm_host->clk); - if (ret) - goto pclk_disable; - core_memres = platform_get_resource(pdev, IORESOURCE_MEM, 1); msm_host->core_mem = devm_ioremap_resource(&pdev->dev, core_memres); @@ -1290,9 +1291,8 @@ pm_runtime_disable: pm_runtime_set_suspended(&pdev->dev); pm_runtime_put_noidle(&pdev->dev); clk_disable: - clk_disable_unprepare(msm_host->clk); -pclk_disable: - clk_disable_unprepare(msm_host->pclk); + clk_bulk_disable_unprepare(ARRAY_SIZE(msm_host->bulk_clks), + msm_host->bulk_clks); bus_clk_disable: if (!IS_ERR(msm_host->bus_clk)) clk_disable_unprepare(msm_host->bus_clk); @@ -1315,8 +1315,8 @@ static int sdhci_msm_remove(struct platform_device *pdev) pm_runtime_disable(&pdev->dev); pm_runtime_put_noidle(&pdev->dev); - clk_disable_unprepare(msm_host->clk); - clk_disable_unprepare(msm_host->pclk); + clk_bulk_disable_unprepare(ARRAY_SIZE(msm_host->bulk_clks), + msm_host->bulk_clks); if (!IS_ERR(msm_host->bus_clk)) clk_disable_unprepare(msm_host->bus_clk); sdhci_pltfm_free(pdev); @@ -1330,8 +1330,8 @@ static int sdhci_msm_runtime_suspend(struct device *dev) struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); - clk_disable_unprepare(msm_host->clk); - clk_disable_unprepare(msm_host->pclk); + clk_bulk_disable_unprepare(ARRAY_SIZE(msm_host->bulk_clks), + msm_host->bulk_clks); return 0; } @@ -1341,21 +1341,9 @@ static int sdhci_msm_runtime_resume(struct device *dev) struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); - int ret; - ret = clk_prepare_enable(msm_host->clk); - if (ret) { - dev_err(dev, "clk_enable failed for core_clk: %d\n", ret); - return ret; - } - ret = clk_prepare_enable(msm_host->pclk); - if (ret) { - dev_err(dev, "clk_enable failed for iface_clk: %d\n", ret); - clk_disable_unprepare(msm_host->clk); - return ret; - } - - return 0; + return clk_bulk_prepare_enable(ARRAY_SIZE(msm_host->bulk_clks), + msm_host->bulk_clks); } #endif From 4946b3af5e8e96f0f0ceec53701541f1faae714d Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 15 Sep 2017 16:35:24 -0700 Subject: [PATCH 0675/1085] mmc: sdhci-msm: Enable delay circuit calibration clocks The delay 
circuit used to support HS400 is calibrated based on two additional clocks. When these clocks are not available and FF_CLK_SW_RST_DIS is not set in CORE_HC_MODE, reset might fail. But on some platforms this doesn't work properly and below dump can be seen in the kernel log. mmc0: Reset 0x1 never completed. mmc0: sdhci: ============ SDHCI REGISTER DUMP =========== mmc0: sdhci: Sys addr: 0x00000000 | Version: 0x00001102 mmc0: sdhci: Blk size: 0x00004000 | Blk cnt: 0x00000000 mmc0: sdhci: Argument: 0x00000000 | Trn mode: 0x00000000 mmc0: sdhci: Present: 0x01f80000 | Host ctl: 0x00000000 mmc0: sdhci: Power: 0x00000000 | Blk gap: 0x00000000 mmc0: sdhci: Wake-up: 0x00000000 | Clock: 0x00000002 mmc0: sdhci: Timeout: 0x00000000 | Int stat: 0x00000000 mmc0: sdhci: Int enab: 0x00000000 | Sig enab: 0x00000000 mmc0: sdhci: AC12 err: 0x00000000 | Slot int: 0x00000000 mmc0: sdhci: Caps: 0x742dc8b2 | Caps_1: 0x00008007 mmc0: sdhci: Cmd: 0x00000000 | Max curr: 0x00000000 mmc0: sdhci: Resp[0]: 0x00000000 | Resp[1]: 0x00000000 mmc0: sdhci: Resp[2]: 0x00000000 | Resp[3]: 0x00000000 mmc0: sdhci: Host ctl2: 0x00000000 mmc0: sdhci: ============================================ Add support for the additional calibration clocks to allow these platforms to be configured appropriately. Cc: Venkat Gopalakrishnan Cc: Ritesh Harjani Signed-off-by: Bjorn Andersson Acked-by: Rob Herring Tested-by: Jeremy McNicoll Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/sdhci-msm.txt | 2 ++ drivers/mmc/host/sdhci-msm.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mmc/sdhci-msm.txt b/Documentation/devicetree/bindings/mmc/sdhci-msm.txt index 0576264eab5e..bfdcdc4ccdff 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-msm.txt +++ b/Documentation/devicetree/bindings/mmc/sdhci-msm.txt @@ -18,6 +18,8 @@ Required properties: "core" - SDC MMC clock (MCLK) (required) "bus" - SDCC bus voter clock (optional) "xo" - TCXO clock (optional) + "cal" - reference clock for RCLK delay calibration (optional) + "sleep" - sleep clock for RCLK delay calibration (optional) Example: diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 526e19eced2f..33919d2b35d1 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -129,7 +129,7 @@ struct sdhci_msm_host { int pwr_irq; /* power irq */ struct clk *bus_clk; /* SDHC bus voter clock */ struct clk *xo_clk; /* TCXO clk needed for FLL feature of cm_dll*/ - struct clk_bulk_data bulk_clks[2]; /* core, iface clocks */ + struct clk_bulk_data bulk_clks[4]; /* core, iface, cal, sleep clocks */ unsigned long clk_rate; struct mmc_host *mmc; bool use_14lpp_dll_reset; @@ -1184,6 +1184,16 @@ static int sdhci_msm_probe(struct platform_device *pdev) if (ret) dev_warn(&pdev->dev, "core clock boost failed\n"); + clk = devm_clk_get(&pdev->dev, "cal"); + if (IS_ERR(clk)) + clk = NULL; + msm_host->bulk_clks[2].clk = clk; + + clk = devm_clk_get(&pdev->dev, "sleep"); + if (IS_ERR(clk)) + clk = NULL; + msm_host->bulk_clks[3].clk = clk; + ret = clk_bulk_prepare_enable(ARRAY_SIZE(msm_host->bulk_clks), msm_host->bulk_clks); if (ret) From 79ea73b05aaa83b7451a3aee48a5e835fb0f3864 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 15 Sep 2017 20:13:28 +0200 Subject: [PATCH 0676/1085] mmc: sdhci-pci: remove outdated declaration The function was removed half a year ago, so this declaration can go, too. 
Fixes: 51ced59cc02e0d ("mmc: sdhci-pci: Use ACPI DSM to get driver strength for some Intel devices") Signed-off-by: Wolfram Sang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/sdhci-pci-data.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/mmc/sdhci-pci-data.h b/include/linux/mmc/sdhci-pci-data.h index fda15b6d4135..618f90d6e1ba 100644 --- a/include/linux/mmc/sdhci-pci-data.h +++ b/include/linux/mmc/sdhci-pci-data.h @@ -14,7 +14,4 @@ struct sdhci_pci_data { extern struct sdhci_pci_data *(*sdhci_pci_get_data)(struct pci_dev *pdev, int slotno); - -extern int sdhci_pci_spt_drive_strength; - #endif From cdaba732ff2fde83685bce889eddc70964404381 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 18 Sep 2017 15:17:05 +0300 Subject: [PATCH 0677/1085] mmc: sdhci-pci: Add support for Intel CDF Add PCI Id for Intel CDF. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-core.c | 1 + drivers/mmc/host/sdhci-pci.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index 67d787fa3306..e854ee5d7f4a 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -1290,6 +1290,7 @@ static const struct pci_device_id pci_ids[] = { SDHCI_PCI_DEVICE(INTEL, SPT_SDIO, intel_byt_sdio), SDHCI_PCI_DEVICE(INTEL, SPT_SD, intel_byt_sd), SDHCI_PCI_DEVICE(INTEL, DNV_EMMC, intel_byt_emmc), + SDHCI_PCI_DEVICE(INTEL, CDF_EMMC, intel_glk_emmc), SDHCI_PCI_DEVICE(INTEL, BXT_EMMC, intel_byt_emmc), SDHCI_PCI_DEVICE(INTEL, BXT_SDIO, intel_byt_sdio), SDHCI_PCI_DEVICE(INTEL, BXT_SD, intel_byt_sd), diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h index 75196a2b5289..3c1dd79fdbde 100644 --- a/drivers/mmc/host/sdhci-pci.h +++ b/drivers/mmc/host/sdhci-pci.h @@ -25,6 +25,7 @@ #define PCI_DEVICE_ID_INTEL_SPT_SDIO 0x9d2c #define PCI_DEVICE_ID_INTEL_SPT_SD 0x9d2d #define PCI_DEVICE_ID_INTEL_DNV_EMMC 0x19db +#define PCI_DEVICE_ID_INTEL_CDF_EMMC 0x18db #define PCI_DEVICE_ID_INTEL_BXT_SD 0x0aca #define PCI_DEVICE_ID_INTEL_BXT_EMMC 0x0acc #define PCI_DEVICE_ID_INTEL_BXT_SDIO 0x0ad0 From dd3f6983b4a468efca9e8caa0e2b4aa20946d801 Mon Sep 17 00:00:00 2001 From: yangbo lu Date: Thu, 21 Sep 2017 16:43:31 +0800 Subject: [PATCH 0678/1085] mmc: sdhci-of-esdhc: disable SD clock for clock value 0 SD clock should be disabled for clock value 0. It's not right to just return. This may cause failure of signal voltage switching. 
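The core drives this through set_ios(): during a UHS signal-voltage switch it asks the host for a 0 Hz clock, expects SDCLK to actually stop, and only then switches VCCQ. A minimal sketch of that sequence (paraphrasing the core's voltage-switch path; "clock" is a local holding the saved rate, and this is illustrative, not the verbatim core source):

	/* Gate SDCLK before switching signal voltage */
	host->ios.clock = 0;
	mmc_set_ios(host);	/* the host's set_clock callback sees clock == 0 */

	/* ...switch VCCQ to 1.8 V here... */
	mmc_delay(10);		/* spec wants SDCLK stopped for at least 5 ms */

	/* Restore the clock; the card then samples CMD/DAT driven high */
	host->ios.clock = clock;
	mmc_set_ios(host);

If the host's set_clock simply returns for clock == 0, SDCLK keeps running through this window and the voltage switch can fail, which is the failure this patch addresses.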
Signed-off-by: Yangbo Lu Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-esdhc.c | 58 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index d96a057a7db8..1f424374bbbb 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -458,6 +458,33 @@ static unsigned int esdhc_of_get_min_clock(struct sdhci_host *host) return clock / 256 / 16; } +static void esdhc_clock_enable(struct sdhci_host *host, bool enable) +{ + u32 val; + ktime_t timeout; + + val = sdhci_readl(host, ESDHC_SYSTEM_CONTROL); + + if (enable) + val |= ESDHC_CLOCK_SDCLKEN; + else + val &= ~ESDHC_CLOCK_SDCLKEN; + + sdhci_writel(host, val, ESDHC_SYSTEM_CONTROL); + + /* Wait max 20 ms */ + timeout = ktime_add_ms(ktime_get(), 20); + val = ESDHC_CLOCK_STABLE; + while (!(sdhci_readl(host, ESDHC_PRSSTAT) & val)) { + if (ktime_after(ktime_get(), timeout)) { + pr_err("%s: Internal clock never stabilised.\n", + mmc_hostname(host->mmc)); + break; + } + udelay(10); + } +} + static void esdhc_of_set_clock(struct sdhci_host *host, unsigned int clock) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -469,8 +496,10 @@ static void esdhc_of_set_clock(struct sdhci_host *host, unsigned int clock) host->mmc->actual_clock = 0; - if (clock == 0) + if (clock == 0) { + esdhc_clock_enable(host, false); return; + } /* Workaround to start pre_div at 2 for VNN < VENDOR_V_23 */ if (esdhc->vendor_ver < VENDOR_V_23) @@ -558,33 +587,6 @@ static void esdhc_pltfm_set_bus_width(struct sdhci_host *host, int width) sdhci_writel(host, ctrl, ESDHC_PROCTL); } -static void esdhc_clock_enable(struct sdhci_host *host, bool enable) -{ - u32 val; - ktime_t timeout; - - val = sdhci_readl(host, ESDHC_SYSTEM_CONTROL); - - if (enable) - val |= ESDHC_CLOCK_SDCLKEN; - else - val &= ~ESDHC_CLOCK_SDCLKEN; - - sdhci_writel(host, val, ESDHC_SYSTEM_CONTROL); - - /* Wait max 20 ms */ - timeout = ktime_add_ms(ktime_get(), 20); - val = ESDHC_CLOCK_STABLE; - while (!(sdhci_readl(host, ESDHC_PRSSTAT) & val)) { - if (ktime_after(ktime_get(), timeout)) { - pr_err("%s: Internal clock never stabilised.\n", - mmc_hostname(host->mmc)); - break; - } - udelay(10); - } -} - static void esdhc_reset(struct sdhci_host *host, u8 mask) { sdhci_reset(host, mask); From 97548575bef38abd06690a5a6f6816200c7e77f7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 20 Sep 2017 10:02:00 +0200 Subject: [PATCH 0679/1085] mmc: block: Convert RPMB to a character device The RPMB partition on the eMMC devices is a special area used for storing cryptographically safe information signed by a special secret key. To write and read records from this special area, authentication is needed. The RPMB area is *only* and *exclusively* accessed using ioctl():s from userspace. It is not really a block device, as blocks cannot be read or written from the device, also the signed chunks that can be stored on the RPMB are actually 256 bytes, not 512 making a block device a real bad fit. Currently the RPMB partition spawns a separate block device named /dev/mmcblkNrpmb for each device with an RPMB partition, including the creation of a block queue with its own kernel thread and all overhead associated with this. On the Ux500 HREFv60 platform, for example, the two eMMCs means that two block queues with separate threads are created for no use whatsoever. I have concluded that this block device design for RPMB is actually pretty wrong. 
The RPMB area should have been designed to be accessed from /dev/mmcblkN directly, using ioctl()s on the main block device. It is however way too late to change that, since userspace expects to open an RPMB device in /dev/mmcblkNrpmb and we cannot break userspace. This patch tries to amend the situation using the following strategy: - Stop creating a block device for the RPMB partition/area - Instead create a custom, dynamic character device with the same name. - Make this new character device support exactly the same set of ioctl()s as the old block device. - Wrap the requests back to the same ioctl() handlers, but issue them on the block queue of the main partition/area, i.e. /dev/mmcblkN We need to create a special "rpmb" bus type in order to get udev and/or busybox hot/coldplug to instantiate the device node properly. Before the patch, this appears in 'ps aux': 101 root 0:00 [mmcqd/2rpmb] 123 root 0:00 [mmcqd/3rpmb] After applying the patch these surplus block queue threads are gone, but RPMB is as usable as ever using the userspace MMC tools, such as 'mmc rpmb read-counter'. Instead we get these dynamic devices in /dev: brw-rw---- 1 root root 179, 0 Jan 1 2000 mmcblk0 brw-rw---- 1 root root 179, 1 Jan 1 2000 mmcblk0p1 brw-rw---- 1 root root 179, 2 Jan 1 2000 mmcblk0p2 brw-rw---- 1 root root 179, 5 Jan 1 2000 mmcblk0p5 brw-rw---- 1 root root 179, 8 Jan 1 2000 mmcblk2 brw-rw---- 1 root root 179, 16 Jan 1 2000 mmcblk2boot0 brw-rw---- 1 root root 179, 24 Jan 1 2000 mmcblk2boot1 crw-rw---- 1 root root 248, 0 Jan 1 2000 mmcblk2rpmb brw-rw---- 1 root root 179, 32 Jan 1 2000 mmcblk3 brw-rw---- 1 root root 179, 40 Jan 1 2000 mmcblk3boot0 brw-rw---- 1 root root 179, 48 Jan 1 2000 mmcblk3boot1 brw-rw---- 1 root root 179, 33 Jan 1 2000 mmcblk3p1 crw-rw---- 1 root root 248, 1 Jan 1 2000 mmcblk3rpmb Notice the (248,0) and (248,1) character devices for RPMB. Cc: Tomas Winkler Signed-off-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 283 ++++++++++++++++++++++++++++++++++++--- drivers/mmc/core/queue.h | 2 + 2 files changed, 263 insertions(+), 22 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 2ad7b5c69156..f75932776968 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ static int max_devices; #define MAX_DEVICES 256 static DEFINE_IDA(mmc_blk_ida); +static DEFINE_IDA(mmc_rpmb_ida); /* * There is one mmc_blk_data per slot.
@@ -96,6 +98,7 @@ struct mmc_blk_data { struct gendisk *disk; struct mmc_queue queue; struct list_head part; + struct list_head rpmbs; unsigned int flags; #define MMC_BLK_CMD23 (1 << 0) /* Can do SET_BLOCK_COUNT for multiblock */ @@ -121,6 +124,32 @@ struct mmc_blk_data { int area_type; }; +/* Device type for RPMB character devices */ +static dev_t mmc_rpmb_devt; + +/* Bus type for RPMB character devices */ +static struct bus_type mmc_rpmb_bus_type = { + .name = "mmc_rpmb", +}; + +/** + * struct mmc_rpmb_data - special RPMB device type for these areas + * @dev: the device for the RPMB area + * @chrdev: character device for the RPMB area + * @id: unique device ID number + * @part_index: partition index (0 on first) + * @md: parent MMC block device + * @node: list item, so we can put this device on a list + */ +struct mmc_rpmb_data { + struct device dev; + struct cdev chrdev; + int id; + unsigned int part_index; + struct mmc_blk_data *md; + struct list_head node; +}; + static DEFINE_MUTEX(open_lock); module_param(perdev_minors, int, 0444); @@ -299,6 +328,7 @@ struct mmc_blk_ioc_data { struct mmc_ioc_cmd ic; unsigned char *buf; u64 buf_bytes; + struct mmc_rpmb_data *rpmb; }; static struct mmc_blk_ioc_data *mmc_blk_ioctl_copy_from_user( @@ -437,14 +467,25 @@ static int __mmc_blk_ioctl_cmd(struct mmc_card *card, struct mmc_blk_data *md, struct mmc_request mrq = {}; struct scatterlist sg; int err; - bool is_rpmb = false; + unsigned int target_part; u32 status = 0; if (!card || !md || !idata) return -EINVAL; - if (md->area_type & MMC_BLK_DATA_AREA_RPMB) - is_rpmb = true; + /* + * The RPMB accesses comes in from the character device, so we + * need to target these explicitly. Else we just target the + * partition type for the block device the ioctl() was issued + * on. + */ + if (idata->rpmb) { + /* Support multiple RPMB partitions */ + target_part = idata->rpmb->part_index; + target_part |= EXT_CSD_PART_CONFIG_ACC_RPMB; + } else { + target_part = md->part_type; + } cmd.opcode = idata->ic.opcode; cmd.arg = idata->ic.arg; @@ -488,7 +529,7 @@ static int __mmc_blk_ioctl_cmd(struct mmc_card *card, struct mmc_blk_data *md, mrq.cmd = &cmd; - err = mmc_blk_part_switch(card, md->part_type); + err = mmc_blk_part_switch(card, target_part); if (err) return err; @@ -498,7 +539,7 @@ static int __mmc_blk_ioctl_cmd(struct mmc_card *card, struct mmc_blk_data *md, return err; } - if (is_rpmb) { + if (idata->rpmb) { err = mmc_set_blockcount(card, data.blocks, idata->ic.write_flag & (1 << 31)); if (err) @@ -538,7 +579,7 @@ static int __mmc_blk_ioctl_cmd(struct mmc_card *card, struct mmc_blk_data *md, memcpy(&(idata->ic.response), cmd.resp, sizeof(cmd.resp)); - if (is_rpmb) { + if (idata->rpmb) { /* * Ensure RPMB command has completed by polling CMD13 * "Send Status". @@ -554,7 +595,8 @@ static int __mmc_blk_ioctl_cmd(struct mmc_card *card, struct mmc_blk_data *md, } static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, - struct mmc_ioc_cmd __user *ic_ptr) + struct mmc_ioc_cmd __user *ic_ptr, + struct mmc_rpmb_data *rpmb) { struct mmc_blk_ioc_data *idata; struct mmc_blk_ioc_data *idatas[1]; @@ -566,6 +608,8 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, idata = mmc_blk_ioctl_copy_from_user(ic_ptr); if (IS_ERR(idata)) return PTR_ERR(idata); + /* This will be NULL on non-RPMB ioctl():s */ + idata->rpmb = rpmb; card = md->queue.card; if (IS_ERR(card)) { @@ -581,7 +625,8 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, idata->ic.write_flag ? 
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM); idatas[0] = idata; - req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_IOCTL; + req_to_mmc_queue_req(req)->drv_op = + rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idatas; req_to_mmc_queue_req(req)->ioc_count = 1; blk_execute_rq(mq->queue, NULL, req, 0); @@ -596,7 +641,8 @@ cmd_done: } static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, - struct mmc_ioc_multi_cmd __user *user) + struct mmc_ioc_multi_cmd __user *user, + struct mmc_rpmb_data *rpmb) { struct mmc_blk_ioc_data **idata = NULL; struct mmc_ioc_cmd __user *cmds = user->cmds; @@ -627,6 +673,8 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, num_of_cmds = i; goto cmd_err; } + /* This will be NULL on non-RPMB ioctl():s */ + idata[i]->rpmb = rpmb; } card = md->queue.card; @@ -643,7 +691,8 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, req = blk_get_request(mq->queue, idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM); - req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_IOCTL; + req_to_mmc_queue_req(req)->drv_op = + rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idata; req_to_mmc_queue_req(req)->ioc_count = num_of_cmds; blk_execute_rq(mq->queue, NULL, req, 0); @@ -691,7 +740,8 @@ static int mmc_blk_ioctl(struct block_device *bdev, fmode_t mode, if (!md) return -EINVAL; ret = mmc_blk_ioctl_cmd(md, - (struct mmc_ioc_cmd __user *)arg); + (struct mmc_ioc_cmd __user *)arg, + NULL); mmc_blk_put(md); return ret; case MMC_IOC_MULTI_CMD: @@ -702,7 +752,8 @@ static int mmc_blk_ioctl(struct block_device *bdev, fmode_t mode, if (!md) return -EINVAL; ret = mmc_blk_ioctl_multi_cmd(md, - (struct mmc_ioc_multi_cmd __user *)arg); + (struct mmc_ioc_multi_cmd __user *)arg, + NULL); mmc_blk_put(md); return ret; default: @@ -1174,17 +1225,19 @@ static void mmc_blk_issue_drv_op(struct mmc_queue *mq, struct request *req) struct mmc_queue_req *mq_rq; struct mmc_card *card = mq->card; struct mmc_blk_data *md = mq->blkdata; - struct mmc_blk_data *main_md = dev_get_drvdata(&card->dev); struct mmc_blk_ioc_data **idata; + bool rpmb_ioctl; u8 **ext_csd; u32 status; int ret; int i; mq_rq = req_to_mmc_queue_req(req); + rpmb_ioctl = (mq_rq->drv_op == MMC_DRV_OP_IOCTL_RPMB); switch (mq_rq->drv_op) { case MMC_DRV_OP_IOCTL: + case MMC_DRV_OP_IOCTL_RPMB: idata = mq_rq->drv_op_data; for (i = 0, ret = 0; i < mq_rq->ioc_count; i++) { ret = __mmc_blk_ioctl_cmd(card, md, idata[i]); @@ -1192,8 +1245,8 @@ static void mmc_blk_issue_drv_op(struct mmc_queue *mq, struct request *req) break; } /* Always switch back to main area after RPMB access */ - if (md->area_type & MMC_BLK_DATA_AREA_RPMB) - mmc_blk_part_switch(card, main_md->part_type); + if (rpmb_ioctl) + mmc_blk_part_switch(card, 0); break; case MMC_DRV_OP_BOOT_WP: ret = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, EXT_CSD_BOOT_WP, @@ -2068,6 +2121,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, spin_lock_init(&md->lock); INIT_LIST_HEAD(&md->part); + INIT_LIST_HEAD(&md->rpmbs); md->usage = 1; ret = mmc_init_queue(&md->queue, card, &md->lock, subname); @@ -2186,6 +2240,154 @@ static int mmc_blk_alloc_part(struct mmc_card *card, return 0; } +/** + * mmc_rpmb_ioctl() - ioctl handler for the RPMB chardev + * @filp: the character device file + * @cmd: the ioctl() command + * @arg: the argument from userspace + * + * This will essentially just redirect the ioctl()s coming in over to + * the main block device spawning the RPMB 
character device. + */ +static long mmc_rpmb_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mmc_rpmb_data *rpmb = filp->private_data; + int ret; + + switch (cmd) { + case MMC_IOC_CMD: + ret = mmc_blk_ioctl_cmd(rpmb->md, + (struct mmc_ioc_cmd __user *)arg, + rpmb); + break; + case MMC_IOC_MULTI_CMD: + ret = mmc_blk_ioctl_multi_cmd(rpmb->md, + (struct mmc_ioc_multi_cmd __user *)arg, + rpmb); + break; + default: + ret = -EINVAL; + break; + } + + return 0; +} + +#ifdef CONFIG_COMPAT +static long mmc_rpmb_ioctl_compat(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + return mmc_rpmb_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static int mmc_rpmb_chrdev_open(struct inode *inode, struct file *filp) +{ + struct mmc_rpmb_data *rpmb = container_of(inode->i_cdev, + struct mmc_rpmb_data, chrdev); + + get_device(&rpmb->dev); + filp->private_data = rpmb; + mutex_lock(&open_lock); + rpmb->md->usage++; + mutex_unlock(&open_lock); + + return nonseekable_open(inode, filp); +} + +static int mmc_rpmb_chrdev_release(struct inode *inode, struct file *filp) +{ + struct mmc_rpmb_data *rpmb = container_of(inode->i_cdev, + struct mmc_rpmb_data, chrdev); + + put_device(&rpmb->dev); + mutex_lock(&open_lock); + rpmb->md->usage--; + mutex_unlock(&open_lock); + + return 0; +} + +static const struct file_operations mmc_rpmb_fileops = { + .release = mmc_rpmb_chrdev_release, + .open = mmc_rpmb_chrdev_open, + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = mmc_rpmb_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = mmc_rpmb_ioctl_compat, +#endif +}; + + +static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, + struct mmc_blk_data *md, + unsigned int part_index, + sector_t size, + const char *subname) +{ + int devidx, ret; + char rpmb_name[DISK_NAME_LEN]; + char cap_str[10]; + struct mmc_rpmb_data *rpmb; + + /* This creates the minor number for the RPMB char device */ + devidx = ida_simple_get(&mmc_rpmb_ida, 0, max_devices, GFP_KERNEL); + if (devidx < 0) + return devidx; + + rpmb = kzalloc(sizeof(*rpmb), GFP_KERNEL); + if (!rpmb) + return -ENOMEM; + + snprintf(rpmb_name, sizeof(rpmb_name), + "mmcblk%u%s", card->host->index, subname ? subname : ""); + + rpmb->id = devidx; + rpmb->part_index = part_index; + rpmb->dev.init_name = rpmb_name; + rpmb->dev.bus = &mmc_rpmb_bus_type; + rpmb->dev.devt = MKDEV(MAJOR(mmc_rpmb_devt), rpmb->id); + rpmb->dev.parent = &card->dev; + device_initialize(&rpmb->dev); + dev_set_drvdata(&rpmb->dev, rpmb); + rpmb->md = md; + + cdev_init(&rpmb->chrdev, &mmc_rpmb_fileops); + rpmb->chrdev.owner = THIS_MODULE; + ret = cdev_device_add(&rpmb->chrdev, &rpmb->dev); + if (ret) { + pr_err("%s: could not add character device\n", rpmb_name); + goto out_remove_ida; + } + + list_add(&rpmb->node, &md->rpmbs); + + string_get_size((u64)size, 512, STRING_UNITS_2, + cap_str, sizeof(cap_str)); + + pr_info("%s: %s %s partition %u %s, chardev (%d:%d)\n", + rpmb_name, mmc_card_id(card), + mmc_card_name(card), EXT_CSD_PART_CONFIG_ACC_RPMB, cap_str, + MAJOR(mmc_rpmb_devt), rpmb->id); + + return 0; + +out_remove_ida: + ida_simple_remove(&mmc_rpmb_ida, rpmb->id); + kfree(rpmb); + return ret; +} + +static void mmc_blk_remove_rpmb_part(struct mmc_rpmb_data *rpmb) +{ + cdev_device_del(&rpmb->chrdev, &rpmb->dev); + device_del(&rpmb->dev); + ida_simple_remove(&mmc_rpmb_ida, rpmb->id); + kfree(rpmb); +} + /* MMC Physical partitions consist of two boot partitions and * up to four general purpose partitions. 
* For each partition enabled in EXT_CSD a block device will be allocatedi @@ -2194,13 +2396,26 @@ static int mmc_blk_alloc_part(struct mmc_card *card, static int mmc_blk_alloc_parts(struct mmc_card *card, struct mmc_blk_data *md) { - int idx, ret = 0; + int idx, ret; if (!mmc_card_mmc(card)) return 0; for (idx = 0; idx < card->nr_parts; idx++) { - if (card->part[idx].size) { + if (card->part[idx].area_type & MMC_BLK_DATA_AREA_RPMB) { + /* + * RPMB partitions does not provide block access, they + * are only accessed using ioctl():s. Thus create + * special RPMB block devices that do not have a + * backing block queue for these. + */ + ret = mmc_blk_alloc_rpmb_part(card, md, + card->part[idx].part_cfg, + card->part[idx].size >> 9, + card->part[idx].name); + if (ret) + return ret; + } else if (card->part[idx].size) { ret = mmc_blk_alloc_part(card, md, card->part[idx].part_cfg, card->part[idx].size >> 9, @@ -2212,7 +2427,7 @@ static int mmc_blk_alloc_parts(struct mmc_card *card, struct mmc_blk_data *md) } } - return ret; + return 0; } static void mmc_blk_remove_req(struct mmc_blk_data *md) @@ -2249,7 +2464,15 @@ static void mmc_blk_remove_parts(struct mmc_card *card, { struct list_head *pos, *q; struct mmc_blk_data *part_md; + struct mmc_rpmb_data *rpmb; + /* Remove RPMB partitions */ + list_for_each_safe(pos, q, &md->rpmbs) { + rpmb = list_entry(pos, struct mmc_rpmb_data, node); + list_del(pos); + mmc_blk_remove_rpmb_part(rpmb); + } + /* Remove block partitions */ list_for_each_safe(pos, q, &md->part) { part_md = list_entry(pos, struct mmc_blk_data, part); list_del(pos); @@ -2568,6 +2791,17 @@ static int __init mmc_blk_init(void) { int res; + res = bus_register(&mmc_rpmb_bus_type); + if (res < 0) { + pr_err("mmcblk: could not register RPMB bus type\n"); + return res; + } + res = alloc_chrdev_region(&mmc_rpmb_devt, 0, MAX_DEVICES, "rpmb"); + if (res < 0) { + pr_err("mmcblk: failed to allocate rpmb chrdev region\n"); + goto out_bus_unreg; + } + if (perdev_minors != CONFIG_MMC_BLOCK_MINORS) pr_info("mmcblk: using %d minors per device\n", perdev_minors); @@ -2575,16 +2809,20 @@ static int __init mmc_blk_init(void) res = register_blkdev(MMC_BLOCK_MAJOR, "mmc"); if (res) - goto out; + goto out_chrdev_unreg; res = mmc_register_driver(&mmc_driver); if (res) - goto out2; + goto out_blkdev_unreg; return 0; - out2: + +out_blkdev_unreg: unregister_blkdev(MMC_BLOCK_MAJOR, "mmc"); - out: +out_chrdev_unreg: + unregister_chrdev_region(mmc_rpmb_devt, MAX_DEVICES); +out_bus_unreg: + bus_unregister(&mmc_rpmb_bus_type); return res; } @@ -2592,6 +2830,7 @@ static void __exit mmc_blk_exit(void) { mmc_unregister_driver(&mmc_driver); unregister_blkdev(MMC_BLOCK_MAJOR, "mmc"); + unregister_chrdev_region(mmc_rpmb_devt, MAX_DEVICES); } module_init(mmc_blk_init); diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index f18d3f656baa..5807c03dddcf 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -35,12 +35,14 @@ struct mmc_blk_request { /** * enum mmc_drv_op - enumerates the operations in the mmc_queue_req * @MMC_DRV_OP_IOCTL: ioctl operation + * @MMC_DRV_OP_IOCTL_RPMB: RPMB-oriented ioctl operation * @MMC_DRV_OP_BOOT_WP: write protect boot partitions * @MMC_DRV_OP_GET_CARD_STATUS: get card status * @MMC_DRV_OP_GET_EXT_CSD: get the EXT CSD from an eMMC card */ enum mmc_drv_op { MMC_DRV_OP_IOCTL, + MMC_DRV_OP_IOCTL_RPMB, MMC_DRV_OP_BOOT_WP, MMC_DRV_OP_GET_CARD_STATUS, MMC_DRV_OP_GET_EXT_CSD, From 14f4ca7e4d2825f9f71e22905ae177b899959f1d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: 
Wed, 20 Sep 2017 10:02:01 +0200 Subject: [PATCH 0680/1085] mmc: block: Delete mmc_access_rpmb() This function is used by the block layer queue to bail out of requests if the current request is towards an RPMB "block device". This was done to avoid boot time scanning of this "block device" which was never really a block device, thus duct-taping over the fact that it was badly engineered. This problem is now gone as we removed the offending RPMB block device in another patch and replaced it with a character device. Cc: Tomas Winkler Signed-off-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 12 ------------ drivers/mmc/core/queue.c | 2 +- drivers/mmc/core/queue.h | 2 -- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index f75932776968..ab9c780df750 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1203,18 +1203,6 @@ static inline void mmc_blk_reset_success(struct mmc_blk_data *md, int type) md->reset_done &= ~type; } -int mmc_access_rpmb(struct mmc_queue *mq) -{ - struct mmc_blk_data *md = mq->blkdata; - /* - * If this is a RPMB partition access, return ture - */ - if (md && md->part_type == EXT_CSD_PART_CONFIG_ACC_RPMB) - return true; - - return false; -} - /* * The non-block commands come back from the block layer after it queued it and * processed it with all other requests and then they get issued in this diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 0a4e77a5ba33..f74f9ef460cc 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -30,7 +30,7 @@ static int mmc_prep_request(struct request_queue *q, struct request *req) { struct mmc_queue *mq = q->queuedata; - if (mq && (mmc_card_removed(mq->card) || mmc_access_rpmb(mq))) + if (mq && mmc_card_removed(mq->card)) return BLKPREP_KILL; req->rq_flags |= RQF_DONTPREP; diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 5807c03dddcf..68f68ecd94ea 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -83,6 +83,4 @@ extern void mmc_queue_resume(struct mmc_queue *); extern unsigned int mmc_queue_map_sg(struct mmc_queue *, struct mmc_queue_req *); -extern int mmc_access_rpmb(struct mmc_queue *); - #endif From 1c87f73578497a6c3cc77bcbfd2e5bf15fe753c7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 4 Oct 2017 11:10:07 +0200 Subject: [PATCH 0681/1085] mmc: block: Fix bug when removing RPMB chardev I forgot to account for the fact that the device core holds a reference to a device added with device_initialize() that needs to be released with a corresponding put_device() to reach a 0 refcount at the end of the lifecycle. This led to a NULL pointer dereference when freeing the device when e.g. unbinding the host device in sysfs. Fix this by using the device .release() callback to free the IDA and the memory used by the RPMB device.
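The underlying driver-model rule, as a minimal sketch (struct foo and foo_release are placeholders, not names from the patch): once device_initialize() has been called, the memory embedding the struct device may only be freed from its .release() callback, after the last reference is dropped:

	struct foo {
		struct device dev;
		/* ... */
	};

	static void foo_release(struct device *dev)
	{
		struct foo *f = container_of(dev, struct foo, dev);

		kfree(f);			/* safe: refcount reached zero */
	}

	f->dev.release = foo_release;
	device_initialize(&f->dev);		/* refcount starts at 1 */
	/* ... cdev_device_add()/cdev_device_del() during the lifetime ... */
	put_device(&f->dev);			/* drops the ref, may call foo_release() */

Calling kfree() directly on removal, as the previous code did, frees the object while the device core still holds its reference.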
Before this patch: /sys/bus/amba/drivers/mmci-pl18x$ echo 80114000.sdi4_per2 > unbind [ 29.797332] mmc3: card 0001 removed [ 29.810791] Unable to handle kernel NULL pointer dereference at virtual address 00000050 [ 29.818878] pgd = de70c000 [ 29.821624] [00000050] *pgd=1e70a831, *pte=00000000, *ppte=00000000 [ 29.827911] Internal error: Oops: 17 [#1] PREEMPT SMP ARM [ 29.833282] Modules linked in: [ 29.836334] CPU: 1 PID: 154 Comm: sh Not tainted 4.14.0-rc3-00039-g83318e309566-dirty #736 [ 29.844604] Hardware name: ST-Ericsson Ux5x0 platform (Device Tree Support) [ 29.851562] task: de572700 task.stack: de742000 [ 29.856079] PC is at kernfs_find_ns+0x8/0x100 [ 29.860443] LR is at kernfs_find_and_get_ns+0x30/0x48 After this patch: /sys/bus/amba/drivers/mmci-pl18x$ echo 80005000.sdi4_per2 > unbind [ 20.623382] mmc3: card 0001 removed Fixes: 97548575bef3 ("mmc: block: Convert RPMB to a character device") Reported-by: Adrian Hunter Signed-off-by: Linus Walleij Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index ab9c780df750..f11537a66a60 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2277,9 +2277,7 @@ static int mmc_rpmb_chrdev_open(struct inode *inode, struct file *filp) get_device(&rpmb->dev); filp->private_data = rpmb; - mutex_lock(&open_lock); - rpmb->md->usage++; - mutex_unlock(&open_lock); + mmc_blk_get(rpmb->md->disk); return nonseekable_open(inode, filp); } @@ -2290,9 +2288,7 @@ static int mmc_rpmb_chrdev_release(struct inode *inode, struct file *filp) struct mmc_rpmb_data, chrdev); put_device(&rpmb->dev); - mutex_lock(&open_lock); - rpmb->md->usage--; - mutex_unlock(&open_lock); + mmc_blk_put(rpmb->md); return 0; } @@ -2308,6 +2304,13 @@ static const struct file_operations mmc_rpmb_fileops = { #endif }; +static void mmc_blk_rpmb_device_release(struct device *dev) +{ + struct mmc_rpmb_data *rpmb = dev_get_drvdata(dev); + + ida_simple_remove(&mmc_rpmb_ida, rpmb->id); + kfree(rpmb); +} static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, struct mmc_blk_data *md, @@ -2326,8 +2329,10 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, return devidx; rpmb = kzalloc(sizeof(*rpmb), GFP_KERNEL); - if (!rpmb) + if (!rpmb) { + ida_simple_remove(&mmc_rpmb_ida, devidx); return -ENOMEM; + } snprintf(rpmb_name, sizeof(rpmb_name), "mmcblk%u%s", card->host->index, subname ? 
subname : ""); @@ -2338,6 +2343,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, rpmb->dev.bus = &mmc_rpmb_bus_type; rpmb->dev.devt = MKDEV(MAJOR(mmc_rpmb_devt), rpmb->id); rpmb->dev.parent = &card->dev; + rpmb->dev.release = mmc_blk_rpmb_device_release; device_initialize(&rpmb->dev); dev_set_drvdata(&rpmb->dev, rpmb); rpmb->md = md; @@ -2347,7 +2353,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, ret = cdev_device_add(&rpmb->chrdev, &rpmb->dev); if (ret) { pr_err("%s: could not add character device\n", rpmb_name); - goto out_remove_ida; + goto out_put_device; } list_add(&rpmb->node, &md->rpmbs); @@ -2362,18 +2368,16 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, return 0; -out_remove_ida: - ida_simple_remove(&mmc_rpmb_ida, rpmb->id); - kfree(rpmb); +out_put_device: + put_device(&rpmb->dev); return ret; } static void mmc_blk_remove_rpmb_part(struct mmc_rpmb_data *rpmb) + { cdev_device_del(&rpmb->chrdev, &rpmb->dev); - device_del(&rpmb->dev); - ida_simple_remove(&mmc_rpmb_ida, rpmb->id); - kfree(rpmb); + put_device(&rpmb->dev); } /* MMC Physical partitions consist of two boot partitions and From 9ca28c5cd967e07e708a407eec59894e2e643670 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:50 +0300 Subject: [PATCH 0682/1085] mmc: core: Remove unnecessary host claim Callers already have the host claimed, so remove the unnecessary calls to mmc_claim_host() and mmc_release_host(). Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc_ops.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 54686ca4bfb7..a6b0a232f24a 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -977,7 +977,6 @@ void mmc_start_bkops(struct mmc_card *card, bool from_exception) from_exception) return; - mmc_claim_host(card->host); if (card->ext_csd.raw_bkops_status >= EXT_CSD_BKOPS_LEVEL_2) { timeout = MMC_OPS_TIMEOUT_MS; use_busy_signal = true; @@ -995,7 +994,7 @@ void mmc_start_bkops(struct mmc_card *card, bool from_exception) pr_warn("%s: Error %d starting bkops\n", mmc_hostname(card->host), err); mmc_retune_release(card->host); - goto out; + return; } /* @@ -1007,8 +1006,6 @@ void mmc_start_bkops(struct mmc_card *card, bool from_exception) mmc_card_set_doing_bkops(card); else mmc_retune_release(card->host); -out: - mmc_release_host(card->host); } /* From 6c0cedd1ef9527ef13e66875746570e76a3188a7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:51 +0300 Subject: [PATCH 0683/1085] mmc: core: Introduce host claiming by context Currently the host can be claimed by a task. Change this so that the host can be claimed by a context that may or may not be a task. This provides for the host to be claimed by a block driver queue to support blk-mq, while maintaining compatibility with the existing use of mmc_claim_host(). 
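As a usage sketch (mmc_get_card()/mmc_put_card() carry the signatures introduced by this patch; the queue structure itself is illustrative, not from the patch): a block driver queue can claim the host with its own context, so any worker thread servicing that queue can nest claims that the old task-based check would have treated as a different owner:

	struct my_mq {				/* hypothetical blk-mq style queue */
		struct mmc_card *card;
		struct mmc_ctx ctx;		/* the queue's claimer identity */
	};

	static void my_mq_issue(struct my_mq *mq)
	{
		mmc_get_card(mq->card, &mq->ctx);	/* claim as the queue */
		/* ... issue the request from whichever thread runs this ... */
		mmc_put_card(mq->card, &mq->ctx);
	}

Existing callers pass NULL for the context and keep the old per-task behaviour.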
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 4 ++-- drivers/mmc/core/core.c | 48 +++++++++++++++++++++++++++++++------ drivers/mmc/core/core.h | 9 +++---- drivers/mmc/core/mmc.c | 4 ++-- drivers/mmc/core/sd.c | 4 ++-- drivers/mmc/core/sdio_irq.c | 3 ++- include/linux/mmc/host.h | 7 +++++- 7 files changed, 60 insertions(+), 19 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index f11537a66a60..9476312f081a 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1989,7 +1989,7 @@ void mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) if (req && !mq->qcnt) /* claim host only for the first request */ - mmc_get_card(card); + mmc_get_card(card, NULL); ret = mmc_blk_part_switch(card, md->part_type); if (ret) { @@ -2052,7 +2052,7 @@ void mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) out: if (!mq->qcnt) - mmc_put_card(card); + mmc_put_card(card, NULL); } static inline int mmc_blk_readonly(struct mmc_card *card) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 66c9cf49ad2f..b997cf92ce6c 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -832,9 +832,36 @@ unsigned int mmc_align_data_size(struct mmc_card *card, unsigned int sz) } EXPORT_SYMBOL(mmc_align_data_size); +/* + * Allow claiming an already claimed host if the context is the same or there is + * no context but the task is the same. + */ +static inline bool mmc_ctx_matches(struct mmc_host *host, struct mmc_ctx *ctx, + struct task_struct *task) +{ + return host->claimer == ctx || + (!ctx && task && host->claimer->task == task); +} + +static inline void mmc_ctx_set_claimer(struct mmc_host *host, + struct mmc_ctx *ctx, + struct task_struct *task) +{ + if (!host->claimer) { + if (ctx) + host->claimer = ctx; + else + host->claimer = &host->default_ctx; + } + if (task) + host->claimer->task = task; +} + /** * __mmc_claim_host - exclusively claim a host * @host: mmc host to claim + * @ctx: context that claims the host or NULL in which case the default + * context will be used * @abort: whether or not the operation should be aborted * * Claim a host for a set of operations. If @abort is non null and @@ -842,8 +869,10 @@ EXPORT_SYMBOL(mmc_align_data_size); * that non-zero value without acquiring the lock. Returns zero * with the lock held otherwise. */ -int __mmc_claim_host(struct mmc_host *host, atomic_t *abort) +int __mmc_claim_host(struct mmc_host *host, struct mmc_ctx *ctx, + atomic_t *abort) { + struct task_struct *task = ctx ? NULL : current; DECLARE_WAITQUEUE(wait, current); unsigned long flags; int stop; @@ -856,7 +885,7 @@ int __mmc_claim_host(struct mmc_host *host, atomic_t *abort) while (1) { set_current_state(TASK_UNINTERRUPTIBLE); stop = abort ? 
atomic_read(abort) : 0; - if (stop || !host->claimed || host->claimer == current) + if (stop || !host->claimed || mmc_ctx_matches(host, ctx, task)) break; spin_unlock_irqrestore(&host->lock, flags); schedule(); @@ -865,7 +894,7 @@ int __mmc_claim_host(struct mmc_host *host, atomic_t *abort) set_current_state(TASK_RUNNING); if (!stop) { host->claimed = 1; - host->claimer = current; + mmc_ctx_set_claimer(host, ctx, task); host->claim_cnt += 1; if (host->claim_cnt == 1) pm = true; @@ -900,6 +929,7 @@ void mmc_release_host(struct mmc_host *host) spin_unlock_irqrestore(&host->lock, flags); } else { host->claimed = 0; + host->claimer->task = NULL; host->claimer = NULL; spin_unlock_irqrestore(&host->lock, flags); wake_up(&host->wq); @@ -913,10 +943,10 @@ EXPORT_SYMBOL(mmc_release_host); * This is a helper function, which fetches a runtime pm reference for the * card device and also claims the host. */ -void mmc_get_card(struct mmc_card *card) +void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx) { pm_runtime_get_sync(&card->dev); - mmc_claim_host(card->host); + __mmc_claim_host(card->host, ctx, NULL); } EXPORT_SYMBOL(mmc_get_card); @@ -924,9 +954,13 @@ EXPORT_SYMBOL(mmc_get_card); * This is a helper function, which releases the host and drops the runtime * pm reference for the card device. */ -void mmc_put_card(struct mmc_card *card) +void mmc_put_card(struct mmc_card *card, struct mmc_ctx *ctx) { - mmc_release_host(card->host); + struct mmc_host *host = card->host; + + WARN_ON(ctx && host->claimer != ctx); + + mmc_release_host(host); pm_runtime_mark_last_busy(&card->dev); pm_runtime_put_autosuspend(&card->dev); } diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index ca861091a776..94675f88f704 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -128,10 +128,11 @@ int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen); int mmc_set_blockcount(struct mmc_card *card, unsigned int blockcount, bool is_rel_write); -int __mmc_claim_host(struct mmc_host *host, atomic_t *abort); +int __mmc_claim_host(struct mmc_host *host, struct mmc_ctx *ctx, + atomic_t *abort); void mmc_release_host(struct mmc_host *host); -void mmc_get_card(struct mmc_card *card); -void mmc_put_card(struct mmc_card *card); +void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx); +void mmc_put_card(struct mmc_card *card, struct mmc_ctx *ctx); /** * mmc_claim_host - exclusively claim a host @@ -141,7 +142,7 @@ void mmc_put_card(struct mmc_card *card); */ static inline void mmc_claim_host(struct mmc_host *host) { - __mmc_claim_host(host, NULL); + __mmc_claim_host(host, NULL, NULL); } #endif diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 36217ad5e9b1..8d77ddc85af1 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1911,14 +1911,14 @@ static void mmc_detect(struct mmc_host *host) { int err; - mmc_get_card(host->card); + mmc_get_card(host->card, NULL); /* * Just check if our card has been removed. */ err = _mmc_detect_card_removed(host); - mmc_put_card(host->card); + mmc_put_card(host->card, NULL); if (err) { mmc_remove(host); diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 4fd1620b732d..2036b2b4835c 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -1056,14 +1056,14 @@ static void mmc_sd_detect(struct mmc_host *host) { int err; - mmc_get_card(host->card); + mmc_get_card(host->card, NULL); /* * Just check if our card has been removed. 
*/ err = _mmc_detect_card_removed(host); - mmc_put_card(host->card); + mmc_put_card(host->card, NULL); if (err) { mmc_sd_remove(host); diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c index c771843e4c15..7a2eaf8410a3 100644 --- a/drivers/mmc/core/sdio_irq.c +++ b/drivers/mmc/core/sdio_irq.c @@ -155,7 +155,8 @@ static int sdio_irq_thread(void *_host) * holding of the host lock does not cover too much work * that doesn't require that lock to be held. */ - ret = __mmc_claim_host(host, &host->sdio_irq_thread_abort); + ret = __mmc_claim_host(host, NULL, + &host->sdio_irq_thread_abort); if (ret) break; ret = process_sdio_pending_irqs(host); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 9a43763a68ad..443f7a8cdfe5 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -255,6 +255,10 @@ struct mmc_supply { struct regulator *vqmmc; /* Optional Vccq supply */ }; +struct mmc_ctx { + struct task_struct *task; +}; + struct mmc_host { struct device *parent; struct device class_dev; @@ -388,8 +392,9 @@ struct mmc_host { struct mmc_card *card; /* device attached to this host */ wait_queue_head_t wq; - struct task_struct *claimer; /* task that has host claimed */ + struct mmc_ctx *claimer; /* context that has host claimed */ int claim_cnt; /* "claim" nesting count */ + struct mmc_ctx default_ctx; /* default context */ struct delayed_work detect; int detect_change; /* card detect flag */ From 72a5af554df837e373efb0d6c8fc68c568f9a7ac Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:52 +0300 Subject: [PATCH 0684/1085] mmc: core: Add support for handling CQE requests Add core support for handling CQE requests, including starting, completing and recovering. Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 163 +++++++++++++++++++++++++++++++++++++-- drivers/mmc/core/core.h | 4 + include/linux/mmc/host.h | 2 + 3 files changed, 164 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b997cf92ce6c..2ff614d4ffac 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -266,7 +266,8 @@ static void __mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) host->ops->request(host, mrq); } -static void mmc_mrq_pr_debug(struct mmc_host *host, struct mmc_request *mrq) +static void mmc_mrq_pr_debug(struct mmc_host *host, struct mmc_request *mrq, + bool cqe) { if (mrq->sbc) { pr_debug("<%s: starting CMD%u arg %08x flags %08x>\n", @@ -275,9 +276,12 @@ static void mmc_mrq_pr_debug(struct mmc_host *host, struct mmc_request *mrq) } if (mrq->cmd) { - pr_debug("%s: starting CMD%u arg %08x flags %08x\n", - mmc_hostname(host), mrq->cmd->opcode, mrq->cmd->arg, - mrq->cmd->flags); + pr_debug("%s: starting %sCMD%u arg %08x flags %08x\n", + mmc_hostname(host), cqe ? "CQE direct " : "", + mrq->cmd->opcode, mrq->cmd->arg, mrq->cmd->flags); + } else if (cqe) { + pr_debug("%s: starting CQE transfer for tag %d blkaddr %u\n", + mmc_hostname(host), mrq->tag, mrq->data->blk_addr); } if (mrq->data) { @@ -342,7 +346,7 @@ static int mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) if (mmc_card_removed(host->card)) return -ENOMEDIUM; - mmc_mrq_pr_debug(host, mrq); + mmc_mrq_pr_debug(host, mrq, false); WARN_ON(!host->claimed); @@ -482,6 +486,155 @@ void mmc_wait_for_req_done(struct mmc_host *host, struct mmc_request *mrq) } EXPORT_SYMBOL(mmc_wait_for_req_done); +/* + * mmc_cqe_start_req - Start a CQE request. 
+ * @host: MMC host to start the request + * @mrq: request to start + * + * Start the request, re-tuning if needed and it is possible. Returns an error + * code if the request fails to start or -EBUSY if CQE is busy. + */ +int mmc_cqe_start_req(struct mmc_host *host, struct mmc_request *mrq) +{ + int err; + + /* + * CQE cannot process re-tuning commands. Caller must hold retuning + * while CQE is in use. Re-tuning can happen here only when CQE has no + * active requests i.e. this is the first. Note, re-tuning will call + * ->cqe_off(). + */ + err = mmc_retune(host); + if (err) + goto out_err; + + mrq->host = host; + + mmc_mrq_pr_debug(host, mrq, true); + + err = mmc_mrq_prep(host, mrq); + if (err) + goto out_err; + + err = host->cqe_ops->cqe_request(host, mrq); + if (err) + goto out_err; + + trace_mmc_request_start(host, mrq); + + return 0; + +out_err: + if (mrq->cmd) { + pr_debug("%s: failed to start CQE direct CMD%u, error %d\n", + mmc_hostname(host), mrq->cmd->opcode, err); + } else { + pr_debug("%s: failed to start CQE transfer for tag %d, error %d\n", + mmc_hostname(host), mrq->tag, err); + } + return err; +} +EXPORT_SYMBOL(mmc_cqe_start_req); + +/** + * mmc_cqe_request_done - CQE has finished processing an MMC request + * @host: MMC host which completed request + * @mrq: MMC request which completed + * + * CQE drivers should call this function when they have completed + * their processing of a request. + */ +void mmc_cqe_request_done(struct mmc_host *host, struct mmc_request *mrq) +{ + mmc_should_fail_request(host, mrq); + + /* Flag re-tuning needed on CRC errors */ + if ((mrq->cmd && mrq->cmd->error == -EILSEQ) || + (mrq->data && mrq->data->error == -EILSEQ)) + mmc_retune_needed(host); + + trace_mmc_request_done(host, mrq); + + if (mrq->cmd) { + pr_debug("%s: CQE req done (direct CMD%u): %d\n", + mmc_hostname(host), mrq->cmd->opcode, mrq->cmd->error); + } else { + pr_debug("%s: CQE transfer done tag %d\n", + mmc_hostname(host), mrq->tag); + } + + if (mrq->data) { + pr_debug("%s: %d bytes transferred: %d\n", + mmc_hostname(host), + mrq->data->bytes_xfered, mrq->data->error); + } + + mrq->done(mrq); +} +EXPORT_SYMBOL(mmc_cqe_request_done); + +/** + * mmc_cqe_post_req - CQE post process of a completed MMC request + * @host: MMC host + * @mrq: MMC request to be processed + */ +void mmc_cqe_post_req(struct mmc_host *host, struct mmc_request *mrq) +{ + if (host->cqe_ops->cqe_post_req) + host->cqe_ops->cqe_post_req(host, mrq); +} +EXPORT_SYMBOL(mmc_cqe_post_req); + +/* Arbitrary 1 second timeout */ +#define MMC_CQE_RECOVERY_TIMEOUT 1000 + +/* + * mmc_cqe_recovery - Recover from CQE errors. + * @host: MMC host to recover + * + * Recovery consists of stopping CQE, stopping eMMC, discarding the queue in + * in eMMC, and discarding the queue in CQE. CQE must call + * mmc_cqe_request_done() on all requests. An error is returned if the eMMC + * fails to discard its queue. + */ +int mmc_cqe_recovery(struct mmc_host *host) +{ + struct mmc_command cmd; + int err; + + mmc_retune_hold_now(host); + + /* + * Recovery is expected seldom, if at all, but it reduces performance, + * so make sure it is not completely silent. 
+ */ + pr_warn("%s: running CQE recovery\n", mmc_hostname(host)); + + host->cqe_ops->cqe_recovery_start(host); + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = MMC_STOP_TRANSMISSION, + cmd.flags = MMC_RSP_R1B | MMC_CMD_AC, + cmd.flags &= ~MMC_RSP_CRC; /* Ignore CRC */ + cmd.busy_timeout = MMC_CQE_RECOVERY_TIMEOUT, + mmc_wait_for_cmd(host, &cmd, 0); + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = MMC_CMDQ_TASK_MGMT; + cmd.arg = 1; /* Discard entire queue */ + cmd.flags = MMC_RSP_R1B | MMC_CMD_AC; + cmd.flags &= ~MMC_RSP_CRC; /* Ignore CRC */ + cmd.busy_timeout = MMC_CQE_RECOVERY_TIMEOUT, + err = mmc_wait_for_cmd(host, &cmd, 0); + + host->cqe_ops->cqe_recovery_finish(host); + + mmc_retune_release(host); + + return err; +} +EXPORT_SYMBOL(mmc_cqe_recovery); + /** * mmc_is_req_done - Determine if a 'cap_cmd_during_tfr' request is done * @host: MMC host diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 94675f88f704..ba5a8fea0dc2 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -145,4 +145,8 @@ static inline void mmc_claim_host(struct mmc_host *host) __mmc_claim_host(host, NULL, NULL); } +int mmc_cqe_start_req(struct mmc_host *host, struct mmc_request *mrq); +void mmc_cqe_post_req(struct mmc_host *host, struct mmc_request *mrq); +int mmc_cqe_recovery(struct mmc_host *host); + #endif diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 443f7a8cdfe5..c296f4351c1d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -474,6 +474,8 @@ void mmc_detect_change(struct mmc_host *, unsigned long delay); void mmc_request_done(struct mmc_host *, struct mmc_request *); void mmc_command_done(struct mmc_host *host, struct mmc_request *mrq); +void mmc_cqe_request_done(struct mmc_host *host, struct mmc_request *mrq); + static inline void mmc_signal_sdio_irq(struct mmc_host *host) { host->ops->enable_sdio_irq(host, 0); From 98d4f7809d99bbf456f93816ef9895616cdd1b2d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:53 +0300 Subject: [PATCH 0685/1085] mmc: mmc: Enable Command Queuing Enable the Command Queue if the host controller supports a command queue engine. It is not compatible with Packed Commands, so make a note of that in the comment. Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 8d77ddc85af1..8e11526c5092 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1785,6 +1785,23 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, } } + /* + * Enable Command Queue if supported. Note that Packed Commands cannot + * be used with Command Queue. + */ + card->ext_csd.cmdq_en = false; + if (card->ext_csd.cmdq_support && host->caps2 & MMC_CAP2_CQE) { + err = mmc_cmdq_enable(card); + if (err && err != -EBADMSG) + goto free_card; + if (err) { + pr_warn("%s: Enabling CMDQ failed\n", + mmc_hostname(card->host)); + card->ext_csd.cmdq_support = false; + card->ext_csd.cmdq_depth = 0; + err = 0; + } + } /* * In some cases (e.g. RPMB or mmc_test), the Command Queue must be * disabled for a time, so a flag is needed to indicate to re-enable the From f690f4409ddd79a481efddaf6e4cb65cf1a747cb Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:54 +0300 Subject: [PATCH 0686/1085] mmc: mmc: Enable CQE's Enable or disable CQE when a card is added or removed respectively. 
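From the host driver side the opt-in is small; a hedged sketch (only MMC_CAP2_CQE, host->cqe_ops and the callback names referenced by the core patches above are real, everything prefixed my_ is a placeholder):

	static const struct mmc_cqe_ops my_cqe_ops = {
		.cqe_enable		= my_cqe_enable,
		.cqe_disable		= my_cqe_disable,
		.cqe_off		= my_cqe_off,
		.cqe_request		= my_cqe_request,
		.cqe_post_req		= my_cqe_post_req,
		.cqe_recovery_start	= my_cqe_recovery_start,
		.cqe_recovery_finish	= my_cqe_recovery_finish,
	};

	/* in probe(): */
	host->cqe_ops = &my_cqe_ops;
	host->caps2 |= MMC_CAP2_CQE;	/* card init will then enable CMDQ on the
					 * card and call cqe_enable() on this host */

The driver signals completions back to the core with mmc_cqe_request_done().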
Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/bus.c | 7 +++++++ drivers/mmc/core/mmc.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 301246513a37..a4b49e25fe96 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -369,10 +369,17 @@ int mmc_add_card(struct mmc_card *card) */ void mmc_remove_card(struct mmc_card *card) { + struct mmc_host *host = card->host; + #ifdef CONFIG_DEBUG_FS mmc_remove_card_debugfs(card); #endif + if (host->cqe_enabled) { + host->cqe_ops->cqe_disable(host); + host->cqe_enabled = false; + } + if (mmc_card_present(card)) { if (mmc_host_is_spi(card->host)) { pr_info("%s: SPI card removed\n", diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 8e11526c5092..44d67ee8bf7b 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1809,6 +1809,18 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, */ card->reenable_cmdq = card->ext_csd.cmdq_en; + if (card->ext_csd.cmdq_en && !host->cqe_enabled) { + err = host->cqe_ops->cqe_enable(host, card); + if (err) { + pr_err("%s: Failed to enable CQE, error %d\n", + mmc_hostname(host), err); + } else { + host->cqe_enabled = true; + pr_info("%s: Command Queue Engine enabled\n", + mmc_hostname(host)); + } + } + if (!oldcard) host->card = card; From d3377c012f4d7a0a822e289f1effba997da3e295 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:55 +0300 Subject: [PATCH 0687/1085] mmc: block: Use local variables in mmc_blk_data_prep() Use local variables in mmc_blk_data_prep() in preparation for adding CQE support which doesn't use the output variables. Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 9476312f081a..bb5f70ec8015 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1575,21 +1575,22 @@ static enum mmc_blk_status mmc_blk_err_check(struct mmc_card *card, } static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, - int disable_multi, bool *do_rel_wr, - bool *do_data_tag) + int disable_multi, bool *do_rel_wr_p, + bool *do_data_tag_p) { struct mmc_blk_data *md = mq->blkdata; struct mmc_card *card = md->queue.card; struct mmc_blk_request *brq = &mqrq->brq; struct request *req = mmc_queue_req_to_req(mqrq); + bool do_rel_wr, do_data_tag; /* * Reliable writes are used to implement Forced Unit Access and * are supported only on MMCs. 
*/ - *do_rel_wr = (req->cmd_flags & REQ_FUA) && - rq_data_dir(req) == WRITE && - (md->flags & MMC_BLK_REL_WR); + do_rel_wr = (req->cmd_flags & REQ_FUA) && + rq_data_dir(req) == WRITE && + (md->flags & MMC_BLK_REL_WR); memset(brq, 0, sizeof(struct mmc_blk_request)); @@ -1637,18 +1638,18 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, brq->data.blocks); } - if (*do_rel_wr) + if (do_rel_wr) mmc_apply_rel_rw(brq, card, req); /* * Data tag is used only during writing meta data to speed * up write and any subsequent read of this meta data */ - *do_data_tag = card->ext_csd.data_tag_unit_size && - (req->cmd_flags & REQ_META) && - (rq_data_dir(req) == WRITE) && - ((brq->data.blocks * brq->data.blksz) >= - card->ext_csd.data_tag_unit_size); + do_data_tag = card->ext_csd.data_tag_unit_size && + (req->cmd_flags & REQ_META) && + (rq_data_dir(req) == WRITE) && + ((brq->data.blocks * brq->data.blksz) >= + card->ext_csd.data_tag_unit_size); mmc_set_data_timeout(&brq->data, card); @@ -1675,6 +1676,12 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, } mqrq->areq.mrq = &brq->mrq; + + if (do_rel_wr_p) + *do_rel_wr_p = do_rel_wr; + + if (do_data_tag_p) + *do_data_tag_p = do_data_tag; } static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq, From 93482b3d70c2120aadb0f1d1281a59199866e70a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:56 +0300 Subject: [PATCH 0688/1085] mmc: block: Prepare CQE data Enhance mmc_blk_data_prep() to support CQE requests. That means adding some things that for non-CQE requests would be encoded into the command arguments - such as the block address, reliable-write flag, and data tag flag. Also the request tag is needed to provide the command queue task id, and a comment is added to explain the future possibility of defining a priority. Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index bb5f70ec8015..ea80ff4cd7f9 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1595,6 +1595,7 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, memset(brq, 0, sizeof(struct mmc_blk_request)); brq->mrq.data = &brq->data; + brq->mrq.tag = req->tag; brq->stop.opcode = MMC_STOP_TRANSMISSION; brq->stop.arg = 0; @@ -1609,6 +1610,14 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, brq->data.blksz = 512; brq->data.blocks = blk_rq_sectors(req); + brq->data.blk_addr = blk_rq_pos(req); + + /* + * The command queue supports 2 priorities: "high" (1) and "simple" (0). + * The eMMC will give "high" priority tasks priority over "simple" + * priority tasks. Here we always set "simple" priority by not setting + * MMC_DATA_PRIO. 
+ */ /* * The block layer doesn't support all sector count @@ -1638,8 +1647,10 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, brq->data.blocks); } - if (do_rel_wr) + if (do_rel_wr) { mmc_apply_rel_rw(brq, card, req); + brq->data.flags |= MMC_DATA_REL_WR; + } /* * Data tag is used only during writing meta data to speed @@ -1651,6 +1662,9 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, ((brq->data.blocks * brq->data.blksz) >= card->ext_csd.data_tag_unit_size); + if (do_data_tag) + brq->data.flags |= MMC_DATA_DAT_TAG; + mmc_set_data_timeout(&brq->data, card); brq->data.sg = mqrq->sg; From c8b5fd031a3004dc382e201f69ea9a44ec62c04f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:57 +0300 Subject: [PATCH 0689/1085] mmc: block: Factor out mmc_setup_queue() Factor out some common code that will also be used with blk-mq. Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/queue.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index f74f9ef460cc..4f33d277b125 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -177,6 +177,29 @@ static void mmc_exit_request(struct request_queue *q, struct request *req) mq_rq->sg = NULL; } +static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) +{ + struct mmc_host *host = card->host; + u64 limit = BLK_BOUNCE_HIGH; + + if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) + limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT; + + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, mq->queue); + if (mmc_can_erase(card)) + mmc_queue_setup_discard(mq->queue, card); + + blk_queue_bounce_limit(mq->queue, limit); + blk_queue_max_hw_sectors(mq->queue, + min(host->max_blk_count, host->max_req_size / 512)); + blk_queue_max_segments(mq->queue, host->max_segs); + blk_queue_max_segment_size(mq->queue, host->max_seg_size); + + /* Initialize thread_sem even if it is not used */ + sema_init(&mq->thread_sem, 1); +} + /** * mmc_init_queue - initialise a queue structure. * @mq: mmc queue @@ -190,12 +213,8 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock, const char *subname) { struct mmc_host *host = card->host; - u64 limit = BLK_BOUNCE_HIGH; int ret = -ENOMEM; - if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) - limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT; - mq->card = card; mq->queue = blk_alloc_queue(GFP_KERNEL); if (!mq->queue) @@ -214,18 +233,8 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, } blk_queue_prep_rq(mq->queue, mmc_prep_request); - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue); - queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, mq->queue); - if (mmc_can_erase(card)) - mmc_queue_setup_discard(mq->queue, card); - blk_queue_bounce_limit(mq->queue, limit); - blk_queue_max_hw_sectors(mq->queue, - min(host->max_blk_count, host->max_req_size / 512)); - blk_queue_max_segments(mq->queue, host->max_segs); - blk_queue_max_segment_size(mq->queue, host->max_seg_size); - - sema_init(&mq->thread_sem, 1); + mmc_setup_queue(mq, card); mq->thread = kthread_run(mmc_queue_thread, mq, "mmcqd/%d%s", host->index, subname ? 
subname : ""); From 563be8b60324f8947e9c71ec81a8cb67ea2f2127 Mon Sep 17 00:00:00 2001 From: rui_feng Date: Fri, 22 Sep 2017 16:07:35 +0800 Subject: [PATCH 0690/1085] mmc: rtsx: fix tuning fail on gen3 PCI-Express On gen3 PCI-Express we should send commands one by one; sending many commands in one packet can lead to a failure. Signed-off-by: rui_feng Signed-off-by: Ulf Hansson --- drivers/mmc/host/rtsx_pci_sdmmc.c | 38 +++++++++++++++---------------- include/linux/mfd/rtsx_pci.h | 1 + 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c index 41b57713b620..0848dc0f882e 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -618,29 +618,22 @@ static int sd_change_phase(struct realtek_pci_sdmmc *host, u8 sample_point, bool rx) { struct rtsx_pcr *pcr = host->pcr; - int err; dev_dbg(sdmmc_dev(host), "%s(%s): sample_point = %d\n", __func__, rx ? "RX" : "TX", sample_point); - rtsx_pci_init_cmd(pcr); - - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CLK_CTL, CHANGE_CLK, CHANGE_CLK); + rtsx_pci_write_register(pcr, CLK_CTL, CHANGE_CLK, CHANGE_CLK); if (rx) - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, - SD_VPRX_CTL, 0x1F, sample_point); + rtsx_pci_write_register(pcr, SD_VPRX_CTL, + PHASE_SELECT_MASK, sample_point); else - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, - SD_VPTX_CTL, 0x1F, sample_point); - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_VPCLK0_CTL, PHASE_NOT_RESET, 0); - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_VPCLK0_CTL, - PHASE_NOT_RESET, PHASE_NOT_RESET); - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CLK_CTL, CHANGE_CLK, 0); - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG1, SD_ASYNC_FIFO_NOT_RST, 0); - - err = rtsx_pci_send_cmd(pcr, 100); - if (err < 0) - return err; + rtsx_pci_write_register(pcr, SD_VPTX_CTL, + PHASE_SELECT_MASK, sample_point); + rtsx_pci_write_register(pcr, SD_VPCLK0_CTL, PHASE_NOT_RESET, 0); + rtsx_pci_write_register(pcr, SD_VPCLK0_CTL, PHASE_NOT_RESET, + PHASE_NOT_RESET); + rtsx_pci_write_register(pcr, CLK_CTL, CHANGE_CLK, 0); + rtsx_pci_write_register(pcr, SD_CFG1, SD_ASYNC_FIFO_NOT_RST, 0); return 0; } @@ -708,10 +701,12 @@ static int sd_tuning_rx_cmd(struct realtek_pci_sdmmc *host, { int err; struct mmc_command cmd = {}; + struct rtsx_pcr *pcr = host->pcr; - err = sd_change_phase(host, sample_point, true); - if (err < 0) - return err; + sd_change_phase(host, sample_point, true); + + rtsx_pci_write_register(pcr, SD_CFG3, SD_RSP_80CLK_TIMEOUT_EN, + SD_RSP_80CLK_TIMEOUT_EN); cmd.opcode = opcode; err = sd_read_data(host, &cmd, 0x40, NULL, 0, 100); @@ -719,9 +714,12 @@ static int sd_tuning_rx_cmd(struct realtek_pci_sdmmc *host, { /* Wait till SD DATA IDLE */ sd_wait_data_idle(host); sd_clear_error(host); + rtsx_pci_write_register(pcr, SD_CFG3, + SD_RSP_80CLK_TIMEOUT_EN, 0); return err; } + rtsx_pci_write_register(pcr, SD_CFG3, SD_RSP_80CLK_TIMEOUT_EN, 0); return 0; } diff --git a/include/linux/mfd/rtsx_pci.h b/include/linux/mfd/rtsx_pci.h index 116816fb9110..7815d8db7eca 100644 --- a/include/linux/mfd/rtsx_pci.h +++ b/include/linux/mfd/rtsx_pci.h @@ -334,6 +334,7 @@ #define DCM_DRP_RD_DATA_H 0xFC29 #define SD_VPCLK0_CTL 0xFC2A #define SD_VPCLK1_CTL 0xFC2B +#define PHASE_SELECT_MASK 0x1F #define SD_DCMPS0_CTL 0xFC2C #define SD_DCMPS1_CTL 0xFC2D #define SD_VPTX_CTL SD_VPCLK0_CTL From ec32e106a112e79385e4c78a7aede39dc34cad6a Mon Sep 17 00:00:00 2001 From: Allen Date: Fri, 22 Sep 2017 17:37:23 +0530 Subject: [PATCH 0691/1085] mmc-host: wbsd: use setup_timer() helper.
Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/host/wbsd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/wbsd.c b/drivers/mmc/host/wbsd.c index 546aaf8d1507..499852d8f706 100644 --- a/drivers/mmc/host/wbsd.c +++ b/drivers/mmc/host/wbsd.c @@ -1224,9 +1224,8 @@ static int wbsd_alloc_mmc(struct device *dev) /* * Set up timers */ - init_timer(&host->ignore_timer); - host->ignore_timer.data = (unsigned long)host; - host->ignore_timer.function = wbsd_reset_ignore; + setup_timer(&host->ignore_timer, wbsd_reset_ignore, + (unsigned long)host); /* * Maximum number of segments. Worst case is one sector per segment From 4ffd3aaf37b785536810b3783ce0f58d52e9a57e Mon Sep 17 00:00:00 2001 From: Allen Date: Fri, 22 Sep 2017 17:37:24 +0530 Subject: [PATCH 0692/1085] mmc-host: via: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/host/via-sdmmc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mmc/host/via-sdmmc.c b/drivers/mmc/host/via-sdmmc.c index a838bf5480d8..a8c97b6e59dc 100644 --- a/drivers/mmc/host/via-sdmmc.c +++ b/drivers/mmc/host/via-sdmmc.c @@ -1036,9 +1036,7 @@ static void via_init_mmc_host(struct via_crdr_mmc_host *host) u32 lenreg; u32 status; - init_timer(&host->timer); - host->timer.data = (unsigned long)host; - host->timer.function = via_sdc_timeout; + setup_timer(&host->timer, via_sdc_timeout, (unsigned long)host); spin_lock_init(&host->lock); From 15e8c7d9c17b3f2eb6d2547658582a67e15a93ac Mon Sep 17 00:00:00 2001 From: Allen Date: Fri, 22 Sep 2017 17:37:25 +0530 Subject: [PATCH 0693/1085] mmc-host: vub300: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/host/vub300.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/vub300.c b/drivers/mmc/host/vub300.c index 8f569d257405..c1a169843f99 100644 --- a/drivers/mmc/host/vub300.c +++ b/drivers/mmc/host/vub300.c @@ -2323,13 +2323,11 @@ static int vub300_probe(struct usb_interface *interface, INIT_WORK(&vub300->cmndwork, vub300_cmndwork_thread); INIT_WORK(&vub300->deadwork, vub300_deadwork_thread); kref_init(&vub300->kref); - init_timer(&vub300->sg_transfer_timer); - vub300->sg_transfer_timer.data = (unsigned long)vub300; - vub300->sg_transfer_timer.function = vub300_sg_timed_out; + setup_timer(&vub300->sg_transfer_timer, vub300_sg_timed_out, + (unsigned long)vub300); kref_get(&vub300->kref); - init_timer(&vub300->inactivity_timer); - vub300->inactivity_timer.data = (unsigned long)vub300; - vub300->inactivity_timer.function = vub300_inactivity_timer_expired; + setup_timer(&vub300->inactivity_timer, + vub300_inactivity_timer_expired, (unsigned long)vub300); vub300->inactivity_timer.expires = jiffies + HZ; add_timer(&vub300->inactivity_timer); if (vub300->card_present) From c7ddaa2ec992a096905d897fc00ae4901a912f2f Mon Sep 17 00:00:00 2001 From: Allen Date: Fri, 22 Sep 2017 17:37:26 +0530 Subject: [PATCH 0694/1085] mmc-host: mxcmmc: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. 
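All of these setup_timer() conversions follow the same mechanical pattern. A minimal sketch of that pattern, using a hypothetical foo_host structure rather than code from any one of the drivers touched here:

    #include <linux/timer.h>

    struct foo_host {
        struct timer_list watchdog;
    };

    /* timer callbacks took an unsigned long cookie in this kernel era */
    static void foo_watchdog(unsigned long data)
    {
        struct foo_host *host = (struct foo_host *)data;
        /* handle the watchdog expiry for this host */
    }

    static void foo_init_timer(struct foo_host *host)
    {
        /* open-coded form that these patches remove:
         *   init_timer(&host->watchdog);
         *   host->watchdog.function = foo_watchdog;
         *   host->watchdog.data = (unsigned long)host;
         * equivalent single call that these patches introduce:
         */
        setup_timer(&host->watchdog, foo_watchdog, (unsigned long)host);
    }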
Signed-off-by: Allen Pais Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/host/mxcmmc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index 1d5418e4efae..328484b96620 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -1165,9 +1165,7 @@ static int mxcmci_probe(struct platform_device *pdev) goto out_free_dma; } - init_timer(&host->watchdog); - host->watchdog.function = &mxcmci_watchdog; - host->watchdog.data = (unsigned long)mmc; + setup_timer(&host->watchdog, &mxcmci_watchdog, (unsigned long)mmc); mmc_add_host(mmc); From 2afcbdb0d88b41dace18d7dd2ace62ce4b997319 Mon Sep 17 00:00:00 2001 From: Ziyuan Date: Wed, 30 Aug 2017 10:58:21 +0800 Subject: [PATCH 0695/1085] mmc: dw_mmc: correct outdated comment for use_dma Since commit 3fc7eaef44db ("mmc: dw_mmc: Add external dma interface support") use_dma no longer indicates only the data transfer mode; it also records which DMA transmission channel is in use. Update the comment to make this clear. Signed-off-by: Ziyuan Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h index 34474ad731aa..e3124f06a47e 100644 --- a/drivers/mmc/host/dw_mmc.h +++ b/drivers/mmc/host/dw_mmc.h @@ -74,7 +74,8 @@ struct dw_mci_dma_slave { * @stop_abort: The command currently prepared for stoping transfer. * @prev_blksz: The former transfer blksz record. * @timing: Record of current ios timing. - * @use_dma: Whether DMA channel is initialized or not. + * @use_dma: Which DMA channel is in use for the current transfer, zero + * denotes PIO mode. * @using_dma: Whether DMA is in use for the current transfer. * @dma_64bit_address: Whether DMA supports 64-bit address mode or not. * @sg_dma: Bus address of DMA buffer. From 27d70d368021160fada401d3e2641fbd38a9d10a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 3 Sep 2017 14:39:50 +0100 Subject: [PATCH 0696/1085] mmc: dw_mmc: make const arrays mszs static Don't populate the const arrays mszs on the stack; instead, make them static.
Makes the object code smaller by over 310 bytes: Before: text data bss dec hex filename 47527 8528 320 56375 dc37 drivers/mmc/host/dw_mmc.o After: text data bss dec hex filename 47055 8688 320 56063 daff drivers/mmc/host/dw_mmc.o Signed-off-by: Colin Ian King Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 860313bd952a..45289c5e0295 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -796,7 +796,7 @@ static int dw_mci_edmac_start_dma(struct dw_mci *host, struct dma_slave_config cfg; struct dma_async_tx_descriptor *desc = NULL; struct scatterlist *sgl = host->data->sg; - const u32 mszs[] = {1, 4, 8, 16, 32, 64, 128, 256}; + static const u32 mszs[] = {1, 4, 8, 16, 32, 64, 128, 256}; u32 sg_elems = host->data->sg_len; u32 fifoth_val; u32 fifo_offset = host->fifo_reg - host->regs; @@ -1003,7 +1003,7 @@ static int dw_mci_get_cd(struct mmc_host *mmc) static void dw_mci_adjust_fifoth(struct dw_mci *host, struct mmc_data *data) { unsigned int blksz = data->blksz; - const u32 mszs[] = {1, 4, 8, 16, 32, 64, 128, 256}; + static const u32 mszs[] = {1, 4, 8, 16, 32, 64, 128, 256}; u32 fifo_width = 1 << host->data_shift; u32 blksz_depth = blksz / fifo_width, fifoth_val; u32 msize = 0, rx_wmark = 1, tx_wmark, tx_wmark_invers; From cb39f61e9b1e675c8df86df69d53f08334b34077 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:59 +0300 Subject: [PATCH 0697/1085] mmc: core: Export a few functions needed for blkmq support The following functions are needed by the mmc block device driver once it converts to blk-mq, so let's export them: mmc_start_bkops() mmc_start_request() mmc_retune_hold_now() mmc_retune_release() Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 3 ++- drivers/mmc/core/core.h | 2 ++ drivers/mmc/core/host.c | 7 +------ drivers/mmc/core/host.h | 7 ++++++- drivers/mmc/core/mmc_ops.c | 1 + 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 2ff614d4ffac..a9ee960fbb7e 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -337,7 +337,7 @@ static int mmc_mrq_prep(struct mmc_host *host, struct mmc_request *mrq) return 0; } -static int mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) +int mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) { int err; @@ -359,6 +359,7 @@ static int mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) return 0; } +EXPORT_SYMBOL(mmc_start_request); /* * mmc_wait_data_done() - done callback for data request diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index ba5a8fea0dc2..51a160762784 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -107,6 +107,8 @@ static inline void mmc_unregister_pm_notifier(struct mmc_host *host) { } void mmc_wait_for_req_done(struct mmc_host *host, struct mmc_request *mrq); bool mmc_is_req_done(struct mmc_host *host, struct mmc_request *mrq); +int mmc_start_request(struct mmc_host *host, struct mmc_request *mrq); + struct mmc_async_req; struct mmc_async_req *mmc_start_areq(struct mmc_host *host, diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index ad88deb2e8f3..e58be39b1568 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -111,12 +111,6 @@ void mmc_retune_hold(struct mmc_host *host) host->hold_retune +=
1; } -void mmc_retune_hold_now(struct mmc_host *host) -{ - host->retune_now = 0; - host->hold_retune += 1; -} - void mmc_retune_release(struct mmc_host *host) { if (host->hold_retune) @@ -124,6 +118,7 @@ void mmc_retune_release(struct mmc_host *host) else WARN_ON(1); } +EXPORT_SYMBOL(mmc_retune_release); int mmc_retune(struct mmc_host *host) { diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 77d6f60d1bf9..fb689a1065ed 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -19,12 +19,17 @@ void mmc_unregister_host_class(void); void mmc_retune_enable(struct mmc_host *host); void mmc_retune_disable(struct mmc_host *host); void mmc_retune_hold(struct mmc_host *host); -void mmc_retune_hold_now(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); void mmc_retune_pause(struct mmc_host *host); void mmc_retune_unpause(struct mmc_host *host); +static inline void mmc_retune_hold_now(struct mmc_host *host) +{ + host->retune_now = 0; + host->hold_retune += 1; +} + static inline void mmc_retune_recheck(struct mmc_host *host) { if (host->hold_retune <= 1) diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index a6b0a232f24a..908e4db03535 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -1007,6 +1007,7 @@ void mmc_start_bkops(struct mmc_card *card, bool from_exception) else mmc_retune_release(card->host); } +EXPORT_SYMBOL(mmc_start_bkops); /* * Flush the cache to the non-volatile storage. From 3f496afb6fb361b282f37968ff7d3d80b0f1b5cb Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Sep 2017 11:29:03 +0300 Subject: [PATCH 0698/1085] mmc: core: Factor out mmc_host_set_uhs_voltage() Factor out mmc_host_set_uhs_voltage() so it can be reused. 
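The helper's contract, restated: gate the clock, switch the signal voltage to 1.8V, keep the clock gated, then restore it; -EAGAIN means the switch failed. A sketch of how a caller is expected to reuse it (the concrete caller is in the diff below; mmc_power_cycle() is the existing core helper):

    if (mmc_host_set_uhs_voltage(host)) {
        /*
         * The voltage switch failed and the card state is unknown,
         * so the only safe recovery is a full power cycle.
         */
        mmc_power_cycle(host, ocr);
        return -EAGAIN;
    }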
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 38 ++++++++++++++++++++++++-------------- drivers/mmc/core/core.h | 1 + 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index a9ee960fbb7e..12b271c2a912 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1672,11 +1672,33 @@ int mmc_set_signal_voltage(struct mmc_host *host, int signal_voltage) } +int mmc_host_set_uhs_voltage(struct mmc_host *host) +{ + u32 clock; + + /* + * During a signal voltage level switch, the clock must be gated + * for 5 ms according to the SD spec + */ + clock = host->ios.clock; + host->ios.clock = 0; + mmc_set_ios(host); + + if (mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180)) + return -EAGAIN; + + /* Keep clock gated for at least 10 ms, though spec only says 5 ms */ + mmc_delay(10); + host->ios.clock = clock; + mmc_set_ios(host); + + return 0; +} + int mmc_set_uhs_voltage(struct mmc_host *host, u32 ocr) { struct mmc_command cmd = {}; int err = 0; - u32 clock; /* * If we cannot switch voltages, return failure so the caller @@ -1708,15 +1730,8 @@ int mmc_set_uhs_voltage(struct mmc_host *host, u32 ocr) err = -EAGAIN; goto power_cycle; } - /* - * During a signal voltage level switch, the clock must be gated - * for 5 ms according to the SD spec - */ - clock = host->ios.clock; - host->ios.clock = 0; - mmc_set_ios(host); - if (mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180)) { + if (mmc_host_set_uhs_voltage(host)) { /* * Voltages may not have been switched, but we've already * sent CMD11, so a power cycle is required anyway @@ -1725,11 +1740,6 @@ int mmc_set_uhs_voltage(struct mmc_host *host, u32 ocr) goto power_cycle; } - /* Keep clock gated for at least 10 ms, though spec only says 5 ms */ - mmc_delay(10); - host->ios.clock = clock; - mmc_set_ios(host); - /* Wait for at least 1 ms according to spec */ mmc_delay(1); diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 51a160762784..71e6c6d7ceb7 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -49,6 +49,7 @@ void mmc_set_bus_mode(struct mmc_host *host, unsigned int mode); void mmc_set_bus_width(struct mmc_host *host, unsigned int width); u32 mmc_select_voltage(struct mmc_host *host, u32 ocr); int mmc_set_uhs_voltage(struct mmc_host *host, u32 ocr); +int mmc_host_set_uhs_voltage(struct mmc_host *host); int mmc_set_signal_voltage(struct mmc_host *host, int signal_voltage); void mmc_set_timing(struct mmc_host *host, unsigned int timing); void mmc_set_driver_type(struct mmc_host *host, unsigned int drv_type); From 6a11fc47f175c8d87018e89cb58e2d36c66534cb Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Sep 2017 11:29:04 +0300 Subject: [PATCH 0699/1085] mmc: sd: Fix signal voltage when there is no power cycle Some boards have SD card connectors where the power rail cannot be switched off by the driver. However, there are various circumstances when a card might be re-initialized, such as after system resume, warm re-boot, or error handling, and a UHS card will continue to use 1.8V signaling unless it is power cycled, which means a card that has not been power cycled may still be using 1.8V signaling. According to the SD spec., the Bus Speed Mode (function group 1) bits 2 to 4 are zero if the card is initialized at the 3.3V signal level, so they can be used to determine whether the card has already switched to 1.8V signaling. Detect that situation and try to initialize a UHS-I (1.8V) transfer mode.
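Condensed, the fix below adds the following retry flow to the SD init path (all names are the ones introduced by the diff):

    /* card still signals at 1.8V but the host does not: try a UHS-I re-init */
    if (mmc_sd_card_using_v18(card) &&
        host->ios.signal_voltage != MMC_SIGNAL_VOLTAGE_180) {
        if (mmc_host_set_uhs_voltage(host) || mmc_sd_init_uhs_card(card)) {
            /* fixup failed: fall back to a power cycle and one full retry */
            v18_fixup_failed = true;
            mmc_power_cycle(host, ocr);
            goto retry;
        }
    }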
Tested with the following cards: Transcend 4GB High Speed Kingston 64GB SDR104 Lexar by Micron HIGH-PERFORMANCE 300x 16GB DDR50 SanDisk Ultra 8GB DDR50 Transcend Ultimate 600x 16GB SDR104 Transcend Premium 300x 64GB SDR104 Lexar by Micron Professional 1000x 32GB UHS-II SDR104 SanDisk Extreme Pro 16GB SDR104 Signed-off-by: Adrian Hunter Tested-by: Zhoujie Wu Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/core/sd.c | 47 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 2036b2b4835c..45bf78f32716 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -908,6 +908,18 @@ unsigned mmc_sd_get_max_clock(struct mmc_card *card) return max_dtr; } +static bool mmc_sd_card_using_v18(struct mmc_card *card) +{ + /* + * According to the SD spec., the Bus Speed Mode (function group 1) bits + * 2 to 4 are zero if the card is initialized at 3.3V signal level. Thus + * they can be used to determine if the card has already switched to + * 1.8V signaling. + */ + return card->sw_caps.sd3_bus_mode & + (SD_MODE_UHS_SDR50 | SD_MODE_UHS_SDR104 | SD_MODE_UHS_DDR50); +} + /* * Handle the detection and initialisation of a card. * @@ -921,9 +933,10 @@ static int mmc_sd_init_card(struct mmc_host *host, u32 ocr, int err; u32 cid[4]; u32 rocr = 0; + bool v18_fixup_failed = false; WARN_ON(!host->claimed); - +retry: err = mmc_sd_get_cid(host, ocr, cid, &rocr); if (err) return err; @@ -989,6 +1002,36 @@ static int mmc_sd_init_card(struct mmc_host *host, u32 ocr, if (err) goto free_card; + /* + * If the card has not been power cycled, it may still be using 1.8V + * signaling. Detect that situation and try to initialize a UHS-I (1.8V) + * transfer mode. + */ + if (!v18_fixup_failed && !mmc_host_is_spi(host) && mmc_host_uhs(host) && + mmc_sd_card_using_v18(card) && + host->ios.signal_voltage != MMC_SIGNAL_VOLTAGE_180) { + /* + * Re-read switch information in case it has changed since + * oldcard was initialized. + */ + if (oldcard) { + err = mmc_read_switch(card); + if (err) + goto free_card; + } + if (mmc_sd_card_using_v18(card)) { + if (mmc_host_set_uhs_voltage(host) || + mmc_sd_init_uhs_card(card)) { + v18_fixup_failed = true; + mmc_power_cycle(host, ocr); + if (!oldcard) + mmc_remove_card(card); + goto retry; + } + goto done; + } + } + /* Initialization sequence for UHS-I cards */ if (rocr & SD_ROCR_S18A) { err = mmc_sd_init_uhs_card(card); @@ -1021,7 +1064,7 @@ static int mmc_sd_init_card(struct mmc_host *host, u32 ocr, mmc_set_bus_width(host, MMC_BUS_WIDTH_4); } } - +done: host->card = card; return 0; From c7ccee224d2d551f712752c4a16947f6529d6506 Mon Sep 17 00:00:00 2001 From: Subhash Jadavani Date: Wed, 27 Sep 2017 11:04:40 +0530 Subject: [PATCH 0700/1085] mmc: sdhci-msm: fix issue with power irq SDCC controller reset (SW_RST) during probe may trigger power irq if previous status of PWRCTL was either BUS_ON or IO_HIGH_V. So before we enable the power irq interrupt in GIC (by registering the interrupt handler), we need to ensure that any pending power irq interrupt status is acknowledged; otherwise the power irq interrupt handler would be fired prematurely.
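The probe-time ordering the patch establishes, condensed from the diff below:

    /* ack any power irq left pending by the earlier controller reset */
    sdhci_msm_voltage_switch(host);

    /*
     * make sure the acks have reached the controller before the GIC
     * can deliver the interrupt
     */
    mb();

    /* only now register the handler, so it cannot fire on stale status */
    ret = devm_request_threaded_irq(&pdev->dev, msm_host->pwr_irq, NULL,
                                    sdhci_msm_pwr_irq, IRQF_ONESHOT,
                                    dev_name(&pdev->dev), host);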
Signed-off-by: Subhash Jadavani Signed-off-by: Vijay Viswanath Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 33919d2b35d1..ba2bc20f5d9a 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -1262,6 +1262,21 @@ static int sdhci_msm_probe(struct platform_device *pdev) CORE_VENDOR_SPEC_CAPABILITIES0); } + /* + * Power on reset state may trigger power irq if previous status of + * PWRCTL was either BUS_ON or IO_HIGH_V. So before enabling pwr irq + * interrupt in GIC, any pending power irq interrupt should be + * acknowledged. Otherwise power irq interrupt handler would be + * fired prematurely. + */ + sdhci_msm_voltage_switch(host); + + /* + * Ensure that above writes are propagated before interrupt enablement + * in GIC. + */ + mb(); + /* Setup IRQ for handling power/voltage tasks with PMIC */ msm_host->pwr_irq = platform_get_irq_byname(pdev, "pwr_irq"); if (msm_host->pwr_irq < 0) { @@ -1271,6 +1286,9 @@ static int sdhci_msm_probe(struct platform_device *pdev) goto clk_disable; } + /* Enable pwr irq interrupts */ + writel_relaxed(INT_MASK, msm_host->core_mem + CORE_PWRCTL_MASK); + ret = devm_request_threaded_irq(&pdev->dev, msm_host->pwr_irq, NULL, sdhci_msm_pwr_irq, IRQF_ONESHOT, dev_name(&pdev->dev), host); From 401b2d06c4ed69f5d491f9297651bed3fbbfe69b Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 27 Sep 2017 11:04:41 +0530 Subject: [PATCH 0701/1085] mmc: sdhci-msm: Fix HW issue with power IRQ handling during reset There is a rare scenario in HW, where the first clear pulse could be lost when the actual reset and clear/read of status register are happening at the same time. Fix this by retrying up to 10 times to ensure the status register gets cleared. Otherwise, this will lead to a spurious power IRQ which results in system instability.
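The shape of the fix is a bounded clear-and-poll loop; a condensed sketch of the loop the diff below adds to the power irq handler:

    int retry = 10;

    /* re-issue the clear until the status bits actually read back as zero */
    while (irq_status & readl_relaxed(msm_host->core_mem +
                                      CORE_PWRCTL_STATUS)) {
        if (retry == 0) {
            pr_err("%s: timed out clearing pwrctl status\n",
                   mmc_hostname(host->mmc));
            break;
        }
        writel_relaxed(irq_status,
                       msm_host->core_mem + CORE_PWRCTL_CLEAR);
        retry--;
        udelay(10);
    }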
Signed-off-by: Sahitya Tummala Signed-off-by: Vijay Viswanath Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 46 ++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index ba2bc20f5d9a..79cb06064084 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -995,17 +995,52 @@ static void sdhci_msm_set_uhs_signaling(struct sdhci_host *host, sdhci_msm_hs400(host, &mmc->ios); } -static void sdhci_msm_voltage_switch(struct sdhci_host *host) +static void sdhci_msm_dump_pwr_ctrl_regs(struct sdhci_host *host) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); + + pr_err("%s: PWRCTL_STATUS: 0x%08x | PWRCTL_MASK: 0x%08x | PWRCTL_CTL: 0x%08x\n", + mmc_hostname(host->mmc), + readl_relaxed(msm_host->core_mem + CORE_PWRCTL_STATUS), + readl_relaxed(msm_host->core_mem + CORE_PWRCTL_MASK), + readl_relaxed(msm_host->core_mem + CORE_PWRCTL_CTL)); +} + +static void sdhci_msm_handle_pwr_irq(struct sdhci_host *host, int irq) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); u32 irq_status, irq_ack = 0; + int retry = 10; irq_status = readl_relaxed(msm_host->core_mem + CORE_PWRCTL_STATUS); irq_status &= INT_MASK; writel_relaxed(irq_status, msm_host->core_mem + CORE_PWRCTL_CLEAR); + /* + * There is a rare HW scenario where the first clear pulse could be + * lost when the actual reset and clear/read of the status register + * are happening at the same time. Hence, retry up to 10 times to make + * sure the status register is cleared. Otherwise, this will result in + * a spurious power IRQ resulting in system instability. + */ + while (irq_status & readl_relaxed(msm_host->core_mem + + CORE_PWRCTL_STATUS)) { + if (retry == 0) { + pr_err("%s: Timedout clearing (0x%x) pwrctl status register\n", + mmc_hostname(host->mmc), irq_status); + sdhci_msm_dump_pwr_ctrl_regs(host); + WARN_ON(1); + break; + } + writel_relaxed(irq_status, + msm_host->core_mem + CORE_PWRCTL_CLEAR); + retry--; + udelay(10); + } + if (irq_status & (CORE_PWRCTL_BUS_ON | CORE_PWRCTL_BUS_OFF)) irq_ack |= CORE_PWRCTL_BUS_SUCCESS; if (irq_status & (CORE_PWRCTL_IO_LOW | CORE_PWRCTL_IO_HIGH)) @@ -1017,13 +1052,17 @@ static void sdhci_msm_voltage_switch(struct sdhci_host *host) * switches are handled by the sdhci core, so just report success. */ writel_relaxed(irq_ack, msm_host->core_mem + CORE_PWRCTL_CTL); + + pr_debug("%s: %s: Handled IRQ(%d), irq_status=0x%x, ack=0x%x\n", + mmc_hostname(msm_host->mmc), __func__, irq, irq_status, + irq_ack); } static irqreturn_t sdhci_msm_pwr_irq(int irq, void *data) { struct sdhci_host *host = (struct sdhci_host *)data; - sdhci_msm_voltage_switch(host); + sdhci_msm_handle_pwr_irq(host, irq); return IRQ_HANDLED; } @@ -1107,7 +1146,6 @@ static const struct sdhci_ops sdhci_msm_ops = { .get_max_clock = sdhci_msm_get_max_clock, .set_bus_width = sdhci_set_bus_width, .set_uhs_signaling = sdhci_msm_set_uhs_signaling, - .voltage_switch = sdhci_msm_voltage_switch, }; static const struct sdhci_pltfm_data sdhci_msm_pdata = { @@ -1269,7 +1307,7 @@ static int sdhci_msm_probe(struct platform_device *pdev) * acknowledged. Otherwise power irq interrupt handler would be * fired prematurely.
*/ - sdhci_msm_voltage_switch(host); + sdhci_msm_handle_pwr_irq(host, 0); /* * Ensure that above writes are propagated before interrupt enablement From 99d570da309813f67e9c741edeff55bafc6c1d5e Mon Sep 17 00:00:00 2001 From: Vijay Viswanath Date: Wed, 27 Sep 2017 11:04:42 +0530 Subject: [PATCH 0702/1085] mmc: Kconfig: Enable CONFIG_MMC_SDHCI_IO_ACCESSORS Enable CONFIG_MMC_SDHCI_IO_ACCESSORS so that SDHC controller specific register read and write APIs, if registered, can be used. Signed-off-by: Vijay Viswanath Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 17afe1ad3a03..37281d4201b1 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -429,6 +429,7 @@ config MMC_SDHCI_MSM tristate "Qualcomm SDHCI Controller Support" depends on ARCH_QCOM || (ARM && COMPILE_TEST) depends on MMC_SDHCI_PLTFM + select MMC_SDHCI_IO_ACCESSORS help This selects the Secure Digital Host Controller Interface (SDHCI) support present in Qualcomm SOCs. The controller supports From c0309b3803fed96cc9adb0a8701e271177f5cb04 Mon Sep 17 00:00:00 2001 From: Vijay Viswanath Date: Wed, 27 Sep 2017 11:04:43 +0530 Subject: [PATCH 0703/1085] mmc: sdhci-msm: Add sdhci msm register write APIs which wait for pwr irq Register writes which change voltage of IO lines or turn the IO bus on/off require the controller to be ready before progressing further. When the controller is ready, it will generate a power irq which needs to be handled. The thread which initiated the register write should wait for power irq to complete. This will be done through the new sdhc msm write APIs which will check whether the particular write can trigger a power irq and wait for it with a timeout if it is expected. The SDHC core power control IRQ gets triggered when - * There is a state change in power control bit (bit 0) of SDHCI_POWER_CONTROL register. * There is a state change in 1.8V enable bit (bit 3) of SDHCI_HOST_CONTROL2 register. * Bit 1 of SDHCI_SOFTWARE_RESET is set. Also add support APIs which are used by sdhc msm write APIs to check if power irq is expected to be generated and wait for the power irq to come and complete if the irq is expected. This patch requires CONFIG_MMC_SDHCI_IO_ACCESSORS to be enabled.
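Condensed from the diff below, the mechanism has two halves: the accessor decides whether a given write can raise a power irq, and the initiating thread then blocks on a wait queue with a timeout until the irq handler signals completion (the warning call shown here is a placeholder for the one the patch actually uses):

    /* write side: flag the write, issue it, then wait if an irq is expected */
    req_type = __sdhci_msm_check_write(host, val, reg); /* clears pwr_irq_flag */
    writeb_relaxed(val, host->ioaddr + reg);
    if (req_type)
        sdhci_msm_check_power_status(host, req_type);

    /* wait side, inside sdhci_msm_check_power_status(): */
    if (!wait_event_timeout(msm_host->pwr_irq_wait, msm_host->pwr_irq_flag,
                            msecs_to_jiffies(MSM_PWR_IRQ_TIMEOUT_MS)))
        pr_warn("%s: pwr_irq timed out\n", mmc_hostname(host->mmc));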
Signed-off-by: Sahitya Tummala Signed-off-by: Vijay Viswanath Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 173 ++++++++++++++++++++++++++++++++++- 1 file changed, 171 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 79cb06064084..85a882b4c541 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -123,6 +123,10 @@ #define CMUX_SHIFT_PHASE_MASK (7 << CMUX_SHIFT_PHASE_SHIFT) #define MSM_MMC_AUTOSUSPEND_DELAY_MS 50 + +/* Timeout value to avoid infinite waiting for pwr_irq */ +#define MSM_PWR_IRQ_TIMEOUT_MS 5000 + struct sdhci_msm_host { struct platform_device *pdev; void __iomem *core_mem; /* MSM SDCC mapped address */ @@ -137,6 +141,10 @@ struct sdhci_msm_host { bool calibration_done; u8 saved_tuning_phase; bool use_cdclp533; + u32 curr_pwr_state; + u32 curr_io_level; + wait_queue_head_t pwr_irq_wait; + bool pwr_irq_flag; }; static unsigned int msm_get_clock_rate_for_bus_mode(struct sdhci_host *host, @@ -995,6 +1003,73 @@ static void sdhci_msm_set_uhs_signaling(struct sdhci_host *host, sdhci_msm_hs400(host, &mmc->ios); } +static inline void sdhci_msm_init_pwr_irq_wait(struct sdhci_msm_host *msm_host) +{ + init_waitqueue_head(&msm_host->pwr_irq_wait); +} + +static inline void sdhci_msm_complete_pwr_irq_wait( + struct sdhci_msm_host *msm_host) +{ + wake_up(&msm_host->pwr_irq_wait); +} + +/* + * sdhci_msm_check_power_status API should be called when register writes + * which can toggle sdhci IO bus ON/OFF or change IO lines HIGH/LOW happen. + * To what state the register writes will change the IO lines should be passed + * as the argument req_type. This API will check whether the IO line's state + * is already the expected state and will wait for power irq only if + * power irq is expected to be triggered based on the current IO line state + * and expected IO line state. + */ +static void sdhci_msm_check_power_status(struct sdhci_host *host, u32 req_type) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); + bool done = false; + + pr_debug("%s: %s: request %d curr_pwr_state %x curr_io_level %x\n", + mmc_hostname(host->mmc), __func__, req_type, + msm_host->curr_pwr_state, msm_host->curr_io_level); + + /* + * The IRQ for request type IO High/LOW will be generated when - + * there is a state change in 1.8V enable bit (bit 3) of + * SDHCI_HOST_CONTROL2 register. The reset state of that bit is 0 + * which indicates 3.3V IO voltage. So, when MMC core layer tries + * to set it to 3.3V before card detection happens, the + * IRQ doesn't get triggered as there is no state change in this bit. + * The driver already handles this case by changing the IO voltage + * level to high as part of controller power up sequence. Hence, check + * for host->pwr to handle a case where IO voltage high request is + * issued even before controller power up. + */ + if ((req_type & REQ_IO_HIGH) && !host->pwr) { + pr_debug("%s: do not wait for power IRQ that never comes, req_type: %d\n", + mmc_hostname(host->mmc), req_type); + return; + } + if ((req_type & msm_host->curr_pwr_state) || + (req_type & msm_host->curr_io_level)) + done = true; + /* + * This is needed here to handle cases where register writes will + * not change the current bus state or io level of the controller. + * In this case, no power irq will be triggered and we should + * not wait.
+ */ + if (!done) { + if (!wait_event_timeout(msm_host->pwr_irq_wait, + msm_host->pwr_irq_flag, + msecs_to_jiffies(MSM_PWR_IRQ_TIMEOUT_MS))) + __WARN_printf("%s: pwr_irq for req: (%d) timed out\n", + mmc_hostname(host->mmc), req_type); + } + pr_debug("%s: %s: request %d done\n", mmc_hostname(host->mmc), + __func__, req_type); +} + static void sdhci_msm_dump_pwr_ctrl_regs(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1013,6 +1088,8 @@ static void sdhci_msm_handle_pwr_irq(struct sdhci_host *host, int irq) struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); u32 irq_status, irq_ack = 0; int retry = 10; + int pwr_state = 0, io_level = 0; + irq_status = readl_relaxed(msm_host->core_mem + CORE_PWRCTL_STATUS); irq_status &= INT_MASK; @@ -1041,10 +1118,26 @@ static void sdhci_msm_handle_pwr_irq(struct sdhci_host *host, int irq) udelay(10); } - if (irq_status & (CORE_PWRCTL_BUS_ON | CORE_PWRCTL_BUS_OFF)) + /* Handle BUS ON/OFF*/ + if (irq_status & CORE_PWRCTL_BUS_ON) { + pwr_state = REQ_BUS_ON; + io_level = REQ_IO_HIGH; irq_ack |= CORE_PWRCTL_BUS_SUCCESS; - if (irq_status & (CORE_PWRCTL_IO_LOW | CORE_PWRCTL_IO_HIGH)) + } + if (irq_status & CORE_PWRCTL_BUS_OFF) { + pwr_state = REQ_BUS_OFF; + io_level = REQ_IO_LOW; + irq_ack |= CORE_PWRCTL_BUS_SUCCESS; + } + /* Handle IO LOW/HIGH */ + if (irq_status & CORE_PWRCTL_IO_LOW) { + io_level = REQ_IO_LOW; irq_ack |= CORE_PWRCTL_IO_SUCCESS; + } + if (irq_status & CORE_PWRCTL_IO_HIGH) { + io_level = REQ_IO_HIGH; + irq_ack |= CORE_PWRCTL_IO_SUCCESS; + } /* * The driver has to acknowledge the interrupt, switch voltages and @@ -1053,6 +1146,11 @@ static void sdhci_msm_handle_pwr_irq(struct sdhci_host *host, int irq) */ writel_relaxed(irq_ack, msm_host->core_mem + CORE_PWRCTL_CTL); + if (pwr_state) + msm_host->curr_pwr_state = pwr_state; + if (io_level) + msm_host->curr_io_level = io_level; + pr_debug("%s: %s: Handled IRQ(%d), irq_status=0x%x, ack=0x%x\n", mmc_hostname(msm_host->mmc), __func__, irq, irq_status, irq_ack); @@ -1061,8 +1159,13 @@ static void sdhci_msm_handle_pwr_irq(struct sdhci_host *host, int irq) static irqreturn_t sdhci_msm_pwr_irq(int irq, void *data) { struct sdhci_host *host = (struct sdhci_host *)data; + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); sdhci_msm_handle_pwr_irq(host, irq); + msm_host->pwr_irq_flag = 1; + sdhci_msm_complete_pwr_irq_wait(msm_host); + return IRQ_HANDLED; } @@ -1132,6 +1235,69 @@ out: __sdhci_msm_set_clock(host, clock); } +/* + * Platform specific register write functions. This is so that, if any + * register write needs to be followed up by platform specific actions, + * they can be added here. These functions can go to sleep when writes + * to certain registers are done. + * These functions are relying on sdhci_set_ios not using spinlock. + */ +static int __sdhci_msm_check_write(struct sdhci_host *host, u16 val, int reg) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); + u32 req_type = 0; + + switch (reg) { + case SDHCI_HOST_CONTROL2: + req_type = (val & SDHCI_CTRL_VDD_180) ? REQ_IO_LOW : + REQ_IO_HIGH; + break; + case SDHCI_SOFTWARE_RESET: + if (host->pwr && (val & SDHCI_RESET_ALL)) + req_type = REQ_BUS_OFF; + break; + case SDHCI_POWER_CONTROL: + req_type = !val ? 
REQ_BUS_OFF : REQ_BUS_ON; + break; + } + + if (req_type) { + msm_host->pwr_irq_flag = 0; + /* + * Since this register write may trigger a power irq, ensure + * all previous register writes are complete by this point. + */ + mb(); + } + return req_type; +} + +/* This function may sleep*/ +static void sdhci_msm_writew(struct sdhci_host *host, u16 val, int reg) +{ + u32 req_type = 0; + + req_type = __sdhci_msm_check_write(host, val, reg); + writew_relaxed(val, host->ioaddr + reg); + + if (req_type) + sdhci_msm_check_power_status(host, req_type); +} + +/* This function may sleep*/ +static void sdhci_msm_writeb(struct sdhci_host *host, u8 val, int reg) +{ + u32 req_type = 0; + + req_type = __sdhci_msm_check_write(host, val, reg); + + writeb_relaxed(val, host->ioaddr + reg); + + if (req_type) + sdhci_msm_check_power_status(host, req_type); +} + static const struct of_device_id sdhci_msm_dt_match[] = { { .compatible = "qcom,sdhci-msm-v4" }, {}, @@ -1146,6 +1312,8 @@ static const struct sdhci_ops sdhci_msm_ops = { .get_max_clock = sdhci_msm_get_max_clock, .set_bus_width = sdhci_set_bus_width, .set_uhs_signaling = sdhci_msm_set_uhs_signaling, + .write_w = sdhci_msm_writew, + .write_b = sdhci_msm_writeb, }; static const struct sdhci_pltfm_data sdhci_msm_pdata = { @@ -1324,6 +1492,7 @@ static int sdhci_msm_probe(struct platform_device *pdev) goto clk_disable; } + sdhci_msm_init_pwr_irq_wait(msm_host); /* Enable pwr irq interrupts */ writel_relaxed(INT_MASK, msm_host->core_mem + CORE_PWRCTL_MASK); From 62467bbbbc4e37a0f491588bb6059bce56f31737 Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 2 Oct 2017 14:02:41 +0200 Subject: [PATCH 0704/1085] mmc: cavium: Depend on GPIO driver Without the ThunderX/OcteonTx GPIO driver the MMC driver would not power up any MMC devices. Therefore add a dependency to the GPIO driver and remove the unneeded GPIOLIB dependency. Signed-off-by: Jan Glauber Acked-by: David Daney Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 37281d4201b1..8a4447b624a8 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -664,7 +664,7 @@ config MMC_CAVIUM_OCTEON config MMC_CAVIUM_THUNDERX tristate "Cavium ThunderX SD/MMC Card Interface support" depends on PCI && 64BIT && (ARM64 || COMPILE_TEST) - depends on GPIOLIB + depends on GPIO_THUNDERX depends on OF_ADDRESS help This selects Cavium ThunderX SD/MMC Card Interface. From 3a8e9cad3e6d7870a789ffd6de0b8687691e10c9 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 4 Oct 2017 08:38:24 +0200 Subject: [PATCH 0705/1085] mmc: sdhci-s3c: Fix driver data for Exynos4 SoCs Support for non-dt based initialization for Exynos SoCs has been removed, so there is no need to keep driver IDs for this case. While touching this, replace odd conditional code for instantiating driver data for Exynos4 SoCs with a simple reference and move that driver data under CONFIG_OF. 
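With the Exynos4 driver data now referenced only from the OF match table, a probe path would typically retrieve it through the match data; a sketch, assuming of_device_get_match_data() rather than whatever lookup sdhci-s3c actually uses (not shown in this hunk):

    #include <linux/of_device.h>

    static const struct sdhci_s3c_drv_data *
    sdhci_s3c_match_data(struct platform_device *pdev)
    {
        /* returns the .data of the matched of_device_id, or NULL */
        return of_device_get_match_data(&pdev->dev);
    }

    /* a caller can then test e.g. drv_data && drv_data->no_divider */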
Signed-off-by: Marek Szyprowski Acked-by: Krzysztof Kozlowski Reviewed-by: Sylwester Nawrocki Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-s3c.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index d328fcf284d1..cda83ccb2702 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -761,32 +761,24 @@ static const struct dev_pm_ops sdhci_s3c_pmops = { NULL) }; -#if defined(CONFIG_CPU_EXYNOS4210) || defined(CONFIG_SOC_EXYNOS4212) -static struct sdhci_s3c_drv_data exynos4_sdhci_drv_data = { - .no_divider = true, -}; -#define EXYNOS4_SDHCI_DRV_DATA ((kernel_ulong_t)&exynos4_sdhci_drv_data) -#else -#define EXYNOS4_SDHCI_DRV_DATA ((kernel_ulong_t)NULL) -#endif - static const struct platform_device_id sdhci_s3c_driver_ids[] = { { .name = "s3c-sdhci", .driver_data = (kernel_ulong_t)NULL, - }, { - .name = "exynos4-sdhci", - .driver_data = EXYNOS4_SDHCI_DRV_DATA, }, { } }; MODULE_DEVICE_TABLE(platform, sdhci_s3c_driver_ids); #ifdef CONFIG_OF +static struct sdhci_s3c_drv_data exynos4_sdhci_drv_data = { + .no_divider = true, +}; + static const struct of_device_id sdhci_s3c_dt_match[] = { { .compatible = "samsung,s3c6410-sdhci", }, { .compatible = "samsung,exynos4210-sdhci", - .data = (void *)EXYNOS4_SDHCI_DRV_DATA }, + .data = &exynos4_sdhci_drv_data }, {}, }; MODULE_DEVICE_TABLE(of, sdhci_s3c_dt_match); From 085cc3ab39d44ccb0ee051b0bc58f4dd0c0cbfaf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 10:56:54 +0100 Subject: [PATCH 0706/1085] mmc: dw_mmc-k3: make array hs_timing_cfg static The array hs_timing_cfg is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'hs_timing_cfg' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-k3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/dw_mmc-k3.c b/drivers/mmc/host/dw_mmc-k3.c index 64cda84b2302..73fd75c3c824 100644 --- a/drivers/mmc/host/dw_mmc-k3.c +++ b/drivers/mmc/host/dw_mmc-k3.c @@ -75,7 +75,7 @@ struct hs_timing { u32 smpl_phase_min; }; -struct hs_timing hs_timing_cfg[TIMING_MODE][TIMING_CFG_NUM] = { +static struct hs_timing hs_timing_cfg[TIMING_MODE][TIMING_CFG_NUM] = { { /* reserved */ }, { /* SD */ {7, 0, 15, 15,}, /* 0: LEGACY 400k */ From 519c51af0d176aef90ba9d3266cccbbb09e68d0c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 11:06:36 +0100 Subject: [PATCH 0707/1085] mmc: sdhci-of-at91: make function sdhci_at91_set_uhs_signaling static The function sdhci_at91_set_uhs_signaling is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'sdhci_at91_set_uhs_signaling' was not declared. Should it be static? 
Signed-off-by: Colin Ian King Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-at91.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 4e47ed6bc716..682c573e20a7 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -114,7 +114,8 @@ static void sdhci_at91_set_power(struct sdhci_host *host, unsigned char mode, sdhci_set_power_noreg(host, mode, vdd); } -void sdhci_at91_set_uhs_signaling(struct sdhci_host *host, unsigned int timing) +static void sdhci_at91_set_uhs_signaling(struct sdhci_host *host, + unsigned int timing) { if (timing == MMC_TIMING_MMC_DDR52) sdhci_writeb(host, SDMMC_MC1R_DDR, SDMMC_MC1R); From 17f5e716703b3281293790e83ad3d6a0bd8b9681 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Tue, 3 Oct 2017 13:24:16 +0200 Subject: [PATCH 0708/1085] dt-bindings: mmc: Document the Amlogic Meson8 and Meson8b SDIO bindings This documents the devicetree bindings for the SDIO/MMC host found in Amlogic Meson8 and Meson8b SoCs. It supports the SD specification v2.0 and the eMMC specification v4.41. It has an internal "mux" which allows connecting up to three MMC devices to it. The maximum supported bus-width is 4-bits. Amlogic's GPL kernel sources call it "SDIO" to differentiate it from the other MMC controller in (at least) the Meson8 and Meson8b SoCs (they call the other one "SDHC", which supports a bus-width of up to 8-bits). Signed-off-by: Carlo Caione Signed-off-by: Martin Blumenstingl Acked-by: Rob Herring Signed-off-by: Ulf Hansson --- .../bindings/mmc/amlogic,meson-mx-sdio.txt | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 Documentation/devicetree/bindings/mmc/amlogic,meson-mx-sdio.txt diff --git a/Documentation/devicetree/bindings/mmc/amlogic,meson-mx-sdio.txt b/Documentation/devicetree/bindings/mmc/amlogic,meson-mx-sdio.txt new file mode 100644 index 000000000000..8765c605e6bc --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/amlogic,meson-mx-sdio.txt @@ -0,0 +1,54 @@ +* Amlogic Meson6, Meson8 and Meson8b SDIO/MMC controller + +The highspeed MMC host controller on Amlogic SoCs provides an interface +for MMC, SD, SDIO and SDHC types of memory cards. + +Supported maximum speeds are the ones of the eMMC standard 4.41 as well +as the speed of SD standard 2.0. + +The hardware provides an internal "mux" which allows up to three slots +to be controlled. Only one slot can be accessed at a time. + +Required properties: + - compatible : must be one of + - "amlogic,meson8-sdio" + - "amlogic,meson8b-sdio" + along with the generic "amlogic,meson-mx-sdio" + - reg : mmc controller base registers + - interrupts : mmc controller interrupt + - #address-cells : must be 1 + - #size-cells : must be 0 + - clocks : phandle to clock providers + - clock-names : must contain "core" and "clkin" + +Required child nodes: +A node for each slot provided by the MMC controller is required. +NOTE: due to a driver limitation currently only one slot (= child node) + is supported!
+ +Required properties on each child node (= slot): + - compatible : must be "mmc-slot" (see mmc.txt within this directory) + - reg : the slot (or "port") ID + +Optional properties on each child node (= slot): + - bus-width : must be 1 or 4 (8-bit bus is not supported) + - for cd and all other additional generic mmc parameters + please refer to mmc.txt within this directory + +Examples: + mmc@c1108c20 { + compatible = "amlogic,meson8-sdio", "amlogic,meson-mx-sdio"; + reg = <0xc1108c20 0x20>; + interrupts = <0 28 1>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&clkc CLKID_SDIO>, <&clkc CLKID_CLK81>; + clock-names = "core", "clkin"; + + slot@1 { + compatible = "mmc-slot"; + reg = <1>; + + bus-width = <4>; + }; + }; From ed80a13bb4c4c9a66aca11228930c5950d21c6f7 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Tue, 3 Oct 2017 13:24:17 +0200 Subject: [PATCH 0709/1085] mmc: meson-mx-sdio: Add a driver for the Amlogic Meson8 and Meson8b SoCs Add a driver for the SDIO/MMC host found on the Amlogic Meson SoCs. This is an MMC controller which provides an interface between the application processor and various memory cards. It supports the SD specification v2.0 and the eMMC specification v4.41. The controller provides an internal "mux" which allows connecting up to three MMC devices to it. Only one device can be used at a time though since the registers are shared across all devices. The driver takes care of synchronizing access (similar to the dw_mmc driver). The maximum supported bus-width is 4-bits. Amlogic's GPL kernel sources call the corresponding driver "aml_sdio" to differentiate it from the other MMC controller in (at least) the Meson8 and Meson8b SoCs (they call the other drivers aml_sdhc and aml_sdhc_m8, which seem to support a bus-width of up to 8-bits). This means that there are three different MMC host controller IP blocks from Amlogic (each of them with its own register layout and feature set): - "SDIO": 1 and 4 bit bus width, support for high-speed modes up to UHS-I SDR50, part of Meson6, Meson8 and Meson8b (the driver from this patch targets this controller) - "SDHC": 1, 4 and 8 bit bus width, compatible with standard iNAND interface, support for speeds up to HS200 and MMC spec up to version 4.5x, part of Meson8 and Meson8b SoCs (there is no mainline driver for this controller yet) - "SDEMMC": 1, 4 and 8 bit bus width, support for speeds up to HS400 and MMC spec up to version 5.0, part of the Meson GX (64-bit) SoCs (supported by the meson-gx MMC host driver) Signed-off-by: Carlo Caione Signed-off-by: Martin Blumenstingl Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 13 + drivers/mmc/host/Makefile | 1 + drivers/mmc/host/meson-mx-sdio.c | 769 +++++++++++++++++++++++++++++++ 3 files changed, 783 insertions(+) create mode 100644 drivers/mmc/host/meson-mx-sdio.c diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 8a4447b624a8..567028c9219a 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -352,6 +352,19 @@ config MMC_MESON_GX If you have a controller with this interface, say Y here. +config MMC_MESON_MX_SDIO + tristate "Amlogic Meson6/Meson8/Meson8b SD/MMC Host Controller support" + depends on ARCH_MESON || COMPILE_TEST + depends on COMMON_CLK + depends on HAS_DMA + depends on OF + help + This selects support for the SD/MMC Host Controller on + Amlogic Meson6, Meson8 and Meson8b SoCs. + + If you have a controller with this interface, say Y or M here. + If unsure, say N.
+ config MMC_MOXART tristate "MOXART SD/MMC Host Controller support" depends on ARCH_MOXART && MMC diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 2b5a8133948d..ab61a3e39c0b 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_MMC_VUB300) += vub300.o obj-$(CONFIG_MMC_USHC) += ushc.o obj-$(CONFIG_MMC_WMT) += wmt-sdmmc.o obj-$(CONFIG_MMC_MESON_GX) += meson-gx-mmc.o +obj-$(CONFIG_MMC_MESON_MX_SDIO) += meson-mx-sdio.o obj-$(CONFIG_MMC_MOXART) += moxart-mmc.o obj-$(CONFIG_MMC_SUNXI) += sunxi-mmc.o obj-$(CONFIG_MMC_USDHI6ROL0) += usdhi6rol0.o diff --git a/drivers/mmc/host/meson-mx-sdio.c b/drivers/mmc/host/meson-mx-sdio.c new file mode 100644 index 000000000000..19b499bbe691 --- /dev/null +++ b/drivers/mmc/host/meson-mx-sdio.c @@ -0,0 +1,769 @@ +/* + * meson-mx-sdio.c - Meson6, Meson8 and Meson8b SDIO/MMC Host Controller + * + * Copyright (C) 2015 Endless Mobile, Inc. + * Author: Carlo Caione + * Copyright (C) 2017 Martin Blumenstingl + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MESON_MX_SDIO_ARGU 0x00 + +#define MESON_MX_SDIO_SEND 0x04 + #define MESON_MX_SDIO_SEND_COMMAND_INDEX_MASK GENMASK(7, 0) + #define MESON_MX_SDIO_SEND_CMD_RESP_BITS_MASK GENMASK(15, 8) + #define MESON_MX_SDIO_SEND_RESP_WITHOUT_CRC7 BIT(16) + #define MESON_MX_SDIO_SEND_RESP_HAS_DATA BIT(17) + #define MESON_MX_SDIO_SEND_RESP_CRC7_FROM_8 BIT(18) + #define MESON_MX_SDIO_SEND_CHECK_DAT0_BUSY BIT(19) + #define MESON_MX_SDIO_SEND_DATA BIT(20) + #define MESON_MX_SDIO_SEND_USE_INT_WINDOW BIT(21) + #define MESON_MX_SDIO_SEND_REPEAT_PACKAGE_TIMES_MASK GENMASK(31, 24) + +#define MESON_MX_SDIO_CONF 0x08 + #define MESON_MX_SDIO_CONF_CMD_CLK_DIV_SHIFT 0 + #define MESON_MX_SDIO_CONF_CMD_CLK_DIV_WIDTH 10 + #define MESON_MX_SDIO_CONF_CMD_DISABLE_CRC BIT(10) + #define MESON_MX_SDIO_CONF_CMD_OUT_AT_POSITIVE_EDGE BIT(11) + #define MESON_MX_SDIO_CONF_CMD_ARGUMENT_BITS_MASK GENMASK(17, 12) + #define MESON_MX_SDIO_CONF_RESP_LATCH_AT_NEGATIVE_EDGE BIT(18) + #define MESON_MX_SDIO_CONF_DATA_LATCH_AT_NEGATIVE_EDGE BIT(19) + #define MESON_MX_SDIO_CONF_BUS_WIDTH BIT(20) + #define MESON_MX_SDIO_CONF_M_ENDIAN_MASK GENMASK(22, 21) + #define MESON_MX_SDIO_CONF_WRITE_NWR_MASK GENMASK(28, 23) + #define MESON_MX_SDIO_CONF_WRITE_CRC_OK_STATUS_MASK GENMASK(31, 29) + +#define MESON_MX_SDIO_IRQS 0x0c + #define MESON_MX_SDIO_IRQS_STATUS_STATE_MACHINE_MASK GENMASK(3, 0) + #define MESON_MX_SDIO_IRQS_CMD_BUSY BIT(4) + #define MESON_MX_SDIO_IRQS_RESP_CRC7_OK BIT(5) + #define MESON_MX_SDIO_IRQS_DATA_READ_CRC16_OK BIT(6) + #define MESON_MX_SDIO_IRQS_DATA_WRITE_CRC16_OK BIT(7) + #define MESON_MX_SDIO_IRQS_IF_INT BIT(8) + #define MESON_MX_SDIO_IRQS_CMD_INT BIT(9) + #define MESON_MX_SDIO_IRQS_STATUS_INFO_MASK GENMASK(15, 12) + #define MESON_MX_SDIO_IRQS_TIMING_OUT_INT BIT(16) + #define MESON_MX_SDIO_IRQS_AMRISC_TIMING_OUT_INT_EN BIT(17) + #define MESON_MX_SDIO_IRQS_ARC_TIMING_OUT_INT_EN BIT(18) + #define MESON_MX_SDIO_IRQS_TIMING_OUT_COUNT_MASK GENMASK(31, 19) + +#define MESON_MX_SDIO_IRQC 0x10 + #define MESON_MX_SDIO_IRQC_ARC_IF_INT_EN BIT(3) + #define MESON_MX_SDIO_IRQC_ARC_CMD_INT_EN BIT(4) + 
#define MESON_MX_SDIO_IRQC_IF_CONFIG_MASK GENMASK(7, 6) + #define MESON_MX_SDIO_IRQC_FORCE_DATA_CLK BIT(8) + #define MESON_MX_SDIO_IRQC_FORCE_DATA_CMD BIT(9) + #define MESON_MX_SDIO_IRQC_FORCE_DATA_DAT_MASK GENMASK(10, 13) + #define MESON_MX_SDIO_IRQC_SOFT_RESET BIT(15) + #define MESON_MX_SDIO_IRQC_FORCE_HALT BIT(30) + #define MESON_MX_SDIO_IRQC_HALT_HOLE BIT(31) + +#define MESON_MX_SDIO_MULT 0x14 + #define MESON_MX_SDIO_MULT_PORT_SEL_MASK GENMASK(1, 0) + #define MESON_MX_SDIO_MULT_MEMORY_STICK_ENABLE BIT(2) + #define MESON_MX_SDIO_MULT_MEMORY_STICK_SCLK_ALWAYS BIT(3) + #define MESON_MX_SDIO_MULT_STREAM_ENABLE BIT(4) + #define MESON_MX_SDIO_MULT_STREAM_8BITS_MODE BIT(5) + #define MESON_MX_SDIO_MULT_WR_RD_OUT_INDEX BIT(8) + #define MESON_MX_SDIO_MULT_DAT0_DAT1_SWAPPED BIT(10) + #define MESON_MX_SDIO_MULT_DAT1_DAT0_SWAPPED BIT(11) + #define MESON_MX_SDIO_MULT_RESP_READ_INDEX_MASK GENMASK(15, 12) + +#define MESON_MX_SDIO_ADDR 0x18 + +#define MESON_MX_SDIO_EXT 0x1c + #define MESON_MX_SDIO_EXT_DATA_RW_NUMBER_MASK GENMASK(29, 16) + +#define MESON_MX_SDIO_BOUNCE_REQ_SIZE (128 * 1024) +#define MESON_MX_SDIO_RESPONSE_CRC16_BITS (16 - 1) +#define MESON_MX_SDIO_MAX_SLOTS 3 + +struct meson_mx_mmc_host { + struct device *controller_dev; + + struct clk *parent_clk; + struct clk *core_clk; + struct clk_divider cfg_div; + struct clk *cfg_div_clk; + struct clk_fixed_factor fixed_factor; + struct clk *fixed_factor_clk; + + void __iomem *base; + int irq; + spinlock_t irq_lock; + + struct timer_list cmd_timeout; + + unsigned int slot_id; + struct mmc_host *mmc; + + struct mmc_request *mrq; + struct mmc_command *cmd; + int error; +}; + +static void meson_mx_mmc_mask_bits(struct mmc_host *mmc, char reg, u32 mask, + u32 val) +{ + struct meson_mx_mmc_host *host = mmc_priv(mmc); + u32 regval; + + regval = readl(host->base + reg); + regval &= ~mask; + regval |= (val & mask); + + writel(regval, host->base + reg); +} + +static void meson_mx_mmc_soft_reset(struct meson_mx_mmc_host *host) +{ + writel(MESON_MX_SDIO_IRQC_SOFT_RESET, host->base + MESON_MX_SDIO_IRQC); + udelay(2); +} + +static struct mmc_command *meson_mx_mmc_get_next_cmd(struct mmc_command *cmd) +{ + if (cmd->opcode == MMC_SET_BLOCK_COUNT && !cmd->error) + return cmd->mrq->cmd; + else if (mmc_op_multi(cmd->opcode) && + (!cmd->mrq->sbc || cmd->error || cmd->data->error)) + return cmd->mrq->stop; + else + return NULL; +} + +static void meson_mx_mmc_start_cmd(struct mmc_host *mmc, + struct mmc_command *cmd) +{ + struct meson_mx_mmc_host *host = mmc_priv(mmc); + unsigned int pack_size; + unsigned long irqflags, timeout; + u32 mult, send = 0, ext = 0; + + host->cmd = cmd; + + if (cmd->busy_timeout) + timeout = msecs_to_jiffies(cmd->busy_timeout); + else + timeout = msecs_to_jiffies(1000); + + switch (mmc_resp_type(cmd)) { + case MMC_RSP_R1: + case MMC_RSP_R1B: + case MMC_RSP_R3: + /* 7 (CMD) + 32 (response) + 7 (CRC) -1 */ + send |= FIELD_PREP(MESON_MX_SDIO_SEND_CMD_RESP_BITS_MASK, 45); + break; + case MMC_RSP_R2: + /* 7 (CMD) + 120 (response) + 7 (CRC) -1 */ + send |= FIELD_PREP(MESON_MX_SDIO_SEND_CMD_RESP_BITS_MASK, 133); + send |= MESON_MX_SDIO_SEND_RESP_CRC7_FROM_8; + break; + default: + break; + } + + if (!(cmd->flags & MMC_RSP_CRC)) + send |= MESON_MX_SDIO_SEND_RESP_WITHOUT_CRC7; + + if (cmd->flags & MMC_RSP_BUSY) + send |= MESON_MX_SDIO_SEND_CHECK_DAT0_BUSY; + + if (cmd->data) { + send |= FIELD_PREP(MESON_MX_SDIO_SEND_REPEAT_PACKAGE_TIMES_MASK, + (cmd->data->blocks - 1)); + + pack_size = cmd->data->blksz * BITS_PER_BYTE; + if (mmc->ios.bus_width == 
MMC_BUS_WIDTH_4) + pack_size += MESON_MX_SDIO_RESPONSE_CRC16_BITS * 4; + else + pack_size += MESON_MX_SDIO_RESPONSE_CRC16_BITS * 1; + + ext |= FIELD_PREP(MESON_MX_SDIO_EXT_DATA_RW_NUMBER_MASK, + pack_size); + + if (cmd->data->flags & MMC_DATA_WRITE) + send |= MESON_MX_SDIO_SEND_DATA; + else + send |= MESON_MX_SDIO_SEND_RESP_HAS_DATA; + + cmd->data->bytes_xfered = 0; + } + + send |= FIELD_PREP(MESON_MX_SDIO_SEND_COMMAND_INDEX_MASK, + (0x40 | cmd->opcode)); + + spin_lock_irqsave(&host->irq_lock, irqflags); + + mult = readl(host->base + MESON_MX_SDIO_MULT); + mult &= ~MESON_MX_SDIO_MULT_PORT_SEL_MASK; + mult |= FIELD_PREP(MESON_MX_SDIO_MULT_PORT_SEL_MASK, host->slot_id); + mult |= BIT(31); + writel(mult, host->base + MESON_MX_SDIO_MULT); + + /* enable the CMD done interrupt */ + meson_mx_mmc_mask_bits(mmc, MESON_MX_SDIO_IRQC, + MESON_MX_SDIO_IRQC_ARC_CMD_INT_EN, + MESON_MX_SDIO_IRQC_ARC_CMD_INT_EN); + + /* clear pending interrupts */ + meson_mx_mmc_mask_bits(mmc, MESON_MX_SDIO_IRQS, + MESON_MX_SDIO_IRQS_CMD_INT, + MESON_MX_SDIO_IRQS_CMD_INT); + + writel(cmd->arg, host->base + MESON_MX_SDIO_ARGU); + writel(ext, host->base + MESON_MX_SDIO_EXT); + writel(send, host->base + MESON_MX_SDIO_SEND); + + spin_unlock_irqrestore(&host->irq_lock, irqflags); + + mod_timer(&host->cmd_timeout, jiffies + timeout); +} + +static void meson_mx_mmc_request_done(struct meson_mx_mmc_host *host) +{ + struct mmc_request *mrq; + + mrq = host->mrq; + + host->mrq = NULL; + host->cmd = NULL; + + mmc_request_done(host->mmc, mrq); +} + +static void meson_mx_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) +{ + struct meson_mx_mmc_host *host = mmc_priv(mmc); + unsigned short vdd = ios->vdd; + unsigned long clk_rate = ios->clock; + + switch (ios->bus_width) { + case MMC_BUS_WIDTH_1: + meson_mx_mmc_mask_bits(mmc, MESON_MX_SDIO_CONF, + MESON_MX_SDIO_CONF_BUS_WIDTH, 0); + break; + + case MMC_BUS_WIDTH_4: + meson_mx_mmc_mask_bits(mmc, MESON_MX_SDIO_CONF, + MESON_MX_SDIO_CONF_BUS_WIDTH, + MESON_MX_SDIO_CONF_BUS_WIDTH); + break; + + case MMC_BUS_WIDTH_8: + default: + dev_err(mmc_dev(mmc), "unsupported bus width: %d\n", + ios->bus_width); + host->error = -EINVAL; + return; + } + + host->error = clk_set_rate(host->cfg_div_clk, ios->clock); + if (host->error) { + dev_warn(mmc_dev(mmc), + "failed to set MMC clock to %lu: %d\n", + clk_rate, host->error); + return; + } + + mmc->actual_clock = clk_get_rate(host->cfg_div_clk); + + switch (ios->power_mode) { + case MMC_POWER_OFF: + vdd = 0; + /* fall-through: */ + case MMC_POWER_UP: + if (!IS_ERR(mmc->supply.vmmc)) { + host->error = mmc_regulator_set_ocr(mmc, + mmc->supply.vmmc, + vdd); + if (host->error) + return; + } + break; + } +} + +static int meson_mx_mmc_map_dma(struct mmc_host *mmc, struct mmc_request *mrq) +{ + struct mmc_data *data = mrq->data; + int dma_len; + struct scatterlist *sg; + + if (!data) + return 0; + + sg = data->sg; + if (sg->offset & 3 || sg->length & 3) { + dev_err(mmc_dev(mmc), + "unaligned scatterlist: offset %x length %d\n", + sg->offset, sg->length); + return -EINVAL; + } + + dma_len = dma_map_sg(mmc_dev(mmc), data->sg, data->sg_len, + mmc_get_dma_dir(data)); + if (dma_len <= 0) { + dev_err(mmc_dev(mmc), "dma_map_sg failed\n"); + return -ENOMEM; + } + + return 0; +} + +static void meson_mx_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) +{ + struct meson_mx_mmc_host *host = mmc_priv(mmc); + struct mmc_command *cmd = mrq->cmd; + + if (!host->error) + host->error = meson_mx_mmc_map_dma(mmc, mrq); + + if (host->error) { + cmd->error = 
host->error; + mmc_request_done(mmc, mrq); + return; + } + + host->mrq = mrq; + + if (mrq->data) + writel(sg_dma_address(mrq->data->sg), + host->base + MESON_MX_SDIO_ADDR); + + if (mrq->sbc) + meson_mx_mmc_start_cmd(mmc, mrq->sbc); + else + meson_mx_mmc_start_cmd(mmc, mrq->cmd); +} + +static int meson_mx_mmc_card_busy(struct mmc_host *mmc) +{ + struct meson_mx_mmc_host *host = mmc_priv(mmc); + u32 irqc = readl(host->base + MESON_MX_SDIO_IRQC); + + return !!(irqc & MESON_MX_SDIO_IRQC_FORCE_DATA_DAT_MASK); +} + +static void meson_mx_mmc_read_response(struct mmc_host *mmc, + struct mmc_command *cmd) +{ + struct meson_mx_mmc_host *host = mmc_priv(mmc); + u32 mult; + int i, resp[4]; + + mult = readl(host->base + MESON_MX_SDIO_MULT); + mult |= MESON_MX_SDIO_MULT_WR_RD_OUT_INDEX; + mult &= ~MESON_MX_SDIO_MULT_RESP_READ_INDEX_MASK; + mult |= FIELD_PREP(MESON_MX_SDIO_MULT_RESP_READ_INDEX_MASK, 0); + writel(mult, host->base + MESON_MX_SDIO_MULT); + + if (cmd->flags & MMC_RSP_136) { + for (i = 0; i <= 3; i++) + resp[3 - i] = readl(host->base + MESON_MX_SDIO_ARGU); + cmd->resp[0] = (resp[0] << 8) | ((resp[1] >> 24) & 0xff); + cmd->resp[1] = (resp[1] << 8) | ((resp[2] >> 24) & 0xff); + cmd->resp[2] = (resp[2] << 8) | ((resp[3] >> 24) & 0xff); + cmd->resp[3] = (resp[3] << 8); + } else if (cmd->flags & MMC_RSP_PRESENT) { + cmd->resp[0] = readl(host->base + MESON_MX_SDIO_ARGU); + } +} + +static irqreturn_t meson_mx_mmc_process_cmd_irq(struct meson_mx_mmc_host *host, + u32 irqs, u32 send) +{ + struct mmc_command *cmd = host->cmd; + + /* + * NOTE: even though it shouldn't happen we sometimes get command + * interrupts twice (at least this is what it looks like). Ideally + * we find out why this happens and warn here as soon as it occurs. + */ + if (!cmd) + return IRQ_HANDLED; + + cmd->error = 0; + meson_mx_mmc_read_response(host->mmc, cmd); + + if (cmd->data) { + if (!((irqs & MESON_MX_SDIO_IRQS_DATA_READ_CRC16_OK) || + (irqs & MESON_MX_SDIO_IRQS_DATA_WRITE_CRC16_OK))) + cmd->error = -EILSEQ; + } else { + if (!((irqs & MESON_MX_SDIO_IRQS_RESP_CRC7_OK) || + (send & MESON_MX_SDIO_SEND_RESP_WITHOUT_CRC7))) + cmd->error = -EILSEQ; + } + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t meson_mx_mmc_irq(int irq, void *data) +{ + struct meson_mx_mmc_host *host = (void *) data; + u32 irqs, send; + unsigned long irqflags; + irqreturn_t ret; + + spin_lock_irqsave(&host->irq_lock, irqflags); + + irqs = readl(host->base + MESON_MX_SDIO_IRQS); + send = readl(host->base + MESON_MX_SDIO_SEND); + + if (irqs & MESON_MX_SDIO_IRQS_CMD_INT) + ret = meson_mx_mmc_process_cmd_irq(host, irqs, send); + else + ret = IRQ_HANDLED; + + /* finally ACK all pending interrupts */ + writel(irqs, host->base + MESON_MX_SDIO_IRQS); + + spin_unlock_irqrestore(&host->irq_lock, irqflags); + + return ret; +} + +static irqreturn_t meson_mx_mmc_irq_thread(int irq, void *irq_data) +{ + struct meson_mx_mmc_host *host = (void *) irq_data; + struct mmc_command *cmd = host->cmd, *next_cmd; + + if (WARN_ON(!cmd)) + return IRQ_HANDLED; + + del_timer_sync(&host->cmd_timeout); + + if (cmd->data) { + dma_unmap_sg(mmc_dev(host->mmc), cmd->data->sg, + cmd->data->sg_len, + mmc_get_dma_dir(cmd->data)); + + cmd->data->bytes_xfered = cmd->data->blksz * cmd->data->blocks; + } + + next_cmd = meson_mx_mmc_get_next_cmd(cmd); + if (next_cmd) + meson_mx_mmc_start_cmd(host->mmc, next_cmd); + else + meson_mx_mmc_request_done(host); + + return IRQ_HANDLED; +} + +static void meson_mx_mmc_timeout(unsigned long arg) +{ + struct meson_mx_mmc_host *host = (void *) arg; + 
unsigned long irqflags; + u32 irqc; + + spin_lock_irqsave(&host->irq_lock, irqflags); + + /* disable the CMD interrupt */ + irqc = readl(host->base + MESON_MX_SDIO_IRQC); + irqc &= ~MESON_MX_SDIO_IRQC_ARC_CMD_INT_EN; + writel(irqc, host->base + MESON_MX_SDIO_IRQC); + + spin_unlock_irqrestore(&host->irq_lock, irqflags); + + /* + * skip the timeout handling if the interrupt handler already processed + * the command. + */ + if (!host->cmd) + return; + + dev_dbg(mmc_dev(host->mmc), + "Timeout on CMD%u (IRQS = 0x%08x, ARGU = 0x%08x)\n", + host->cmd->opcode, readl(host->base + MESON_MX_SDIO_IRQS), + readl(host->base + MESON_MX_SDIO_ARGU)); + + host->cmd->error = -ETIMEDOUT; + + meson_mx_mmc_request_done(host); +} + +static struct mmc_host_ops meson_mx_mmc_ops = { + .request = meson_mx_mmc_request, + .set_ios = meson_mx_mmc_set_ios, + .card_busy = meson_mx_mmc_card_busy, + .get_cd = mmc_gpio_get_cd, + .get_ro = mmc_gpio_get_ro, +}; + +static struct platform_device *meson_mx_mmc_slot_pdev(struct device *parent) +{ + struct device_node *slot_node; + + /* + * TODO: the MMC core framework currently does not support + * controllers with multiple slots properly. So we only register + * the first slot for now + */ + slot_node = of_find_compatible_node(parent->of_node, NULL, "mmc-slot"); + if (!slot_node) { + dev_warn(parent, "no 'mmc-slot' sub-node found\n"); + return ERR_PTR(-ENOENT); + } + + return of_platform_device_create(slot_node, NULL, parent); +} + +static int meson_mx_mmc_add_host(struct meson_mx_mmc_host *host) +{ + struct mmc_host *mmc = host->mmc; + struct device *slot_dev = mmc_dev(mmc); + int ret; + + if (of_property_read_u32(slot_dev->of_node, "reg", &host->slot_id)) { + dev_err(slot_dev, "missing 'reg' property\n"); + return -EINVAL; + } + + if (host->slot_id >= MESON_MX_SDIO_MAX_SLOTS) { + dev_err(slot_dev, "invalid 'reg' property value %d\n", + host->slot_id); + return -EINVAL; + } + + /* Get regulators and the supported OCR mask */ + ret = mmc_regulator_get_supply(mmc); + if (ret == -EPROBE_DEFER) + return ret; + + mmc->max_req_size = MESON_MX_SDIO_BOUNCE_REQ_SIZE; + mmc->max_seg_size = mmc->max_req_size; + mmc->max_blk_count = + FIELD_GET(MESON_MX_SDIO_SEND_REPEAT_PACKAGE_TIMES_MASK, + 0xffffffff); + mmc->max_blk_size = FIELD_GET(MESON_MX_SDIO_EXT_DATA_RW_NUMBER_MASK, + 0xffffffff); + mmc->max_blk_size -= (4 * MESON_MX_SDIO_RESPONSE_CRC16_BITS); + mmc->max_blk_size /= BITS_PER_BYTE; + + /* Get the min and max supported clock rates */ + mmc->f_min = clk_round_rate(host->cfg_div_clk, 1); + mmc->f_max = clk_round_rate(host->cfg_div_clk, + clk_get_rate(host->parent_clk)); + + mmc->caps |= MMC_CAP_ERASE | MMC_CAP_CMD23; + mmc->ops = &meson_mx_mmc_ops; + + ret = mmc_of_parse(mmc); + if (ret) + return ret; + + ret = mmc_add_host(mmc); + if (ret) + return ret; + + return 0; +} + +static int meson_mx_mmc_register_clks(struct meson_mx_mmc_host *host) +{ + struct clk_init_data init; + const char *clk_div_parent, *clk_fixed_factor_parent; + + clk_fixed_factor_parent = __clk_get_name(host->parent_clk); + init.name = devm_kasprintf(host->controller_dev, GFP_KERNEL, + "%s#fixed_factor", + dev_name(host->controller_dev)); + init.ops = &clk_fixed_factor_ops; + init.flags = 0; + init.parent_names = &clk_fixed_factor_parent; + init.num_parents = 1; + host->fixed_factor.div = 2; + host->fixed_factor.mult = 1; + host->fixed_factor.hw.init = &init; + + host->fixed_factor_clk = devm_clk_register(host->controller_dev, + &host->fixed_factor.hw); + if (WARN_ON(PTR_ERR_OR_ZERO(host->fixed_factor_clk))) + 
return PTR_ERR(host->fixed_factor_clk); + + clk_div_parent = __clk_get_name(host->fixed_factor_clk); + init.name = devm_kasprintf(host->controller_dev, GFP_KERNEL, + "%s#div", dev_name(host->controller_dev)); + init.ops = &clk_divider_ops; + init.flags = CLK_SET_RATE_PARENT; + init.parent_names = &clk_div_parent; + init.num_parents = 1; + host->cfg_div.reg = host->base + MESON_MX_SDIO_CONF; + host->cfg_div.shift = MESON_MX_SDIO_CONF_CMD_CLK_DIV_SHIFT; + host->cfg_div.width = MESON_MX_SDIO_CONF_CMD_CLK_DIV_WIDTH; + host->cfg_div.hw.init = &init; + host->cfg_div.flags = CLK_DIVIDER_ALLOW_ZERO; + + host->cfg_div_clk = devm_clk_register(host->controller_dev, + &host->cfg_div.hw); + if (WARN_ON(PTR_ERR_OR_ZERO(host->cfg_div_clk))) + return PTR_ERR(host->fixed_factor_clk); + + return 0; +} + +static int meson_mx_mmc_probe(struct platform_device *pdev) +{ + struct platform_device *slot_pdev; + struct mmc_host *mmc; + struct meson_mx_mmc_host *host; + struct resource *res; + int ret, irq; + u32 conf; + + slot_pdev = meson_mx_mmc_slot_pdev(&pdev->dev); + if (!slot_pdev) + return -ENODEV; + else if (IS_ERR(slot_pdev)) + return PTR_ERR(slot_pdev); + + mmc = mmc_alloc_host(sizeof(*host), &slot_pdev->dev); + if (!mmc) { + ret = -ENOMEM; + goto error_unregister_slot_pdev; + } + + host = mmc_priv(mmc); + host->mmc = mmc; + host->controller_dev = &pdev->dev; + + spin_lock_init(&host->irq_lock); + setup_timer(&host->cmd_timeout, meson_mx_mmc_timeout, + (unsigned long)host); + + platform_set_drvdata(pdev, host); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + host->base = devm_ioremap_resource(host->controller_dev, res); + if (IS_ERR(host->base)) { + ret = PTR_ERR(host->base); + goto error_free_mmc; + } + + irq = platform_get_irq(pdev, 0); + ret = devm_request_threaded_irq(host->controller_dev, irq, + meson_mx_mmc_irq, + meson_mx_mmc_irq_thread, IRQF_ONESHOT, + NULL, host); + if (ret) + goto error_free_mmc; + + host->core_clk = devm_clk_get(host->controller_dev, "core"); + if (IS_ERR(host->core_clk)) { + ret = PTR_ERR(host->core_clk); + goto error_free_mmc; + } + + host->parent_clk = devm_clk_get(host->controller_dev, "clkin"); + if (IS_ERR(host->parent_clk)) { + ret = PTR_ERR(host->parent_clk); + goto error_free_mmc; + } + + ret = meson_mx_mmc_register_clks(host); + if (ret) + goto error_free_mmc; + + ret = clk_prepare_enable(host->core_clk); + if (ret) { + dev_err(host->controller_dev, "Failed to enable core clock\n"); + goto error_free_mmc; + } + + ret = clk_prepare_enable(host->cfg_div_clk); + if (ret) { + dev_err(host->controller_dev, "Failed to enable MMC clock\n"); + goto error_disable_core_clk; + } + + conf = 0; + conf |= FIELD_PREP(MESON_MX_SDIO_CONF_CMD_ARGUMENT_BITS_MASK, 39); + conf |= FIELD_PREP(MESON_MX_SDIO_CONF_M_ENDIAN_MASK, 0x3); + conf |= FIELD_PREP(MESON_MX_SDIO_CONF_WRITE_NWR_MASK, 0x2); + conf |= FIELD_PREP(MESON_MX_SDIO_CONF_WRITE_CRC_OK_STATUS_MASK, 0x2); + writel(conf, host->base + MESON_MX_SDIO_CONF); + + meson_mx_mmc_soft_reset(host); + + ret = meson_mx_mmc_add_host(host); + if (ret) + goto error_disable_clks; + + return 0; + +error_disable_clks: + clk_disable_unprepare(host->cfg_div_clk); +error_disable_core_clk: + clk_disable_unprepare(host->core_clk); +error_free_mmc: + mmc_free_host(mmc); +error_unregister_slot_pdev: + of_platform_device_destroy(&slot_pdev->dev, NULL); + return ret; +} + +static int meson_mx_mmc_remove(struct platform_device *pdev) +{ + struct meson_mx_mmc_host *host = platform_get_drvdata(pdev); + struct device *slot_dev = mmc_dev(host->mmc); + + 
del_timer_sync(&host->cmd_timeout); + + mmc_remove_host(host->mmc); + + of_platform_device_destroy(slot_dev, NULL); + + clk_disable_unprepare(host->cfg_div_clk); + clk_disable_unprepare(host->core_clk); + + mmc_free_host(host->mmc); + + return 0; +} + +static const struct of_device_id meson_mx_mmc_of_match[] = { + { .compatible = "amlogic,meson8-sdio", }, + { .compatible = "amlogic,meson8b-sdio", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, meson_mx_mmc_of_match); + +static struct platform_driver meson_mx_mmc_driver = { + .probe = meson_mx_mmc_probe, + .remove = meson_mx_mmc_remove, + .driver = { + .name = "meson-mx-sdio", + .of_match_table = of_match_ptr(meson_mx_mmc_of_match), + }, +}; + +module_platform_driver(meson_mx_mmc_driver); + +MODULE_DESCRIPTION("Meson6, Meson8 and Meson8b SDIO/MMC Host Driver"); +MODULE_AUTHOR("Carlo Caione "); +MODULE_AUTHOR("Martin Blumenstingl "); +MODULE_LICENSE("GPL v2"); From aaab3c46557029b760a476d5afc070a55df935b4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 8 Oct 2017 16:50:08 +0200 Subject: [PATCH 0710/1085] mmc: sunxi: drop superfluous error message This error message can go because a) currently nothing else than EPROBE_DEFER is returned and b) if this is going to change a much more detailed error message should come from mmc_regulator_get_supply() anyhow. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/sunxi-mmc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index 53c970fe0873..cc98355dbdb9 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1175,11 +1175,8 @@ static int sunxi_mmc_resource_request(struct sunxi_mmc_host *host, return -EINVAL; ret = mmc_regulator_get_supply(host->mmc); - if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, "Could not get vmmc supply\n"); + if (ret) return ret; - } host->reg_base = devm_ioremap_resource(&pdev->dev, platform_get_resource(pdev, IORESOURCE_MEM, 0)); From a3d95d1d4007b1fefd6d8b12db26fda05de05cfb Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Fri, 22 Sep 2017 12:22:17 +0100 Subject: [PATCH 0711/1085] mmc: tmio: check mmc_regulator_get_supply return value mmc_regulator_get_supply returns -EPROBE_DEFER if either vmmc or vqmmc regulators had their probing deferred. vqmmc regulator is needed by UHS to work properly, therefore this patch checks the value returned by mmc_regulator_get_supply to make sure we have a reference to both vmmc and vqmmc (if found in the DT). 
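For illustration, the check added below boils down to this probe-time pattern (a minimal sketch, not the literal tmio code):

	err = mmc_regulator_get_supply(mmc);
	if (err)	/* typically -EPROBE_DEFER while vmmc/vqmmc are still probing */
		return err;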
Signed-off-by: Fabrizio Castro Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 9c4e6199b854..3a6d49f07e22 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -1113,8 +1113,11 @@ static int tmio_mmc_init_ocr(struct tmio_mmc_host *host) { struct tmio_mmc_data *pdata = host->pdata; struct mmc_host *mmc = host->mmc; + int err; - mmc_regulator_get_supply(mmc); + err = mmc_regulator_get_supply(mmc); + if (err) + return err; /* use ocr_mask if no regulator */ if (!mmc->ocr_avail) From 9ccfa81725b92f9aec78bfb3e6c3e2e3c903e701 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Oct 2017 11:14:16 +0200 Subject: [PATCH 0712/1085] mmc: sdhci-msm: fix x86 build error The __WARN_printf() function is not portable across architectures and causes a compile-time error on x86 and others that don't use the asm-generic version of asm/bug.h: drivers/mmc/host/sdhci-msm.c: In function 'sdhci_msm_check_power_status': drivers/mmc/host/sdhci-msm.c:1066:4: error: implicit declaration of function '__WARN_printf'; did you mean '__dev_printk'? [-Werror=implicit-function-declaration] __WARN_printf("%s: pwr_irq for req: (%d) timed out\n", ^~~~~~~~~~~~~ The change that introduced this error, "mmc: sdhci-msm: Add sdhci msm register write APIs which wait for pwr irq", likely meant to use dev_warn(), so I'm changing over to that. Signed-off-by: Arnd Bergmann Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 85a882b4c541..3fb7d2eec93f 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -1063,8 +1063,9 @@ static void sdhci_msm_check_power_status(struct sdhci_host *host, u32 req_type) if (!wait_event_timeout(msm_host->pwr_irq_wait, msm_host->pwr_irq_flag, msecs_to_jiffies(MSM_PWR_IRQ_TIMEOUT_MS))) - __WARN_printf("%s: pwr_irq for req: (%d) timed out\n", - mmc_hostname(host->mmc), req_type); + dev_warn(&msm_host->pdev->dev, + "%s: pwr_irq for req: (%d) timed out\n", + mmc_hostname(host->mmc), req_type); } pr_debug("%s: %s: request %d done\n", mmc_hostname(host->mmc), __func__, req_type); From 04fa0540255ee5707a1addf4c7c2a5ba4ac2c407 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 12 Oct 2017 10:46:59 -0700 Subject: [PATCH 0713/1085] mmc: core: export emmc revision Expose emmc revision as part of device attributes. 
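As a purely illustrative userspace example (the "mmc0:0001" device name below is made up), the new attribute reads like any other sysfs file:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/bus/mmc/devices/mmc0:0001/rev", "r");
		char rev[16];

		if (f && fgets(rev, sizeof(rev), f))
			printf("EXT_CSD revision: %s", rev);	/* e.g. 0x8 for an eMMC 5.1 part */
		if (f)
			fclose(f);
		return 0;
	}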
Signed-off-by: Jin Qian Signed-off-by: Ulf Hansson --- Documentation/ABI/testing/sysfs-bus-mmc | 4 ++++ drivers/mmc/core/mmc.c | 2 ++ 2 files changed, 6 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-mmc diff --git a/Documentation/ABI/testing/sysfs-bus-mmc b/Documentation/ABI/testing/sysfs-bus-mmc new file mode 100644 index 000000000000..519f028d19cc --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-mmc @@ -0,0 +1,4 @@ +What: /sys/bus/mmc/devices/.../rev +Date: October 2017 +Contact: Jin Qian +Description: Extended CSD revision number diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 44d67ee8bf7b..066efbb89483 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -780,6 +780,7 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid); MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name); MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid); MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv); +MMC_DEV_ATTR(rev, "0x%x\n", card->ext_csd.rev); MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info); MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n", card->ext_csd.device_life_time_est_typ_a, @@ -838,6 +839,7 @@ static struct attribute *mmc_std_attrs[] = { &dev_attr_name.attr, &dev_attr_oemid.attr, &dev_attr_prv.attr, + &dev_attr_rev.attr, &dev_attr_pre_eol_info.attr, &dev_attr_life_time.attr, &dev_attr_serial.attr, From 7599b8493292ae28295127f617148b1c7394c27d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Oct 2017 14:19:43 +0300 Subject: [PATCH 0714/1085] mmc: meson-mx-sdio: return correct error code This has a copy and paste bug so we use "host->fixed_factor_clk" which is a valid pointer instead of "host->cfg_div_clk" which holds the error code. Fixes: ed80a13bb4c4 ("mmc: meson-mx-sdio: Add a driver for the Amlogic Meson8 and Meson8b SoCs") Signed-off-by: Dan Carpenter Acked-by: Martin Blumenstingl Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-mx-sdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/meson-mx-sdio.c b/drivers/mmc/host/meson-mx-sdio.c index 19b499bbe691..dc30ed5e964b 100644 --- a/drivers/mmc/host/meson-mx-sdio.c +++ b/drivers/mmc/host/meson-mx-sdio.c @@ -621,7 +621,7 @@ static int meson_mx_mmc_register_clks(struct meson_mx_mmc_host *host) host->cfg_div_clk = devm_clk_register(host->controller_dev, &host->cfg_div.hw); if (WARN_ON(PTR_ERR_OR_ZERO(host->cfg_div_clk))) - return PTR_ERR(host->fixed_factor_clk); + return PTR_ERR(host->cfg_div_clk); return 0; } From 2f129d39adb37899dec2c872bea0a6d1034f4325 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Oct 2017 14:20:49 +0300 Subject: [PATCH 0715/1085] mmc: meson-mx-sdio: Cleanup IS_ERR() checks Using PTR_ERR_OR_ZERO() instead of IS_ERR() works, but it's not how you're supposed to write these conditions. 
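Put differently, the conventional idiom keeps the test and the conversion separate: IS_ERR() decides, PTR_ERR() extracts the errno (generic sketch):

	clk = devm_clk_register(dev, hw);
	if (IS_ERR(clk))
		return PTR_ERR(clk);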
Signed-off-by: Dan Carpenter Acked-by: Martin Blumenstingl Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-mx-sdio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/meson-mx-sdio.c b/drivers/mmc/host/meson-mx-sdio.c index dc30ed5e964b..d0d921e0e589 100644 --- a/drivers/mmc/host/meson-mx-sdio.c +++ b/drivers/mmc/host/meson-mx-sdio.c @@ -602,7 +602,7 @@ static int meson_mx_mmc_register_clks(struct meson_mx_mmc_host *host) host->fixed_factor_clk = devm_clk_register(host->controller_dev, &host->fixed_factor.hw); - if (WARN_ON(PTR_ERR_OR_ZERO(host->fixed_factor_clk))) + if (WARN_ON(IS_ERR(host->fixed_factor_clk))) return PTR_ERR(host->fixed_factor_clk); clk_div_parent = __clk_get_name(host->fixed_factor_clk); @@ -620,7 +620,7 @@ static int meson_mx_mmc_register_clks(struct meson_mx_mmc_host *host) host->cfg_div_clk = devm_clk_register(host->controller_dev, &host->cfg_div.hw); - if (WARN_ON(PTR_ERR_OR_ZERO(host->cfg_div_clk))) + if (WARN_ON(IS_ERR(host->cfg_div_clk))) return PTR_ERR(host->cfg_div_clk); return 0; From f02cebdfe91a3818d4d84f85dba5a7321e5a37b83 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 13 Oct 2017 21:50:31 +0900 Subject: [PATCH 0716/1085] mmc: sdhci-cadence: use bitfield access macros for cleanup Accessing register fields generally needs mask and shift parts. Defining them separately, like SDHCI_CDNS_HRS06_TUNE_{SHIFT,MASK}, is tedious. Register fields can always be defined by GENMASK (or BIT if it is a single bit). They are nicely handled by FIELD_* macros. Signed-off-by: Masahiro Yamada Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-cadence.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/host/sdhci-cadence.c b/drivers/mmc/host/sdhci-cadence.c index 56529c3d389a..0f589e26ee63 100644 --- a/drivers/mmc/host/sdhci-cadence.c +++ b/drivers/mmc/host/sdhci-cadence.c @@ -13,6 +13,7 @@ * GNU General Public License for more details.
*/ +#include #include #include #include @@ -27,15 +28,14 @@ #define SDHCI_CDNS_HRS04_ACK BIT(26) #define SDHCI_CDNS_HRS04_RD BIT(25) #define SDHCI_CDNS_HRS04_WR BIT(24) -#define SDHCI_CDNS_HRS04_RDATA_SHIFT 16 -#define SDHCI_CDNS_HRS04_WDATA_SHIFT 8 -#define SDHCI_CDNS_HRS04_ADDR_SHIFT 0 +#define SDHCI_CDNS_HRS04_RDATA GENMASK(23, 16) +#define SDHCI_CDNS_HRS04_WDATA GENMASK(15, 8) +#define SDHCI_CDNS_HRS04_ADDR GENMASK(5, 0) #define SDHCI_CDNS_HRS06 0x18 /* eMMC control */ #define SDHCI_CDNS_HRS06_TUNE_UP BIT(15) -#define SDHCI_CDNS_HRS06_TUNE_SHIFT 8 -#define SDHCI_CDNS_HRS06_TUNE_MASK 0x3f -#define SDHCI_CDNS_HRS06_MODE_MASK 0x7 +#define SDHCI_CDNS_HRS06_TUNE GENMASK(13, 8) +#define SDHCI_CDNS_HRS06_MODE GENMASK(2, 0) #define SDHCI_CDNS_HRS06_MODE_SD 0x0 #define SDHCI_CDNS_HRS06_MODE_MMC_SDR 0x2 #define SDHCI_CDNS_HRS06_MODE_MMC_DDR 0x3 @@ -105,8 +105,8 @@ static int sdhci_cdns_write_phy_reg(struct sdhci_cdns_priv *priv, u32 tmp; int ret; - tmp = (data << SDHCI_CDNS_HRS04_WDATA_SHIFT) | - (addr << SDHCI_CDNS_HRS04_ADDR_SHIFT); + tmp = FIELD_PREP(SDHCI_CDNS_HRS04_WDATA, data) | + FIELD_PREP(SDHCI_CDNS_HRS04_ADDR, addr); writel(tmp, reg); tmp |= SDHCI_CDNS_HRS04_WR; @@ -189,8 +189,8 @@ static void sdhci_cdns_set_emmc_mode(struct sdhci_cdns_priv *priv, u32 mode) /* The speed mode for eMMC is selected by HRS06 register */ tmp = readl(priv->hrs_addr + SDHCI_CDNS_HRS06); - tmp &= ~SDHCI_CDNS_HRS06_MODE_MASK; - tmp |= mode; + tmp &= ~SDHCI_CDNS_HRS06_MODE; + tmp |= FIELD_PREP(SDHCI_CDNS_HRS06_MODE, mode); writel(tmp, priv->hrs_addr + SDHCI_CDNS_HRS06); } @@ -199,7 +199,7 @@ static u32 sdhci_cdns_get_emmc_mode(struct sdhci_cdns_priv *priv) u32 tmp; tmp = readl(priv->hrs_addr + SDHCI_CDNS_HRS06); - return tmp & SDHCI_CDNS_HRS06_MODE_MASK; + return FIELD_GET(SDHCI_CDNS_HRS06_MODE, tmp); } static void sdhci_cdns_set_uhs_signaling(struct sdhci_host *host, @@ -254,12 +254,12 @@ static int sdhci_cdns_set_tune_val(struct sdhci_host *host, unsigned int val) void __iomem *reg = priv->hrs_addr + SDHCI_CDNS_HRS06; u32 tmp; - if (WARN_ON(val > SDHCI_CDNS_HRS06_TUNE_MASK)) + if (WARN_ON(!FIELD_FIT(SDHCI_CDNS_HRS06_TUNE, val))) return -EINVAL; tmp = readl(reg); - tmp &= ~(SDHCI_CDNS_HRS06_TUNE_MASK << SDHCI_CDNS_HRS06_TUNE_SHIFT); - tmp |= val << SDHCI_CDNS_HRS06_TUNE_SHIFT; + tmp &= ~SDHCI_CDNS_HRS06_TUNE; + tmp |= FIELD_PREP(SDHCI_CDNS_HRS06_TUNE, val); tmp |= SDHCI_CDNS_HRS06_TUNE_UP; writel(tmp, reg); From 1f90e9a38c469e5abc1c7e06a8dd33dd9d2b922d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:09 +0200 Subject: [PATCH 0717/1085] mmc: add kerneldoc to mmc_regulator_get_supply() Especially, make clear what the return value means. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 12b271c2a912..1f0f44f4dd5f 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1588,6 +1588,16 @@ EXPORT_SYMBOL_GPL(mmc_regulator_set_vqmmc); #endif /* CONFIG_REGULATOR */ +/** + * mmc_regulator_get_supply - try to get VMMC and VQMMC regulators for a host + * @mmc: the host to regulate + * + * Returns 0 or errno. errno should be handled, it is either a critical error + * or -EPROBE_DEFER. 0 means no critical error but it does not mean all + * regulators have been found because they all are optional. If you require + * certain regulators, you need to check separately in your driver if they got + * populated after calling this function. 
+ */ int mmc_regulator_get_supply(struct mmc_host *mmc) { struct device *dev = mmc_dev(mmc); From 10b0b012d2f7b0bf56588cd57fdb0ddc49c63e8b Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:10 +0200 Subject: [PATCH 0718/1085] mmc: cavium: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium.c b/drivers/mmc/host/cavium.c index fbd29f00fca0..ed5cefb83768 100644 --- a/drivers/mmc/host/cavium.c +++ b/drivers/mmc/host/cavium.c @@ -967,7 +967,7 @@ static int cvm_mmc_of_parse(struct device *dev, struct cvm_mmc_slot *slot) } ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) return ret; /* * Legacy Octeon firmware has no regulator entry, fall-back to From 0f3a47b80090cc4843f3ea6dea5c02b4f559698f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:11 +0200 Subject: [PATCH 0719/1085] mmc: dw_mmc: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 45289c5e0295..20d66f818909 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -2741,7 +2741,7 @@ static int dw_mci_init_slot(struct dw_mci *host) /*if there are external regulators, get them*/ ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) goto err_host_allocated; if (!mmc->ocr_avail) From fa54f3e359d6d007f6c3f0f990c35d10a46b9919 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:12 +0200 Subject: [PATCH 0720/1085] mmc: meson-gx-mmc: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. 
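All of these conversions land on the same shape (a sketch; the failure label and device pointer vary per driver):

	ret = mmc_regulator_get_supply(mmc);
	if (ret)	/* now bails on any errno, not only -EPROBE_DEFER */
		goto free_host;

	/* ret == 0 still does not guarantee regulators exist... */
	if (IS_ERR(mmc->supply.vmmc))
		dev_dbg(mmc_dev(mmc), "no vmmc regulator found\n");	/* ...so check separately if required */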
Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 85745ef179e2..e0862d3f65b3 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -1190,7 +1190,7 @@ static int meson_mmc_probe(struct platform_device *pdev) /* Get regulators and the supported OCR mask */ host->vqmmc_enabled = false; ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) goto free_host; ret = mmc_of_parse(mmc); From aa5754c7d169b7ff5da9f97ead2327ba62565237 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:13 +0200 Subject: [PATCH 0721/1085] mmc: meson-mx-sdio: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-mx-sdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/meson-mx-sdio.c b/drivers/mmc/host/meson-mx-sdio.c index d0d921e0e589..ce1d9216b5a7 100644 --- a/drivers/mmc/host/meson-mx-sdio.c +++ b/drivers/mmc/host/meson-mx-sdio.c @@ -551,7 +551,7 @@ static int meson_mx_mmc_add_host(struct meson_mx_mmc_host *host) /* Get regulators and the supported OCR mask */ ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) return ret; mmc->max_req_size = MESON_MX_SDIO_BOUNCE_REQ_SIZE; From 510069527bd8dddea719bb88a28ce9cb5d0c8378 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:14 +0200 Subject: [PATCH 0722/1085] mmc: mmci: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index f1f54a818489..e8a1bb1ae694 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1658,7 +1658,7 @@ static int mmci_probe(struct amba_device *dev, /* Get regulators and the supported OCR mask */ ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) goto clk_disable; if (!mmc->ocr_avail) From 2f98ef63b19004a114ce3b6d659176069bb3b96d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:15 +0200 Subject: [PATCH 0723/1085] mmc: mtk-sd: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. 
But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 267f7ab08420..7c518a560089 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -1641,7 +1641,7 @@ static int msdc_drv_probe(struct platform_device *pdev) } ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) goto host_free; host->src_clk = devm_clk_get(&pdev->dev, "source"); From 337d7c8a4ee7c966aba8e246d1b3096b03673a46 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:16 +0200 Subject: [PATCH 0724/1085] mmc: mxcmmc: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mxcmmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index 328484b96620..8bd9f060be2a 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -1075,7 +1075,7 @@ static int mxcmci_probe(struct platform_device *pdev) dat3_card_detect = true; ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) goto out_free; if (!mmc->ocr_avail) { From 3b649a73692464cbd48eeb9785eeda9eaff5b5e5 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:17 +0200 Subject: [PATCH 0725/1085] mmc: omap_hsmmc: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap_hsmmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index f2a1137be8bc..071693ebfe18 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -445,7 +445,7 @@ static int omap_hsmmc_reg_get(struct omap_hsmmc_host *host) ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) return ret; /* Allow an aux regulator */ From 2a63303d9b87cfc3f08144e54daab718a8173e15 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:18 +0200 Subject: [PATCH 0726/1085] mmc: sdhci: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. 
This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 0d5fcca18c9e..1052b52045b1 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -3238,7 +3238,7 @@ int sdhci_setup_host(struct sdhci_host *host) * available. */ ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) return ret; DBG("Version: 0x%08x | Present: 0x%08x\n", From 2d87ddd7b63e276a2e0a11788fce9e91c4d23a86 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 14 Oct 2017 21:17:19 +0200 Subject: [PATCH 0727/1085] mmc: usdhi6rol0: catch all errors when getting regulators Bail out everytime when mmc_regulator_get_supply() returns an errno, not only when probing gets deferred. This is currently a no-op, because this function only returns -EPROBE_DEFER or 0 right now. But if it will throw another error somewhen, it will be for a reason. (This still doesn't change that getting regulators is optional, so 0 can still mean no regulators found). So, let us a) be future proof and b) have driver code which is easier to understand. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/usdhi6rol0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 64da6a88cfb9..cdfeb15b6f05 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -1757,7 +1757,7 @@ static int usdhi6_probe(struct platform_device *pdev) return -ENOMEM; ret = mmc_regulator_get_supply(mmc); - if (ret == -EPROBE_DEFER) + if (ret) goto e_free_mmc; ret = mmc_of_parse(mmc); From f7834cbd907abb5d4d84f85dba5a7321e5a37b83 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 15 Oct 2017 14:46:13 +0200 Subject: [PATCH 0728/1085] dt-bindings: mmc: describe new eMMC binding for fixed driver type Some boards may have to use a certain driver type (or drive strength) to achieve stable eMMC communication. Describe a binding to set this up via DT. Signed-off-by: Wolfram Sang Acked-by: Rob Herring Reviewed-by: Simon Horman Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/mmc.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/mmc.txt b/Documentation/devicetree/bindings/mmc/mmc.txt index b32ade645ad9..94a90b49a692 100644 --- a/Documentation/devicetree/bindings/mmc/mmc.txt +++ b/Documentation/devicetree/bindings/mmc/mmc.txt @@ -53,6 +53,9 @@ Optional properties: - no-sdio: controller is limited to send sdio cmd during initialization - no-sd: controller is limited to send sd cmd during initialization - no-mmc: controller is limited to send mmc cmd during initialization +- fixed-emmc-driver-type: for non-removable eMMC, enforce this driver type. + The value is the driver type as specified in the eMMC specification + (table 206 in spec version 5.1). *NOTE* on CD and WP polarity. 
To use common for all SD/MMC host controllers line polarity properties, we have to fix the meaning of the "normal" and "inverted" From 6186d06c519e217351fac95b545f334f8582af90 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 15 Oct 2017 14:46:14 +0200 Subject: [PATCH 0729/1085] mmc: parse new binding for eMMC fixed driver type Parse the new binding and store it in the host struct after doing some sanity checks. The code is designed to support fixed SD driver type if we ever need that. Signed-off-by: Wolfram Sang Reviewed-by: Simon Horman Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 13 ++++++++++++- drivers/mmc/core/mmc.c | 11 ++++++++--- include/linux/mmc/host.h | 2 ++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index e58be39b1568..35a9e4fd1a9f 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -179,7 +179,7 @@ static void mmc_retune_timer(unsigned long data) int mmc_of_parse(struct mmc_host *host) { struct device *dev = host->parent; - u32 bus_width; + u32 bus_width, drv_type; int ret; bool cd_cap_invert, cd_gpio_invert = false; bool ro_cap_invert, ro_gpio_invert = false; @@ -321,6 +321,15 @@ int mmc_of_parse(struct mmc_host *host) if (device_property_read_bool(dev, "no-mmc")) host->caps2 |= MMC_CAP2_NO_MMC; + /* Must be after "non-removable" check */ + if (device_property_read_u32(dev, "fixed-emmc-driver-type", &drv_type) == 0) { + if (host->caps & MMC_CAP_NONREMOVABLE) + host->fixed_drv_type = drv_type; + else + dev_err(host->parent, + "can't use fixed driver type, media is removable\n"); + } + host->dsr_req = !device_property_read_u32(dev, "dsr", &host->dsr); if (host->dsr_req && (host->dsr & ~0xffff)) { dev_err(host->parent, @@ -393,6 +402,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->max_blk_size = 512; host->max_blk_count = PAGE_SIZE / 512; + host->fixed_drv_type = -EINVAL; + return host; } diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 066efbb89483..a552f61060d2 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1291,13 +1291,18 @@ out_err: static void mmc_select_driver_type(struct mmc_card *card) { int card_drv_type, drive_strength, drv_type; + int fixed_drv_type = card->host->fixed_drv_type; card_drv_type = card->ext_csd.raw_driver_strength | mmc_driver_type_mask(0); - drive_strength = mmc_select_drive_strength(card, - card->ext_csd.hs200_max_dtr, - card_drv_type, &drv_type); + if (fixed_drv_type >= 0) + drive_strength = card_drv_type & mmc_driver_type_mask(fixed_drv_type) + ? 
fixed_drv_type : 0; + else + drive_strength = mmc_select_drive_strength(card, + card->ext_csd.hs200_max_dtr, + card_drv_type, &drv_type); card->drive_strength = drive_strength; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index c296f4351c1d..e7743eca1021 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -354,6 +354,8 @@ struct mmc_host { #define MMC_CAP2_CQE (1 << 23) /* Has eMMC command queue engine */ #define MMC_CAP2_CQE_DCMD (1 << 24) /* CQE can issue a direct command */ + int fixed_drv_type; /* fixed driver type for non-removable media */ + mmc_pm_flag_t pm_caps; /* supported pm features */ /* host specific block data */ From 95e91ade6900547a74ac8e3ce35213aacfbdd0d3 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 18 Oct 2017 09:00:21 +0200 Subject: [PATCH 0730/1085] dt-bindings: mmc: renesas_sdhi: provide example in bindings documentation Provide an example of the usage of the DT bindings for TMIO in their documentation. The example given is for the r8a7790 (R-Car H2). Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Acked-by: Wolfram Sang Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/tmio_mmc.txt | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt index 54ef642f23a0..b63392d9cc47 100644 --- a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt +++ b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt @@ -43,3 +43,61 @@ Optional properties: - pinctrl-names: should be "default", "state_uhs" - pinctrl-0: should contain default/high speed pin ctrl - pinctrl-1: should contain uhs mode pin ctrl + +Example: R8A7790 (R-Car H2) SDHI controller nodes + + sdhi0: sd@ee100000 { + compatible = "renesas,sdhi-r8a7790"; + reg = <0 0xee100000 0 0x328>; + interrupts = ; + clocks = <&cpg CPG_MOD 314>; + dmas = <&dmac0 0xcd>, <&dmac0 0xce>, + <&dmac1 0xcd>, <&dmac1 0xce>; + dma-names = "tx", "rx", "tx", "rx"; + max-frequency = <195000000>; + power-domains = <&sysc R8A7790_PD_ALWAYS_ON>; + resets = <&cpg 314>; + status = "disabled"; + }; + + sdhi1: sd@ee120000 { + compatible = "renesas,sdhi-r8a7790"; + reg = <0 0xee120000 0 0x328>; + interrupts = ; + clocks = <&cpg CPG_MOD 313>; + dmas = <&dmac0 0xc9>, <&dmac0 0xca>, + <&dmac1 0xc9>, <&dmac1 0xca>; + dma-names = "tx", "rx", "tx", "rx"; + max-frequency = <195000000>; + power-domains = <&sysc R8A7790_PD_ALWAYS_ON>; + resets = <&cpg 313>; + status = "disabled"; + }; + + sdhi2: sd@ee140000 { + compatible = "renesas,sdhi-r8a7790"; + reg = <0 0xee140000 0 0x100>; + interrupts = ; + clocks = <&cpg CPG_MOD 312>; + dmas = <&dmac0 0xc1>, <&dmac0 0xc2>, + <&dmac1 0xc1>, <&dmac1 0xc2>; + dma-names = "tx", "rx", "tx", "rx"; + max-frequency = <97500000>; + power-domains = <&sysc R8A7790_PD_ALWAYS_ON>; + resets = <&cpg 312>; + status = "disabled"; + }; + + sdhi3: sd@ee160000 { + compatible = "renesas,sdhi-r8a7790"; + reg = <0 0xee160000 0 0x100>; + interrupts = ; + clocks = <&cpg CPG_MOD 311>; + dmas = <&dmac0 0xd3>, <&dmac0 0xd4>, + <&dmac1 0xd3>, <&dmac1 0xd4>; + dma-names = "tx", "rx", "tx", "rx"; + max-frequency = <97500000>; + power-domains = <&sysc R8A7790_PD_ALWAYS_ON>; + resets = <&cpg 311>; + status = "disabled"; + }; From 54839d012d5f98cde2fa102fdcd22e1da661d138 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 18 Oct 2017 09:00:22 +0200 Subject: [PATCH 0731/1085] dt-bindings: mmc: renesas_sdhi: add R-Car Gen[123] fallback compatibility strings Add fallback 
compatibility strings for R-Car Gen 1, 2 and 3. In the case of Renesas R-Car hardware we know that there are generations of SoCs, f.e. Gen 1 and 2. But beyond that its not clear what the relationship between IP blocks might be. For example, I believe that r8a7790 is older than r8a7791 but that doesn't imply that the latter is a descendant of the former or vice versa. We can, however, by examining the documentation and behaviour of the hardware at run-time observe that the current driver implementation appears to be compatible with the IP blocks on SoCs within a given generation. For the above reasons and convenience when enabling new SoCs a per-generation fallback compatibility string scheme is being adopted for drivers for Renesas SoCs. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Acked-by: Wolfram Sang Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/tmio_mmc.txt | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt index b63392d9cc47..3c6762430fd9 100644 --- a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt +++ b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt @@ -10,7 +10,7 @@ described in mmc.txt, can be used. Additionally the following tmio_mmc-specific optional bindings can be used. Required properties: -- compatible: "renesas,sdhi-shmobile" - a generic sh-mobile SDHI unit +- compatible: should contain one or more of the following: "renesas,sdhi-sh73a0" - SDHI IP on SH73A0 SoC "renesas,sdhi-r7s72100" - SDHI IP on R7S72100 SoC "renesas,sdhi-r8a73a4" - SDHI IP on R8A73A4 SoC @@ -26,6 +26,16 @@ Required properties: "renesas,sdhi-r8a7794" - SDHI IP on R8A7794 SoC "renesas,sdhi-r8a7795" - SDHI IP on R8A7795 SoC "renesas,sdhi-r8a7796" - SDHI IP on R8A7796 SoC + "renesas,sdhi-shmobile" - a generic sh-mobile SDHI controller + "renesas,rcar-gen1-sdhi" - a generic R-Car Gen1 SDHI controller + "renesas,rcar-gen2-sdhi" - a generic R-Car Gen2 or RZ/G1 + SDHI controller + "renesas,rcar-gen3-sdhi" - a generic R-Car Gen3 SDHI controller + + + When compatible with the generic version, nodes must list + the SoC-specific version corresponding to the platform + first followed by the generic version. - clocks: Most controllers only have 1 clock source per channel. 
However, on some variations of this controller, the internal card detection @@ -47,7 +57,7 @@ Optional properties: Example: R8A7790 (R-Car H2) SDHI controller nodes sdhi0: sd@ee100000 { - compatible = "renesas,sdhi-r8a7790"; + compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi"; reg = <0 0xee100000 0 0x328>; interrupts = ; clocks = <&cpg CPG_MOD 314>; @@ -61,7 +71,7 @@ Example: R8A7790 (R-Car H2) SDHI controller nodes }; sdhi1: sd@ee120000 { - compatible = "renesas,sdhi-r8a7790"; + compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi"; reg = <0 0xee120000 0 0x328>; interrupts = ; clocks = <&cpg CPG_MOD 313>; @@ -75,7 +85,7 @@ Example: R8A7790 (R-Car H2) SDHI controller nodes }; sdhi2: sd@ee140000 { - compatible = "renesas,sdhi-r8a7790"; + compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi"; reg = <0 0xee140000 0 0x100>; interrupts = ; clocks = <&cpg CPG_MOD 312>; @@ -89,7 +99,7 @@ Example: R8A7790 (R-Car H2) SDHI controller nodes }; sdhi3: sd@ee160000 { - compatible = "renesas,sdhi-r8a7790"; + compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi"; reg = <0 0xee160000 0 0x100>; interrupts = ; clocks = <&cpg CPG_MOD 311>; From d6dc425ae595e14026beac3720e43edd70215dc8 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 18 Oct 2017 09:00:23 +0200 Subject: [PATCH 0732/1085] mmc: renesas_sdhi: implement R-Car Gen[123] fallback compatibility strings Implement fallback compatibility strings for R-Car Gen 1, 2 and 3. In the case of Renesas R-Car hardware we know that there are generations of SoCs, f.e. Gen 1 and 2. But beyond that its not clear what the relationship between IP blocks might be. For example, I believe that r8a7790 is older than r8a7791 but that doesn't imply that the latter is a descendant of the former or vice versa. We can, however, by examining the documentation and behaviour of the hardware at run-time observe that the current driver implementation appears to be compatible with the IP blocks on SoCs within a given generation. For the above reasons and convenience when enabling new SoCs a per-generation fallback compatibility string scheme is being adopted for drivers for Renesas SoCs. Also, improve readability by listing the shmobile fallback compatibility string after the more-specific compatibility strings they provide a fallback for. 
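The practical upshot: a DT node for a hypothetical future Gen3 SoC can list its own string first and the generation fallback second, and the driver then needs no new entry because the generic one added below already matches (the SoC name here is invented):

	/* DT: compatible = "renesas,sdhi-r8a77xx", "renesas,rcar-gen3-sdhi"; */
	static const struct of_device_id example_of_match[] = {
		{ .compatible = "renesas,rcar-gen3-sdhi", .data = &of_rcar_gen3_compatible, },
		{ /* sentinel */ }
	};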
Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Acked-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_internal_dmac.c | 1 + drivers/mmc/host/renesas_sdhi_sys_dmac.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c index 8bae88a150fd..41cbe84c1d18 100644 --- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c @@ -88,6 +88,7 @@ static const struct renesas_sdhi_of_data of_rcar_gen3_compatible = { static const struct of_device_id renesas_sdhi_internal_dmac_of_match[] = { { .compatible = "renesas,sdhi-r8a7795", .data = &of_rcar_gen3_compatible, }, { .compatible = "renesas,sdhi-r8a7796", .data = &of_rcar_gen3_compatible, }, + { .compatible = "renesas,rcar-gen3-sdhi", .data = &of_rcar_gen3_compatible, }, {}, }; MODULE_DEVICE_TABLE(of, renesas_sdhi_internal_dmac_of_match); diff --git a/drivers/mmc/host/renesas_sdhi_sys_dmac.c b/drivers/mmc/host/renesas_sdhi_sys_dmac.c index df4465439e13..9ab10436e4b8 100644 --- a/drivers/mmc/host/renesas_sdhi_sys_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_sys_dmac.c @@ -91,7 +91,6 @@ static const struct renesas_sdhi_of_data of_rcar_gen3_compatible = { }; static const struct of_device_id renesas_sdhi_sys_dmac_of_match[] = { - { .compatible = "renesas,sdhi-shmobile" }, { .compatible = "renesas,sdhi-sh73a0", .data = &of_default_cfg, }, { .compatible = "renesas,sdhi-r8a73a4", .data = &of_default_cfg, }, { .compatible = "renesas,sdhi-r8a7740", .data = &of_default_cfg, }, @@ -107,6 +106,10 @@ static const struct of_device_id renesas_sdhi_sys_dmac_of_match[] = { { .compatible = "renesas,sdhi-r8a7794", .data = &of_rcar_gen2_compatible, }, { .compatible = "renesas,sdhi-r8a7795", .data = &of_rcar_gen3_compatible, }, { .compatible = "renesas,sdhi-r8a7796", .data = &of_rcar_gen3_compatible, }, + { .compatible = "renesas,rcar-gen1-sdhi", .data = &of_rcar_gen1_compatible, }, + { .compatible = "renesas,rcar-gen2-sdhi", .data = &of_rcar_gen2_compatible, }, + { .compatible = "renesas,rcar-gen3-sdhi", .data = &of_rcar_gen3_compatible, }, + { .compatible = "renesas,sdhi-shmobile" }, {}, }; MODULE_DEVICE_TABLE(of, renesas_sdhi_sys_dmac_of_match); From a2038497e817107f194b8c34632df6aabe50eb30 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 19 Oct 2017 13:41:43 +0300 Subject: [PATCH 0733/1085] mmc: sdhci-acpi: Use helper function acpi_device_uid() Make use of acpi_device_uid() instead of open coding. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 08ae0ff13513..e4b7fac5efb2 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -443,7 +443,7 @@ static int sdhci_acpi_probe(struct platform_device *pdev) return -ENODEV; hid = acpi_device_hid(device); - uid = device->pnp.unique_id; + uid = acpi_device_uid(device); /* Power on the SDHCI controller and its children */ acpi_device_fix_up_power(device); From 159cd328e34891898d5db686d753f9acc1f6ed73 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 19 Oct 2017 13:41:44 +0300 Subject: [PATCH 0734/1085] mmc: sdhci-acpi: Tidy Intel slot probe functions into one Tidy Intel slot probe functions into one. A single function can be used because the logic uses hid / uid as necessary to identify devices anyway. 
This gets rid of some pointless comments and checks for variables that cannot possibly be NULL, as well as giving the function a name that identifies it as specific to Intel controllers. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-acpi.c | 48 +++++------------------------------ 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index e4b7fac5efb2..640be5b618fc 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -269,53 +269,17 @@ out: return ret; } -static int sdhci_acpi_emmc_probe_slot(struct platform_device *pdev, - const char *hid, const char *uid) +static int intel_probe_slot(struct platform_device *pdev, const char *hid, + const char *uid) { struct sdhci_acpi_host *c = platform_get_drvdata(pdev); - struct sdhci_host *host; - - if (!c || !c->host) - return 0; - - host = c->host; - - /* Platform specific code during emmc probe slot goes here */ + struct sdhci_host *host = c->host; if (hid && uid && !strcmp(hid, "80860F14") && !strcmp(uid, "1") && sdhci_readl(host, SDHCI_CAPABILITIES) == 0x446cc8b2 && sdhci_readl(host, SDHCI_CAPABILITIES_1) == 0x00000807) host->timeout_clk = 1000; /* 1000 kHz i.e. 1 MHz */ - return 0; -} - -static int sdhci_acpi_sdio_probe_slot(struct platform_device *pdev, - const char *hid, const char *uid) -{ - struct sdhci_acpi_host *c = platform_get_drvdata(pdev); - - if (!c || !c->host) - return 0; - - /* Platform specific code during sdio probe slot goes here */ - - return 0; -} - -static int sdhci_acpi_sd_probe_slot(struct platform_device *pdev, - const char *hid, const char *uid) -{ - struct sdhci_acpi_host *c = platform_get_drvdata(pdev); - struct sdhci_host *host; - - if (!c || !c->host || !c->slot) - return 0; - - host = c->host; - - /* Platform specific code during sd probe slot goes here */ - if (hid && !strcmp(hid, "80865ACA")) host->mmc_host_ops.get_cd = bxt_get_cd; @@ -332,7 +296,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_emmc = { .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | SDHCI_QUIRK2_STOP_WITH_TC | SDHCI_QUIRK2_CAPS_BIT63_FOR_HS400, - .probe_slot = sdhci_acpi_emmc_probe_slot, + .probe_slot = intel_probe_slot, }; static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = { @@ -343,7 +307,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = { MMC_CAP_WAIT_WHILE_BUSY, .flags = SDHCI_ACPI_RUNTIME_PM, .pm_caps = MMC_PM_KEEP_POWER, - .probe_slot = sdhci_acpi_sdio_probe_slot, + .probe_slot = intel_probe_slot, }; static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sd = { @@ -353,7 +317,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sd = { .quirks2 = SDHCI_QUIRK2_CARD_ON_NEEDS_BUS_ON | SDHCI_QUIRK2_STOP_WITH_TC, .caps = MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_AGGRESSIVE_PM, - .probe_slot = sdhci_acpi_sd_probe_slot, + .probe_slot = intel_probe_slot, }; static const struct sdhci_acpi_slot sdhci_acpi_slot_qcom_sd_3v = { From 361eeda0ca1674e54ba66ac4435865ebbed702e0 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 19 Oct 2017 15:04:13 +0300 Subject: [PATCH 0735/1085] mmc: sdhci-pci: Tidy o2micro definitions We keep PCI Ids in sdhci-pci.h and the O2-specific definitions belong in sdhci-pci-o2micro.c. Move those definitions accordingly. Remove unused O2 definitions in sdhci-pci-core.c. The 3 o2micro external function declarations might as well be in sdhci-pci.h as well, so move them there and get rid of sdhci-pci-o2micro.h entirely. 
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-core.c | 10 ---- drivers/mmc/host/sdhci-pci-o2micro.c | 35 ++++++++++++- drivers/mmc/host/sdhci-pci-o2micro.h | 73 ---------------------------- drivers/mmc/host/sdhci-pci.h | 12 +++++ 4 files changed, 46 insertions(+), 84 deletions(-) delete mode 100644 drivers/mmc/host/sdhci-pci-o2micro.h diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index e854ee5d7f4a..3e4f04fd5175 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -32,7 +32,6 @@ #include "sdhci.h" #include "sdhci-pci.h" -#include "sdhci-pci-o2micro.h" static int sdhci_pci_enable_dma(struct sdhci_host *host); static void sdhci_pci_hw_reset(struct sdhci_host *host); @@ -798,15 +797,6 @@ static const struct sdhci_pci_fixes sdhci_intel_mrfld_mmc = { .probe_slot = intel_mrfld_mmc_probe_slot, }; -/* O2Micro extra registers */ -#define O2_SD_LOCK_WP 0xD3 -#define O2_SD_MULTI_VCC3V 0xEE -#define O2_SD_CLKREQ 0xEC -#define O2_SD_CAPS 0xE0 -#define O2_SD_ADMA1 0xE2 -#define O2_SD_ADMA2 0xE7 -#define O2_SD_INF_MOD 0xF1 - static int jmicron_pmos(struct sdhci_pci_chip *chip, int on) { u8 scratch; diff --git a/drivers/mmc/host/sdhci-pci-o2micro.c b/drivers/mmc/host/sdhci-pci-o2micro.c index 14273ca00641..555970a29c94 100644 --- a/drivers/mmc/host/sdhci-pci-o2micro.c +++ b/drivers/mmc/host/sdhci-pci-o2micro.c @@ -19,7 +19,40 @@ #include "sdhci.h" #include "sdhci-pci.h" -#include "sdhci-pci-o2micro.h" + +/* + * O2Micro device registers + */ + +#define O2_SD_MISC_REG5 0x64 +#define O2_SD_LD0_CTRL 0x68 +#define O2_SD_DEV_CTRL 0x88 +#define O2_SD_LOCK_WP 0xD3 +#define O2_SD_TEST_REG 0xD4 +#define O2_SD_FUNC_REG0 0xDC +#define O2_SD_MULTI_VCC3V 0xEE +#define O2_SD_CLKREQ 0xEC +#define O2_SD_CAPS 0xE0 +#define O2_SD_ADMA1 0xE2 +#define O2_SD_ADMA2 0xE7 +#define O2_SD_INF_MOD 0xF1 +#define O2_SD_MISC_CTRL4 0xFC +#define O2_SD_TUNING_CTRL 0x300 +#define O2_SD_PLL_SETTING 0x304 +#define O2_SD_CLK_SETTING 0x328 +#define O2_SD_CAP_REG2 0x330 +#define O2_SD_CAP_REG0 0x334 +#define O2_SD_UHS1_CAP_SETTING 0x33C +#define O2_SD_DELAY_CTRL 0x350 +#define O2_SD_UHS2_L1_CTRL 0x35C +#define O2_SD_FUNC_REG3 0x3E0 +#define O2_SD_FUNC_REG4 0x3E4 +#define O2_SD_LED_ENABLE BIT(6) +#define O2_SD_FREG0_LEDOFF BIT(13) +#define O2_SD_FREG4_ENABLE_CLK_SET BIT(22) + +#define O2_SD_VENDOR_SETTING 0x110 +#define O2_SD_VENDOR_SETTING2 0x1C8 static void o2_pci_set_baseclk(struct sdhci_pci_chip *chip, u32 value) { diff --git a/drivers/mmc/host/sdhci-pci-o2micro.h b/drivers/mmc/host/sdhci-pci-o2micro.h deleted file mode 100644 index 770f53857211..000000000000 --- a/drivers/mmc/host/sdhci-pci-o2micro.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (C) 2013 BayHub Technology Ltd. - * - * Authors: Peter Guo - * Adam Lee - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - */ - -#ifndef __SDHCI_PCI_O2MICRO_H -#define __SDHCI_PCI_O2MICRO_H - -#include "sdhci-pci.h" - -/* - * O2Micro device IDs - */ - -#define PCI_DEVICE_ID_O2_SDS0 0x8420 -#define PCI_DEVICE_ID_O2_SDS1 0x8421 -#define PCI_DEVICE_ID_O2_FUJIN2 0x8520 -#define PCI_DEVICE_ID_O2_SEABIRD0 0x8620 -#define PCI_DEVICE_ID_O2_SEABIRD1 0x8621 - -/* - * O2Micro device registers - */ - -#define O2_SD_MISC_REG5 0x64 -#define O2_SD_LD0_CTRL 0x68 -#define O2_SD_DEV_CTRL 0x88 -#define O2_SD_LOCK_WP 0xD3 -#define O2_SD_TEST_REG 0xD4 -#define O2_SD_FUNC_REG0 0xDC -#define O2_SD_MULTI_VCC3V 0xEE -#define O2_SD_CLKREQ 0xEC -#define O2_SD_CAPS 0xE0 -#define O2_SD_ADMA1 0xE2 -#define O2_SD_ADMA2 0xE7 -#define O2_SD_INF_MOD 0xF1 -#define O2_SD_MISC_CTRL4 0xFC -#define O2_SD_TUNING_CTRL 0x300 -#define O2_SD_PLL_SETTING 0x304 -#define O2_SD_CLK_SETTING 0x328 -#define O2_SD_CAP_REG2 0x330 -#define O2_SD_CAP_REG0 0x334 -#define O2_SD_UHS1_CAP_SETTING 0x33C -#define O2_SD_DELAY_CTRL 0x350 -#define O2_SD_UHS2_L1_CTRL 0x35C -#define O2_SD_FUNC_REG3 0x3E0 -#define O2_SD_FUNC_REG4 0x3E4 -#define O2_SD_LED_ENABLE BIT(6) -#define O2_SD_FREG0_LEDOFF BIT(13) -#define O2_SD_FREG4_ENABLE_CLK_SET BIT(22) - -#define O2_SD_VENDOR_SETTING 0x110 -#define O2_SD_VENDOR_SETTING2 0x1C8 - -extern int sdhci_pci_o2_probe_slot(struct sdhci_pci_slot *slot); - -extern int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip); - -extern int sdhci_pci_o2_resume(struct sdhci_pci_chip *chip); - -#endif /* __SDHCI_PCI_O2MICRO_H */ diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h index 3c1dd79fdbde..063506cc2ca9 100644 --- a/drivers/mmc/host/sdhci-pci.h +++ b/drivers/mmc/host/sdhci-pci.h @@ -5,6 +5,12 @@ * PCI device IDs, sub IDs */ +#define PCI_DEVICE_ID_O2_SDS0 0x8420 +#define PCI_DEVICE_ID_O2_SDS1 0x8421 +#define PCI_DEVICE_ID_O2_FUJIN2 0x8520 +#define PCI_DEVICE_ID_O2_SEABIRD0 0x8620 +#define PCI_DEVICE_ID_O2_SEABIRD1 0x8621 + #define PCI_DEVICE_ID_INTEL_PCH_SDIO0 0x8809 #define PCI_DEVICE_ID_INTEL_PCH_SDIO1 0x880a #define PCI_DEVICE_ID_INTEL_BYT_EMMC 0x0f14 @@ -164,4 +170,10 @@ static inline void *sdhci_pci_priv(struct sdhci_pci_slot *slot) int sdhci_pci_resume_host(struct sdhci_pci_chip *chip); #endif +int sdhci_pci_o2_probe_slot(struct sdhci_pci_slot *slot); +int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip); +#ifdef CONFIG_PM_SLEEP +int sdhci_pci_o2_resume(struct sdhci_pci_chip *chip); +#endif + #endif /* __SDHCI_PCI_H */ From d91ca84edeb04fad89faf0f723029b692f6b13b4 Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:28 +0800 Subject: [PATCH 0736/1085] mmc: dt-bindings: Add reg/source_cg/latch-ck for Mediatek MMC bindings Change the compatible strings to support multiple platforms. Make the compatible explicit, as the MMC host IP of mt8173 differs from mt8135 (mt8173 supports hs400 and hs400_tune), so separate mt8173/mt8135 compatible names are needed.
Add descriptions for reg, source_cg and mediatek,latch-ck. Note that source_cg and mediatek,latch-ck are optional for some SoCs; e.g. MT2701 does not have source_cg, and MT2712 does not need mediatek,latch-ck. Signed-off-by: Chaotian Jing Acked-by: Rob Herring Tested-by: Sean Wang Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/mtk-sd.txt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/mmc/mtk-sd.txt b/Documentation/devicetree/bindings/mmc/mtk-sd.txt index 4182ea36ca5b..72d2a734ab85 100644 --- a/Documentation/devicetree/bindings/mmc/mtk-sd.txt +++ b/Documentation/devicetree/bindings/mmc/mtk-sd.txt @@ -7,10 +7,18 @@ This file documents differences between the core properties in mmc.txt and the properties used by the msdc driver. Required properties: -- compatible: Should be "mediatek,mt8173-mmc","mediatek,mt8135-mmc" +- compatible: value should be either of the following. + "mediatek,mt8135-mmc": for mmc host ip compatible with mt8135 + "mediatek,mt8173-mmc": for mmc host ip compatible with mt8173 + "mediatek,mt2701-mmc": for mmc host ip compatible with mt2701 + "mediatek,mt2712-mmc": for mmc host ip compatible with mt2712 +- reg: physical base address of the controller and length - interrupts: Should contain MSDC interrupt number -- clocks: MSDC source clock, HCLK -- clock-names: "source", "hclk" +- clocks: Should contain phandle for the clock feeding the MMC controller +- clock-names: Should contain the following: + "source" - source clock (required) + "hclk" - HCLK which used for host (required) + "source_cg" - independent source clock gate (required for MT2712) - pinctrl-names: should be "default", "state_uhs" - pinctrl-0: should contain default/high speed pin ctrl - pinctrl-1: should contain uhs mode pin ctrl @@ -30,6 +38,10 @@ Optional properties: - mediatek,hs400-cmd-resp-sel-rising: HS400 command response sample selection If present,HS400 command responses are sampled on rising edges. If not present,HS400 command responses are sampled on falling edges. +- mediatek,latch-ck: Some SoCs do not support enhance_rx, need set correct latch-ck to avoid data crc + error caused by stop clock(fifo full) + Valid range = [0:0x7]. if not present, default value is 0. + applied to compatible "mediatek,mt2701-mmc". Examples: mmc0: mmc@11230000 { From 762d491a8bff7aea33a521ddc896a62d323df0a9 Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:29 +0800 Subject: [PATCH 0737/1085] mmc: mediatek: add support of mt2701/mt2712 mt2701/mt2712 have a 12-bit clock divider, which is not compatible with mt8135/mt8173. Also, some additional features will be added to mt2701/mt2712, so they need to be distinguished by compatible name.
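To make the divider-width difference concrete: the two hard-coded f_min branches in the hunk below (4 * 255 vs 4 * 4095) both follow from the number of divider bits. A minimal sketch of that arithmetic (msdc_min_freq() is a hypothetical helper, not part of this patch):

    #include <linux/kernel.h> /* DIV_ROUND_UP */
    #include <linux/types.h>

    /* An 8-bit divider (mt8135/mt8173) gives a maximum divisor of 255,
     * a 12-bit one (mt2701/mt2712) gives 4095; the factor of 4 matches
     * the driver's slowest clock mode.
     */
    static unsigned int msdc_min_freq(unsigned int src_clk_freq,
                                      u8 clk_div_bits)
    {
            unsigned int max_div = (1U << clk_div_bits) - 1;

            return DIV_ROUND_UP(src_clk_freq, 4 * max_div);
    }

msdc_min_freq(freq, 8) reproduces DIV_ROUND_UP(freq, 4 * 255) and msdc_min_freq(freq, 12) reproduces DIV_ROUND_UP(freq, 4 * 4095), matching the f_min settings in the probe hunk below.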
Signed-off-by: Chaotian Jing Tested-by: Sean Wang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 82 ++++++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 13 deletions(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 7c518a560089..407745ab5c0a 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -95,6 +95,9 @@ #define MSDC_CFG_CKDIV (0xff << 8) /* RW */ #define MSDC_CFG_CKMOD (0x3 << 16) /* RW */ #define MSDC_CFG_HS400_CK_MODE (0x1 << 18) /* RW */ +#define MSDC_CFG_HS400_CK_MODE_EXTRA (0x1 << 22) /* RW */ +#define MSDC_CFG_CKDIV_EXTRA (0xfff << 8) /* RW */ +#define MSDC_CFG_CKMOD_EXTRA (0x3 << 20) /* RW */ /* MSDC_IOCON mask */ #define MSDC_IOCON_SDR104CKS (0x1 << 0) /* RW */ @@ -295,6 +298,10 @@ struct msdc_save_para { u32 emmc50_cfg0; }; +struct mtk_mmc_compatible { + u8 clk_div_bits; +}; + struct msdc_tune_para { u32 iocon; u32 pad_tune; @@ -309,6 +316,7 @@ struct msdc_delay_phase { struct msdc_host { struct device *dev; + const struct mtk_mmc_compatible *dev_comp; struct mmc_host *mmc; /* mmc structure */ int cmd_rsp; @@ -350,6 +358,31 @@ struct msdc_host { struct msdc_tune_para saved_tune_para; /* tune result of CMD21/CMD19 */ }; +static const struct mtk_mmc_compatible mt8135_compat = { + .clk_div_bits = 8, +}; + +static const struct mtk_mmc_compatible mt8173_compat = { + .clk_div_bits = 8, +}; + +static const struct mtk_mmc_compatible mt2701_compat = { + .clk_div_bits = 12, +}; + +static const struct mtk_mmc_compatible mt2712_compat = { + .clk_div_bits = 12, +}; + +static const struct of_device_id msdc_of_ids[] = { + { .compatible = "mediatek,mt8135-mmc", .data = &mt8135_compat}, + { .compatible = "mediatek,mt8173-mmc", .data = &mt8173_compat}, + { .compatible = "mediatek,mt2701-mmc", .data = &mt2701_compat}, + { .compatible = "mediatek,mt2712-mmc", .data = &mt2712_compat}, + {} +}; +MODULE_DEVICE_TABLE(of, msdc_of_ids); + static void sdr_set_bits(void __iomem *reg, u32 bs) { u32 val = readl(reg); @@ -509,7 +542,12 @@ static void msdc_set_timeout(struct msdc_host *host, u32 ns, u32 clks) timeout = (ns + clk_ns - 1) / clk_ns + clks; /* in 1048576 sclk cycle unit */ timeout = (timeout + (0x1 << 20) - 1) >> 20; - sdr_get_field(host->base + MSDC_CFG, MSDC_CFG_CKMOD, &mode); + if (host->dev_comp->clk_div_bits == 8) + sdr_get_field(host->base + MSDC_CFG, + MSDC_CFG_CKMOD, &mode); + else + sdr_get_field(host->base + MSDC_CFG, + MSDC_CFG_CKMOD_EXTRA, &mode); /*DDR mode will double the clk cycles for data timeout */ timeout = mode >= 2 ? timeout * 2 : timeout; timeout = timeout > 1 ? 
timeout - 1 : 0; @@ -548,7 +586,11 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) flags = readl(host->base + MSDC_INTEN); sdr_clr_bits(host->base + MSDC_INTEN, flags); - sdr_clr_bits(host->base + MSDC_CFG, MSDC_CFG_HS400_CK_MODE); + if (host->dev_comp->clk_div_bits == 8) + sdr_clr_bits(host->base + MSDC_CFG, MSDC_CFG_HS400_CK_MODE); + else + sdr_clr_bits(host->base + MSDC_CFG, + MSDC_CFG_HS400_CK_MODE_EXTRA); if (timing == MMC_TIMING_UHS_DDR50 || timing == MMC_TIMING_MMC_DDR52 || timing == MMC_TIMING_MMC_HS400) { @@ -568,8 +610,12 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) if (timing == MMC_TIMING_MMC_HS400 && hz >= (host->src_clk_freq >> 1)) { - sdr_set_bits(host->base + MSDC_CFG, - MSDC_CFG_HS400_CK_MODE); + if (host->dev_comp->clk_div_bits == 8) + sdr_set_bits(host->base + MSDC_CFG, + MSDC_CFG_HS400_CK_MODE); + else + sdr_set_bits(host->base + MSDC_CFG, + MSDC_CFG_HS400_CK_MODE_EXTRA); sclk = host->src_clk_freq >> 1; div = 0; /* div is ignore when bit18 is set */ } @@ -587,8 +633,15 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) sclk = (host->src_clk_freq >> 2) / div; } } - sdr_set_field(host->base + MSDC_CFG, MSDC_CFG_CKMOD | MSDC_CFG_CKDIV, - (mode << 8) | div); + if (host->dev_comp->clk_div_bits == 8) + sdr_set_field(host->base + MSDC_CFG, + MSDC_CFG_CKMOD | MSDC_CFG_CKDIV, + (mode << 8) | div); + else + sdr_set_field(host->base + MSDC_CFG, + MSDC_CFG_CKMOD_EXTRA | MSDC_CFG_CKDIV_EXTRA, + (mode << 12) | div); + sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN); while (!(readl(host->base + MSDC_CFG) & MSDC_CFG_CKSTB)) cpu_relax(); @@ -1617,12 +1670,17 @@ static int msdc_drv_probe(struct platform_device *pdev) struct mmc_host *mmc; struct msdc_host *host; struct resource *res; + const struct of_device_id *of_id; int ret; if (!pdev->dev.of_node) { dev_err(&pdev->dev, "No DT found\n"); return -EINVAL; } + + of_id = of_match_node(msdc_of_ids, pdev->dev.of_node); + if (!of_id) + return -EINVAL; /* Allocate MMC host for this device */ mmc = mmc_alloc_host(sizeof(struct msdc_host), &pdev->dev); if (!mmc) @@ -1686,11 +1744,15 @@ static int msdc_drv_probe(struct platform_device *pdev) msdc_of_property_parse(pdev, host); host->dev = &pdev->dev; + host->dev_comp = of_id->data; host->mmc = mmc; host->src_clk_freq = clk_get_rate(host->src_clk); /* Set host parameters to mmc */ mmc->ops = &mt_msdc_ops; - mmc->f_min = DIV_ROUND_UP(host->src_clk_freq, 4 * 255); + if (host->dev_comp->clk_div_bits == 8) + mmc->f_min = DIV_ROUND_UP(host->src_clk_freq, 4 * 255); + else + mmc->f_min = DIV_ROUND_UP(host->src_clk_freq, 4 * 4095); mmc->caps |= MMC_CAP_ERASE | MMC_CAP_CMD23; /* MMC core transfer sizes tunable parameters */ @@ -1839,12 +1901,6 @@ static const struct dev_pm_ops msdc_dev_pm_ops = { SET_RUNTIME_PM_OPS(msdc_runtime_suspend, msdc_runtime_resume, NULL) }; -static const struct of_device_id msdc_of_ids[] = { - { .compatible = "mediatek,mt8135-mmc", }, - {} -}; -MODULE_DEVICE_TABLE(of, msdc_of_ids); - static struct platform_driver mt_msdc_driver = { .probe = msdc_drv_probe, .remove = msdc_drv_remove, From 85936eeb48e59af43b0130291027485dc19239ef Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 19 Sep 2017 22:33:00 +0900 Subject: [PATCH 0738/1085] openrisc: dts: or1ksim: Add stdout-path During reviews of the OpenRISC SMP patch series it was suggested to add stdout-path to the SMP dts file. Add stdout-path to our other dts files to be a good example. 
Cc: Geert Uytterhoeven Signed-off-by: Stafford Horne --- arch/openrisc/boot/dts/or1ksim.dts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/openrisc/boot/dts/or1ksim.dts b/arch/openrisc/boot/dts/or1ksim.dts index 5d4f9027afaf..8f41bd8c9f84 100644 --- a/arch/openrisc/boot/dts/or1ksim.dts +++ b/arch/openrisc/boot/dts/or1ksim.dts @@ -5,8 +5,13 @@ #size-cells = <1>; interrupt-parent = <&pic>; + aliases { + uart0 = &serial0; + }; + chosen { - bootargs = "console=uart,mmio,0x90000000,115200"; + bootargs = "earlycon"; + stdout-path = "uart0:115200"; }; memory@0 { From 7f340fea3a78201f8d5d14b414d6a314cb67452a Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 19 Sep 2017 22:39:31 +0900 Subject: [PATCH 0739/1085] MAINTAINERS: Add OpenRISC pic maintainer The OpenRISC team is the maintainer of the irqchip or1k-pic driver under drivers/irqchip. Signed-off-by: Stafford Horne --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d85c08956875..543d9fda92aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10032,6 +10032,7 @@ L: openrisc@lists.librecores.org W: http://openrisc.io S: Maintained F: arch/openrisc/ +F: drivers/irqchip/irq-or1k-* OPENVSWITCH M: Pravin Shelar From 00aa61d36da1969b139dc10b9b555677ebb03c42 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 19 Sep 2017 22:55:54 +0900 Subject: [PATCH 0740/1085] Documentation: Move OpenRISC docs out of arch/ The OpenRISC docs have traditionally been in arch/ but that does not seem like the correct place to be. Move them so they will be more visible to others. Also update MAINTAINERS to make sure we get notifications of changes. Signed-off-by: Stafford Horne --- arch/openrisc/README.openrisc => Documentation/openrisc/README | 0 arch/openrisc/TODO.openrisc => Documentation/openrisc/TODO | 0 MAINTAINERS | 1 + 3 files changed, 1 insertion(+) rename arch/openrisc/README.openrisc => Documentation/openrisc/README (100%) rename arch/openrisc/TODO.openrisc => Documentation/openrisc/TODO (100%) diff --git a/arch/openrisc/README.openrisc b/Documentation/openrisc/README similarity index 100% rename from arch/openrisc/README.openrisc rename to Documentation/openrisc/README diff --git a/arch/openrisc/TODO.openrisc b/Documentation/openrisc/TODO similarity index 100% rename from arch/openrisc/TODO.openrisc rename to Documentation/openrisc/TODO diff --git a/MAINTAINERS b/MAINTAINERS index 543d9fda92aa..4e2b720ca661 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10031,6 +10031,7 @@ T: git git://github.com/openrisc/linux.git L: openrisc@lists.librecores.org W: http://openrisc.io S: Maintained +F: Documentation/openrisc/ F: arch/openrisc/ F: drivers/irqchip/irq-or1k-* From e4082de21c5c4f9001b8e44b39b5e72f7ed3b2a7 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Sat, 21 Oct 2017 22:39:09 +0900 Subject: [PATCH 0741/1085] Documentation: openrisc: Updates to README Update the OpenRISC readme to provide some more up-to-date information on how to get started with OpenRISC. 
This includes: - remove references to southpole, who are no longer consulting for OpenRISC (confirmed with Jonas) - suggest QEMU instead of the old or1ksim, as QEMU is well supported - include instructions on how to get an FPGA board running Suggested-by: Pavel Machek Signed-off-by: Stafford Horne --- Documentation/openrisc/README | 67 ++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/Documentation/openrisc/README b/Documentation/openrisc/README index 072069ab5100..777a893d533d 100644 --- a/Documentation/openrisc/README +++ b/Documentation/openrisc/README @@ -7,13 +7,7 @@ target architecture, specifically, is the 32-bit OpenRISC 1000 family (or1k). For information about OpenRISC processors and ongoing development: website http://openrisc.io - -For more information about Linux on OpenRISC, please contact South Pole AB. - - email: info@southpole.se - - website: http://southpole.se - http://southpoleconsulting.com + email openrisc@lists.librecores.org --------------------------------------------------------------------- @@ -24,37 +18,54 @@ In order to build and run Linux for OpenRISC, you'll need at least a basic toolchain and, perhaps, the architectural simulator. Steps to get these bits in place are outlined here. -1) The toolchain can be obtained from openrisc.io. Instructions for building -a toolchain can be found at: +1) Toolchain -https://github.com/openrisc/tutorials +Toolchain binaries can be obtained from openrisc.io or our github releases page. +Instructions for building the different toolchains can be found on openrisc.io +or Stafford's toolchain build and release scripts. -2) or1ksim (optional) + binaries https://github.com/openrisc/or1k-gcc/releases + toolchains https://openrisc.io/software + building https://github.com/stffrdhrn/or1k-toolchain-build -or1ksim is the architectural simulator which will allow you to actually run -your OpenRISC Linux kernel if you don't have an OpenRISC processor at hand. +2) Building - git clone https://github.com/openrisc/or1ksim.git - - cd or1ksim - ./configure --prefix=$OPENRISC_PREFIX - make - make install - -3) Linux kernel - -Build the kernel as usual +Build the Linux kernel as usual make ARCH=openrisc defconfig make ARCH=openrisc -4) Run in architectural simulator +3) Running on FPGA (optional) -Grab the or1ksim platform configuration file (from the or1ksim source) and -together with your freshly built vmlinux, run your kernel with the following -incantation: +The OpenRISC community typically uses FuseSoC to manage building and programming +an SoC into an FPGA. The below is an example of programming a De0 Nano +development board with the OpenRISC SoC. During the build FPGA RTL is code +downloaded from the FuseSoC IP cores repository and built using the FPGA vendor +tools. Binaries are loaded onto the board with openocd. - sim -f arch/openrisc/or1ksim.cfg vmlinux + git clone https://github.com/olofk/fusesoc + cd fusesoc + sudo pip install -e . + + fusesoc init + fusesoc build de0_nano + fusesoc pgm de0_nano + + openocd -f interface/altera-usb-blaster.cfg \ + -f board/or1k_generic.cfg + + telnet localhost 4444 + > init + > halt; load_image vmlinux ; reset + +4) Running on a Simulator (optional) + +QEMU is a processor emulator which we recommend for simulating the OpenRISC +platform. Please follow the OpenRISC instructions on the QEMU website to get +Linux running on QEMU. You can build QEMU yourself, but your Linux distribution
likely provides binary packages to support OpenRISC. 
+ + qemu openrisc https://wiki.qemu.org/Documentation/Platforms/OpenRISC --------------------------------------------------------------------- From 642ee1c6df4c8571d8a6846a2623fb54f925ef92 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Oct 2017 13:11:15 -0300 Subject: [PATCH 0742/1085] perf script: Print information about per-event-dump files For a file generated by "perf sched record sleep 50": # perf script --per-event-dump [ perf script: Wrote 23.121 MB perf.data.sched:sched_switch.dump (206015 samples) ] [ perf script: Wrote 0.000 MB perf.data.sched:sched_stat_wait.dump (0 samples) ] [ perf script: Wrote 0.000 MB perf.data.sched:sched_stat_sleep.dump (0 samples) ] [ perf script: Wrote 0.000 MB perf.data.sched:sched_stat_iowait.dump (0 samples) ] [ perf script: Wrote 17.680 MB perf.data.sched:sched_stat_runtime.dump (129342 samples) ] [ perf script: Wrote 0.000 MB perf.data.sched:sched_process_fork.dump (24 samples) ] [ perf script: Wrote 11.328 MB perf.data.sched:sched_wakeup.dump (106770 samples) ] [ perf script: Wrote 0.000 MB perf.data.sched:sched_wakeup_new.dump (24 samples) ] [ perf script: Wrote 2.477 MB perf.data.sched:sched_migrate_task.dump (20434 samples) ] # Similar to what is generated by 'perf record'. Based-on-a-patch-by: yuzhoujian Suggested-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/r/1508921599-10832-3-git-send-email-yuzhoujian@didichuxing.com Link: http://lkml.kernel.org/n/tip-xuketkkjuk2c0qz546ypd1u7@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 77 +++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index fb5e49b3bc44..4d198f73b29a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -210,6 +210,51 @@ static struct { }, }; +struct perf_evsel_script { + char *filename; + FILE *fp; + u64 samples; +}; + +static struct perf_evsel_script *perf_evsel_script__new(struct perf_evsel *evsel, + struct perf_data_file *file) +{ + struct perf_evsel_script *es = malloc(sizeof(*es)); + + if (es != NULL) { + if (asprintf(&es->filename, "%s.%s.dump", file->path, perf_evsel__name(evsel)) < 0) + goto out_free; + es->fp = fopen(es->filename, "w"); + if (es->fp == NULL) + goto out_free_filename; + es->samples = 0; + } + + return es; +out_free_filename: + zfree(&es->filename); +out_free: + free(es); + return NULL; +} + +static void perf_evsel_script__delete(struct perf_evsel_script *es) +{ + zfree(&es->filename); + fclose(es->fp); + es->fp = NULL; + free(es); +} + +static int perf_evsel_script__fprintf(struct perf_evsel_script *es, FILE *fp) +{ + struct stat st; + + fstat(fileno(es->fp), &st); + return fprintf(fp, "[ perf script: Wrote %.3f MB %s (%" PRIu64 " samples) ]\n", + st.st_size / 1024.0 / 1024.0, es->filename, es->samples); +} + static inline int output_type(unsigned int type) { switch (type) { @@ -1439,11 +1484,14 @@ static void process_event(struct perf_script *script, struct thread *thread = al->thread; struct perf_event_attr *attr = &evsel->attr; unsigned int type = output_type(attr->type); - FILE *fp = evsel->priv; + struct perf_evsel_script *es = evsel->priv; + FILE *fp = es->fp; if (output[type].fields == 0) return; + ++es->samples; + perf_sample__fprintf_start(sample, thread, evsel, fp); if (PRINT_FIELD(PERIOD)) @@ -1896,7 +1944,7 @@ static void perf_script__fclose_per_event_dump(struct perf_script *script) 
evlist__for_each_entry(evlist, evsel) { if (!evsel->priv) break; - fclose(evsel->priv); + perf_evsel_script__delete(evsel->priv); evsel->priv = NULL; } } @@ -1906,10 +1954,7 @@ static int perf_script__fopen_per_event_dump(struct perf_script *script) struct perf_evsel *evsel; evlist__for_each_entry(script->session->evlist, evsel) { - char filename[PATH_MAX]; - snprintf(filename, sizeof(filename), "%s.%s.dump", - script->session->file->path, perf_evsel__name(evsel)); - evsel->priv = fopen(filename, "w"); + evsel->priv = perf_evsel_script__new(evsel, script->session->file); if (evsel->priv == NULL) goto out_err_fclose; } @@ -1924,16 +1969,32 @@ out_err_fclose: static int perf_script__setup_per_event_dump(struct perf_script *script) { struct perf_evsel *evsel; + static struct perf_evsel_script es_stdout; if (script->per_event_dump) return perf_script__fopen_per_event_dump(script); + es_stdout.fp = stdout; + evlist__for_each_entry(script->session->evlist, evsel) - evsel->priv = stdout; + evsel->priv = &es_stdout; return 0; } +static void perf_script__exit_per_event_dump_stats(struct perf_script *script) +{ + struct perf_evsel *evsel; + + evlist__for_each_entry(script->session->evlist, evsel) { + struct perf_evsel_script *es = evsel->priv; + + perf_evsel_script__fprintf(es, stdout); + perf_evsel_script__delete(es); + evsel->priv = NULL; + } +} + static int __cmd_script(struct perf_script *script) { int ret; @@ -1963,7 +2024,7 @@ static int __cmd_script(struct perf_script *script) ret = perf_session__process_events(script->session); if (script->per_event_dump) - perf_script__fclose_per_event_dump(script); + perf_script__exit_per_event_dump_stats(script); if (debug_mode) pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered); From 8ceb41d7e305f186543c58178d2e1fe34f708948 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 23 Jan 2017 22:07:59 +0100 Subject: [PATCH 0743/1085] perf tools: Rename struct perf_data_file to perf_data Rename struct perf_data_file to perf_data, because we will add the possibility to have multiple files under perf.data, so the 'perf_data' name fits better. 
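To make the rename concrete, here is a minimal sketch of how the API reads after this patch (write_blob() is a hypothetical helper; the perf_data__* calls and struct fields are the ones introduced in util/data.h below):

    #include <stddef.h>
    #include "util/data.h"

    /* Open an output file through the renamed perf_data API, write one
     * buffer and close it again. perf_data__open() returns 0 on success;
     * perf_data__write() returns the number of bytes written, or < 0.
     */
    static int write_blob(const char *path, void *buf, size_t size)
    {
            struct perf_data data = {
                    .path = path,
                    .mode = PERF_DATA_MODE_WRITE,
            };
            int err = 0;

            if (perf_data__open(&data))
                    return -1;

            if (perf_data__write(&data, buf, size) < 0)
                    err = -1;

            perf_data__close(&data);
            return err;
    }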
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Changbin Du Cc: David Ahern Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-39wn4d77phel3dgkzo3lyan0@git.kernel.org [ Fixup recent changes in 'perf script --per-event-dump' ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 10 ++-- tools/perf/builtin-buildid-cache.c | 8 +-- tools/perf/builtin-buildid-list.c | 8 +-- tools/perf/builtin-c2c.c | 10 ++-- tools/perf/builtin-diff.c | 18 +++--- tools/perf/builtin-evlist.c | 4 +- tools/perf/builtin-inject.c | 26 ++++----- tools/perf/builtin-kmem.c | 8 +-- tools/perf/builtin-kvm.c | 6 +- tools/perf/builtin-lock.c | 4 +- tools/perf/builtin-mem.c | 4 +- tools/perf/builtin-record.c | 50 ++++++++-------- tools/perf/builtin-report.c | 14 ++--- tools/perf/builtin-sched.c | 8 +-- tools/perf/builtin-script.c | 20 +++---- tools/perf/builtin-stat.c | 32 +++++------ tools/perf/builtin-timechart.c | 6 +- tools/perf/builtin-trace.c | 4 +- tools/perf/tests/topology.c | 10 ++-- tools/perf/util/auxtrace.c | 4 +- tools/perf/util/data-convert-bt.c | 6 +- tools/perf/util/data.c | 92 +++++++++++++++--------------- tools/perf/util/data.h | 32 +++++------ tools/perf/util/header.c | 20 +++---- tools/perf/util/intel-bts.c | 6 +- tools/perf/util/intel-pt.c | 6 +- tools/perf/util/jit.h | 2 +- tools/perf/util/jitdump.c | 10 ++-- tools/perf/util/session.c | 44 +++++++------- tools/perf/util/session.h | 4 +- 30 files changed, 238 insertions(+), 238 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index c38373195c4a..2d06be81a109 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -356,7 +356,7 @@ static int __cmd_annotate(struct perf_annotate *ann) } if (total_nr_samples == 0) { - ui__error("The %s file has no samples!\n", session->file->path); + ui__error("The %s file has no samples!\n", session->data->path); goto out; } @@ -400,7 +400,7 @@ int cmd_annotate(int argc, const char **argv) .ordering_requires_timestamps = true, }, }; - struct perf_data_file file = { + struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; struct option options[] = { @@ -410,7 +410,7 @@ int cmd_annotate(int argc, const char **argv) "only consider symbols in these dsos"), OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol", "symbol to annotate"), - OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), + OPT_BOOLEAN('f', "force", &data.force, "don't complain, do it"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('q', "quiet", &quiet, "do now show any message"), @@ -482,9 +482,9 @@ int cmd_annotate(int argc, const char **argv) if (quiet) perf_quiet_option(); - file.path = input_name; + data.path = input_name; - annotate.session = perf_session__new(&file, false, &annotate.tool); + annotate.session = perf_session__new(&data, false, &annotate.tool); if (annotate.session == NULL) return -1; diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index e3eb6240ced0..9fceae47a02e 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -311,7 +311,7 @@ int cmd_buildid_cache(int argc, const char **argv) *kcore_filename = NULL; char sbuf[STRERR_BUFSIZE]; - struct perf_data_file file = { + struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; struct perf_session *session = NULL; @@ -352,10 +352,10 @@ int cmd_buildid_cache(int argc, const char **argv) nsi = nsinfo__new(ns_id); if 
(missing_filename) { - file.path = missing_filename; - file.force = force; + data.path = missing_filename; + data.force = force; - session = perf_session__new(&file, false, NULL); + session = perf_session__new(&data, false, NULL); if (session == NULL) return -1; } diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index fdaca16e0c74..72bdc0eba990 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -50,7 +50,7 @@ static bool dso__skip_buildid(struct dso *dso, int with_hits) static int perf_session__list_build_ids(bool force, bool with_hits) { struct perf_session *session; - struct perf_data_file file = { + struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = force, @@ -63,7 +63,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits) if (filename__fprintf_build_id(input_name, stdout) > 0) goto out; - session = perf_session__new(&file, false, &build_id__mark_dso_hit_ops); + session = perf_session__new(&data, false, &build_id__mark_dso_hit_ops); if (session == NULL) return -1; @@ -71,7 +71,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits) * We take all buildids when the file contains AUX area tracing data * because we do not decode the trace because it would take too long. */ - if (!perf_data_file__is_pipe(&file) && + if (!perf_data__is_pipe(&data) && perf_header__has_feat(&session->header, HEADER_AUXTRACE)) with_hits = false; @@ -79,7 +79,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits) * in pipe-mode, the only way to get the buildids is to parse * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID */ - if (with_hits || perf_data_file__is_pipe(&file)) + if (with_hits || perf_data__is_pipe(&data)) perf_session__process_events(session); perf_session__fprintf_dsos_buildid(session, stdout, dso__skip_buildid, with_hits); diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index bb1ee22bd221..87a52d09da29 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2523,7 +2523,7 @@ static int perf_c2c__report(int argc, const char **argv) { struct perf_session *session; struct ui_progress prog; - struct perf_data_file file = { + struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; char callchain_default_opt[] = CALLCHAIN_DEFAULT_OPT; @@ -2572,8 +2572,8 @@ static int perf_c2c__report(int argc, const char **argv) if (!input_name || !strlen(input_name)) input_name = "perf.data"; - file.path = input_name; - file.force = symbol_conf.force; + data.path = input_name; + data.force = symbol_conf.force; err = setup_display(display); if (err) @@ -2591,7 +2591,7 @@ static int perf_c2c__report(int argc, const char **argv) goto out; } - session = perf_session__new(&file, 0, &c2c.tool); + session = perf_session__new(&data, 0, &c2c.tool); if (session == NULL) { pr_debug("No memory for session\n"); goto out; @@ -2611,7 +2611,7 @@ static int perf_c2c__report(int argc, const char **argv) goto out_session; /* No pipe support at the moment. 
*/ - if (perf_data_file__is_pipe(session->file)) { + if (perf_data__is_pipe(session->data)) { pr_debug("No pipe support at the moment.\n"); goto out_session; } diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 0cd4cf6a344b..5292e3d13cec 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -47,7 +47,7 @@ struct diff_hpp_fmt { struct data__file { struct perf_session *session; - struct perf_data_file file; + struct perf_data data; int idx; struct hists *hists; struct diff_hpp_fmt fmt[PERF_HPP_DIFF__MAX_INDEX]; @@ -707,7 +707,7 @@ static void data__fprintf(void) data__for_each_file(i, d) fprintf(stdout, "# [%d] %s %s\n", - d->idx, d->file.path, + d->idx, d->data.path, !d->idx ? "(Baseline)" : ""); fprintf(stdout, "#\n"); @@ -776,16 +776,16 @@ static int __cmd_diff(void) int ret = -EINVAL, i; data__for_each_file(i, d) { - d->session = perf_session__new(&d->file, false, &tool); + d->session = perf_session__new(&d->data, false, &tool); if (!d->session) { - pr_err("Failed to open %s\n", d->file.path); + pr_err("Failed to open %s\n", d->data.path); ret = -1; goto out_delete; } ret = perf_session__process_events(d->session); if (ret) { - pr_err("Failed to process %s\n", d->file.path); + pr_err("Failed to process %s\n", d->data.path); goto out_delete; } @@ -1286,11 +1286,11 @@ static int data_init(int argc, const char **argv) return -ENOMEM; data__for_each_file(i, d) { - struct perf_data_file *file = &d->file; + struct perf_data *data = &d->data; - file->path = use_default ? defaults[i] : argv[i]; - file->mode = PERF_DATA_MODE_READ, - file->force = force, + data->path = use_default ? defaults[i] : argv[i]; + data->mode = PERF_DATA_MODE_READ, + data->force = force, d->idx = i; } diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 6d210e40d611..cd79c26e16a4 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -21,14 +21,14 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details { struct perf_session *session; struct perf_evsel *pos; - struct perf_data_file file = { + struct perf_data data = { .path = file_name, .mode = PERF_DATA_MODE_READ, .force = details->force, }; bool has_tracepoint = false; - session = perf_session__new(&file, 0, NULL); + session = perf_session__new(&data, 0, NULL); if (session == NULL) return -1; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 2b8032908fb2..ac7486f6c46f 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -35,7 +35,7 @@ struct perf_inject { bool strip; bool jit_mode; const char *input_name; - struct perf_data_file output; + struct perf_data output; u64 bytes_written; u64 aux_id; struct list_head samples; @@ -52,7 +52,7 @@ static int output_bytes(struct perf_inject *inject, void *buf, size_t sz) { ssize_t size; - size = perf_data_file__write(&inject->output, buf, sz); + size = perf_data__write(&inject->output, buf, sz); if (size < 0) return -errno; @@ -154,11 +154,11 @@ static s64 perf_event__repipe_auxtrace(struct perf_tool *tool, return ret; } - if (perf_data_file__is_pipe(session->file) || !session->one_mmap) { + if (perf_data__is_pipe(session->data) || !session->one_mmap) { ret = output_bytes(inject, event, event->header.size); if (ret < 0) return ret; - ret = copy_bytes(inject, perf_data_file__fd(session->file), + ret = copy_bytes(inject, perf_data__fd(session->data), event->auxtrace.size); } else { ret = output_bytes(inject, event, @@ -637,8 +637,8 @@ static int __cmd_inject(struct 
perf_inject *inject) { int ret = -EINVAL; struct perf_session *session = inject->session; - struct perf_data_file *file_out = &inject->output; - int fd = perf_data_file__fd(file_out); + struct perf_data *data_out = &inject->output; + int fd = perf_data__fd(data_out); u64 output_data_offset; signal(SIGINT, sig_handler); @@ -693,14 +693,14 @@ static int __cmd_inject(struct perf_inject *inject) if (!inject->itrace_synth_opts.set) auxtrace_index__free(&session->auxtrace_index); - if (!file_out->is_pipe) + if (!data_out->is_pipe) lseek(fd, output_data_offset, SEEK_SET); ret = perf_session__process_events(session); if (ret) return ret; - if (!file_out->is_pipe) { + if (!data_out->is_pipe) { if (inject->build_ids) perf_header__set_feat(&session->header, HEADER_BUILD_ID); @@ -779,7 +779,7 @@ int cmd_inject(int argc, const char **argv) .mode = PERF_DATA_MODE_WRITE, }, }; - struct perf_data_file file = { + struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; int ret; @@ -801,7 +801,7 @@ int cmd_inject(int argc, const char **argv) "be more verbose (show build ids, etc)"), OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file", "kallsyms pathname"), - OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), + OPT_BOOLEAN('f', "force", &data.force, "don't complain, do it"), OPT_CALLBACK_OPTARG(0, "itrace", &inject.itrace_synth_opts, NULL, "opts", "Instruction Tracing options", itrace_parse_synth_opts), @@ -829,15 +829,15 @@ int cmd_inject(int argc, const char **argv) return -1; } - if (perf_data_file__open(&inject.output)) { + if (perf_data__open(&inject.output)) { perror("failed to create output file"); return -1; } inject.tool.ordered_events = inject.sched_stat; - file.path = inject.input_name; - inject.session = perf_session__new(&file, true, &inject.tool); + data.path = inject.input_name; + inject.session = perf_session__new(&data, true, &inject.tool); if (inject.session == NULL) return -1; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index d8f25ef8157b..d45740a3e5f1 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -1893,7 +1893,7 @@ int cmd_kmem(int argc, const char **argv) { const char * const default_slab_sort = "frag,hit,bytes"; const char * const default_page_sort = "bytes,hit"; - struct perf_data_file file = { + struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; const struct option kmem_options[] = { @@ -1909,7 +1909,7 @@ int cmd_kmem(int argc, const char **argv) "page, order, migtype, gfp", parse_sort_opt), OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt), OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"), - OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), + OPT_BOOLEAN('f', "force", &data.force, "don't complain, do it"), OPT_CALLBACK_NOOPT(0, "slab", NULL, NULL, "Analyze slab allocator", parse_slab_opt), OPT_CALLBACK_NOOPT(0, "page", NULL, NULL, "Analyze page allocator", @@ -1949,9 +1949,9 @@ int cmd_kmem(int argc, const char **argv) return __cmd_record(argc, argv); } - file.path = input_name; + data.path = input_name; - kmem_session = session = perf_session__new(&file, false, &perf_kmem); + kmem_session = session = perf_session__new(&data, false, &perf_kmem); if (session == NULL) return -1; diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 721f4f91291a..4301fc34f23c 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1067,7 +1067,7 @@ static int read_events(struct perf_kvm_stat *kvm) .namespaces = 
perf_event__process_namespaces, .ordered_events = true, }; - struct perf_data_file file = { + struct perf_data file = { .path = kvm->file_name, .mode = PERF_DATA_MODE_READ, .force = kvm->force, @@ -1358,7 +1358,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, "perf kvm stat live []", NULL }; - struct perf_data_file file = { + struct perf_data data = { .mode = PERF_DATA_MODE_WRITE, }; @@ -1432,7 +1432,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, /* * perf session */ - kvm->session = perf_session__new(&file, false, &kvm->tool); + kvm->session = perf_session__new(&data, false, &kvm->tool); if (kvm->session == NULL) { err = -1; goto out; diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index ff98652484a7..2e281f7b0fca 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -864,13 +864,13 @@ static int __cmd_report(bool display_info) .namespaces = perf_event__process_namespaces, .ordered_events = true, }; - struct perf_data_file file = { + struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = force, }; - session = perf_session__new(&file, false, &eops); + session = perf_session__new(&data, false, &eops); if (!session) { pr_err("Initializing perf session failed\n"); return -1; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 6940490bc3f9..5a4a6f8e614d 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -236,13 +236,13 @@ static int process_sample_event(struct perf_tool *tool, static int report_raw_events(struct perf_mem *mem) { - struct perf_data_file file = { + struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = mem->force, }; int ret; - struct perf_session *session = perf_session__new(&file, false, + struct perf_session *session = perf_session__new(&data, false, &mem->tool); if (session == NULL) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a6cbf1640269..0ab7dd0e4f2b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -66,7 +66,7 @@ struct record { struct perf_tool tool; struct record_opts opts; u64 bytes_written; - struct perf_data_file file; + struct perf_data data; struct auxtrace_record *itr; struct perf_evlist *evlist; struct perf_session *session; @@ -107,7 +107,7 @@ static bool switch_output_time(struct record *rec) static int record__write(struct record *rec, void *bf, size_t size) { - if (perf_data_file__write(rec->session->file, bf, size) < 0) { + if (perf_data__write(rec->session->data, bf, size) < 0) { pr_err("failed to write perf data, error: %m\n"); return -1; } @@ -173,13 +173,13 @@ static int record__process_auxtrace(struct perf_tool *tool, size_t len1, void *data2, size_t len2) { struct record *rec = container_of(tool, struct record, tool); - struct perf_data_file *file = &rec->file; + struct perf_data *data = &rec->data; size_t padding; u8 pad[8] = {0}; - if (!perf_data_file__is_pipe(file)) { + if (!perf_data__is_pipe(data)) { off_t file_offset; - int fd = perf_data_file__fd(file); + int fd = perf_data__fd(data); int err; file_offset = lseek(fd, 0, SEEK_CUR); @@ -398,10 +398,10 @@ static int process_sample_event(struct perf_tool *tool, static int process_buildids(struct record *rec) { - struct perf_data_file *file = &rec->file; + struct perf_data *data = &rec->data; struct perf_session *session = rec->session; - if (file->size == 0) + if (data->size == 0) return 0; /* @@ -544,14 +544,14 @@ static void record__init_features(struct record *rec) static void 
record__finish_output(struct record *rec) { - struct perf_data_file *file = &rec->file; - int fd = perf_data_file__fd(file); + struct perf_data *data = &rec->data; + int fd = perf_data__fd(data); - if (file->is_pipe) + if (data->is_pipe) return; rec->session->header.data_size += rec->bytes_written; - file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR); + data->size = lseek(perf_data__fd(data), 0, SEEK_CUR); if (!rec->no_buildid) { process_buildids(rec); @@ -590,7 +590,7 @@ static int record__synthesize(struct record *rec, bool tail); static int record__switch_output(struct record *rec, bool at_exit) { - struct perf_data_file *file = &rec->file; + struct perf_data *data = &rec->data; int fd, err; /* Same Size: "2015122520103046"*/ @@ -608,7 +608,7 @@ record__switch_output(struct record *rec, bool at_exit) return -EINVAL; } - fd = perf_data_file__switch(file, timestamp, + fd = perf_data__switch(data, timestamp, rec->session->header.data_offset, at_exit); if (fd >= 0 && !at_exit) { @@ -618,7 +618,7 @@ record__switch_output(struct record *rec, bool at_exit) if (!quiet) fprintf(stderr, "[ perf record: Dump %s.%s ]\n", - file->path, timestamp); + data->path, timestamp); /* Output tracking events */ if (!at_exit) { @@ -693,16 +693,16 @@ static int record__synthesize(struct record *rec, bool tail) { struct perf_session *session = rec->session; struct machine *machine = &session->machines.host; - struct perf_data_file *file = &rec->file; + struct perf_data *data = &rec->data; struct record_opts *opts = &rec->opts; struct perf_tool *tool = &rec->tool; - int fd = perf_data_file__fd(file); + int fd = perf_data__fd(data); int err = 0; if (rec->opts.tail_synthesize != tail) return 0; - if (file->is_pipe) { + if (data->is_pipe) { err = perf_event__synthesize_features( tool, session, rec->evlist, process_synthesized_event); if (err < 0) { @@ -781,7 +781,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) struct machine *machine; struct perf_tool *tool = &rec->tool; struct record_opts *opts = &rec->opts; - struct perf_data_file *file = &rec->file; + struct perf_data *data = &rec->data; struct perf_session *session; bool disabled = false, draining = false; int fd; @@ -807,20 +807,20 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGUSR2, SIG_IGN); } - session = perf_session__new(file, false, tool); + session = perf_session__new(data, false, tool); if (session == NULL) { pr_err("Perf session creation failed.\n"); return -1; } - fd = perf_data_file__fd(file); + fd = perf_data__fd(data); rec->session = session; record__init_features(rec); if (forks) { err = perf_evlist__prepare_workload(rec->evlist, &opts->target, - argv, file->is_pipe, + argv, data->is_pipe, workload_exec_failed_signal); if (err < 0) { pr_err("Couldn't run the workload!\n"); @@ -856,7 +856,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) if (!rec->evlist->nr_groups) perf_header__clear_feat(&session->header, HEADER_GROUP_DESC); - if (file->is_pipe) { + if (data->is_pipe) { err = perf_header__write_pipe(fd); if (err < 0) goto out_child; @@ -1117,8 +1117,8 @@ out_child: samples[0] = '\0'; fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n", - perf_data_file__size(file) / 1024.0 / 1024.0, - file->path, postfix, samples); + perf_data__size(data) / 1024.0 / 1024.0, + data->path, postfix, samples); } out_delete_session: @@ -1482,7 +1482,7 @@ static struct option __record_options[] = { OPT_STRING('C', "cpu", 
&record.opts.target.cpu_list, "cpu", "list of cpus to monitor"), OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"), - OPT_STRING('o', "output", &record.file.path, "file", + OPT_STRING('o', "output", &record.data.path, "file", "output file name"), OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit, &record.opts.no_inherit_set, diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f9dff652dcbd..0dc323772b5e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -257,7 +257,7 @@ static int report__setup_sample_type(struct report *rep) { struct perf_session *session = rep->session; u64 sample_type = perf_evlist__combined_sample_type(session->evlist); - bool is_pipe = perf_data_file__is_pipe(session->file); + bool is_pipe = perf_data__is_pipe(session->data); if (session->itrace_synth_opts->callchain || (!is_pipe && @@ -568,7 +568,7 @@ static int __cmd_report(struct report *rep) int ret; struct perf_session *session = rep->session; struct perf_evsel *pos; - struct perf_data_file *file = session->file; + struct perf_data *data = session->data; signal(SIGINT, sig_handler); @@ -637,7 +637,7 @@ static int __cmd_report(struct report *rep) rep->nr_entries += evsel__hists(pos)->nr_entries; if (rep->nr_entries == 0) { - ui__error("The %s file has no samples!\n", file->path); + ui__error("The %s file has no samples!\n", data->path); return 0; } @@ -879,7 +879,7 @@ int cmd_report(int argc, const char **argv) "Show inline function"), OPT_END() }; - struct perf_data_file file = { + struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; int ret = hists__init(); @@ -940,11 +940,11 @@ int cmd_report(int argc, const char **argv) input_name = "perf.data"; } - file.path = input_name; - file.force = symbol_conf.force; + data.path = input_name; + data.force = symbol_conf.force; repeat: - session = perf_session__new(&file, false, &report.tool); + session = perf_session__new(&data, false, &report.tool); if (session == NULL) return -1; diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index b7e8812ee80c..cb5410511f69 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1700,14 +1700,14 @@ static int perf_sched__read_events(struct perf_sched *sched) { "sched:sched_migrate_task", process_sched_migrate_task_event, }, }; struct perf_session *session; - struct perf_data_file file = { + struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = sched->force, }; int rc = -1; - session = perf_session__new(&file, false, &sched->tool); + session = perf_session__new(&data, false, &sched->tool); if (session == NULL) { pr_debug("No Memory for session\n"); return -1; @@ -2902,7 +2902,7 @@ static int perf_sched__timehist(struct perf_sched *sched) const struct perf_evsel_str_handler migrate_handlers[] = { { "sched:sched_migrate_task", timehist_migrate_task_event, }, }; - struct perf_data_file file = { + struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = sched->force, @@ -2930,7 +2930,7 @@ static int perf_sched__timehist(struct perf_sched *sched) symbol_conf.use_callchain = sched->show_callchain; - session = perf_session__new(&file, false, &sched->tool); + session = perf_session__new(&data, false, &sched->tool); if (session == NULL) return -ENOMEM; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 4d198f73b29a..8f8fa952b506 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -217,12 +217,12 @@ struct 
perf_evsel_script { }; static struct perf_evsel_script *perf_evsel_script__new(struct perf_evsel *evsel, - struct perf_data_file *file) + struct perf_data *data) { struct perf_evsel_script *es = malloc(sizeof(*es)); if (es != NULL) { - if (asprintf(&es->filename, "%s.%s.dump", file->path, perf_evsel__name(evsel)) < 0) + if (asprintf(&es->filename, "%s.%s.dump", data->path, perf_evsel__name(evsel)) < 0) goto out_free; es->fp = fopen(es->filename, "w"); if (es->fp == NULL) @@ -1954,7 +1954,7 @@ static int perf_script__fopen_per_event_dump(struct perf_script *script) struct perf_evsel *evsel; evlist__for_each_entry(script->session->evlist, evsel) { - evsel->priv = perf_evsel_script__new(evsel, script->session->file); + evsel->priv = perf_evsel_script__new(evsel, script->session->data); if (evsel->priv == NULL) goto out_err_fclose; } @@ -2590,14 +2590,14 @@ int find_scripts(char **scripts_array, char **scripts_path_array) char scripts_path[MAXPATHLEN], lang_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; struct perf_session *session; - struct perf_data_file file = { + struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, }; char *temp; int i = 0; - session = perf_session__new(&file, false, NULL); + session = perf_session__new(&data, false, NULL); if (!session) return -1; @@ -2875,7 +2875,7 @@ int cmd_script(int argc, const char **argv) .ordering_requires_timestamps = true, }, }; - struct perf_data_file file = { + struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; const struct option options[] = { @@ -2982,8 +2982,8 @@ int cmd_script(int argc, const char **argv) argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage, PARSE_OPT_STOP_AT_NON_OPTION); - file.path = input_name; - file.force = symbol_conf.force; + data.path = input_name; + data.force = symbol_conf.force; if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) { rec_script_path = get_script_path(argv[1], RECORD_SUFFIX); @@ -3150,7 +3150,7 @@ int cmd_script(int argc, const char **argv) if (!script_name) setup_pager(); - session = perf_session__new(&file, false, &script.tool); + session = perf_session__new(&data, false, &script.tool); if (session == NULL) return -1; @@ -3206,7 +3206,7 @@ int cmd_script(int argc, const char **argv) goto out_delete; } - input = open(file.path, O_RDONLY); /* input_name */ + input = open(data.path, O_RDONLY); /* input_name */ if (input < 0) { err = -errno; perror("failed to open file"); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 988bdfa5d832..85af6d291c06 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -175,7 +175,7 @@ static int print_free_counters_hint; struct perf_stat { bool record; - struct perf_data_file file; + struct perf_data data; struct perf_session *session; u64 bytes_written; struct perf_tool tool; @@ -253,7 +253,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) * by attr->sample_type != 0, and we can't run it on * stat sessions. 
*/ - if (!(STAT_RECORD && perf_stat.file.is_pipe)) + if (!(STAT_RECORD && perf_stat.data.is_pipe)) attr->sample_type = PERF_SAMPLE_IDENTIFIER; /* @@ -295,7 +295,7 @@ static int process_synthesized_event(struct perf_tool *tool __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) { - if (perf_data_file__write(&perf_stat.file, event, event->header.size) < 0) { + if (perf_data__write(&perf_stat.data, event, event->header.size) < 0) { pr_err("failed to write perf data, error: %m\n"); return -1; } @@ -628,7 +628,7 @@ static int __run_perf_stat(int argc, const char **argv) size_t l; int status = 0; const bool forks = (argc > 0); - bool is_pipe = STAT_RECORD ? perf_stat.file.is_pipe : false; + bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false; struct perf_evsel_config_term *err_term; if (interval) { @@ -719,10 +719,10 @@ try_again: } if (STAT_RECORD) { - int err, fd = perf_data_file__fd(&perf_stat.file); + int err, fd = perf_data__fd(&perf_stat.data); if (is_pipe) { - err = perf_header__write_pipe(perf_data_file__fd(&perf_stat.file)); + err = perf_header__write_pipe(perf_data__fd(&perf_stat.data)); } else { err = perf_session__write_header(perf_stat.session, evsel_list, fd, false); @@ -1696,7 +1696,7 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) char buf[64], *prefix = NULL; /* Do not print anything if we record to the pipe. */ - if (STAT_RECORD && perf_stat.file.is_pipe) + if (STAT_RECORD && perf_stat.data.is_pipe) return; if (interval) @@ -2406,20 +2406,20 @@ static void init_features(struct perf_session *session) static int __cmd_record(int argc, const char **argv) { struct perf_session *session; - struct perf_data_file *file = &perf_stat.file; + struct perf_data *data = &perf_stat.data; argc = parse_options(argc, argv, stat_options, stat_record_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (output_name) - file->path = output_name; + data->path = output_name; if (run_count != 1 || forever) { pr_err("Cannot use -r option with perf stat record.\n"); return -1; } - session = perf_session__new(file, false, NULL); + session = perf_session__new(data, false, NULL); if (session == NULL) { pr_err("Perf session creation failed.\n"); return -1; @@ -2477,7 +2477,7 @@ int process_stat_config_event(struct perf_tool *tool, if (st->aggr_mode != AGGR_UNSET) stat_config.aggr_mode = st->aggr_mode; - if (perf_stat.file.is_pipe) + if (perf_stat.data.is_pipe) perf_stat_init_aggr_mode(); else perf_stat_init_aggr_mode_file(st); @@ -2585,10 +2585,10 @@ static int __cmd_report(int argc, const char **argv) input_name = "perf.data"; } - perf_stat.file.path = input_name; - perf_stat.file.mode = PERF_DATA_MODE_READ; + perf_stat.data.path = input_name; + perf_stat.data.mode = PERF_DATA_MODE_READ; - session = perf_session__new(&perf_stat.file, false, &perf_stat.tool); + session = perf_session__new(&perf_stat.data, false, &perf_stat.tool); if (session == NULL) return -1; @@ -2859,7 +2859,7 @@ int cmd_stat(int argc, const char **argv) * records, but the need to suppress the kptr_restrict messages in older * tools remain -acme */ - int fd = perf_data_file__fd(&perf_stat.file); + int fd = perf_data__fd(&perf_stat.data); int err = perf_event__synthesize_kernel_mmap((void *)&perf_stat, process_synthesized_event, &perf_stat.session->machines.host); @@ -2873,7 +2873,7 @@ int cmd_stat(int argc, const char **argv) pr_err("failed to write stat round event\n"); } - if (!perf_stat.file.is_pipe) { + if (!perf_stat.data.is_pipe) { 
perf_stat.session->header.data_size += perf_stat.bytes_written; perf_session__write_header(perf_stat.session, evsel_list, fd, true); } diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 01de01ca14f2..0f79ea5e2f0f 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -1601,13 +1601,13 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name) { "syscalls:sys_exit_pselect6", process_exit_poll }, { "syscalls:sys_exit_select", process_exit_poll }, }; - struct perf_data_file file = { + struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = tchart->force, }; - struct perf_session *session = perf_session__new(&file, false, + struct perf_session *session = perf_session__new(&data, false, &tchart->tool); int ret = -EINVAL; @@ -1617,7 +1617,7 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name) symbol__init(&session->header.env); (void)perf_header__process_sections(&session->header, - perf_data_file__fd(session->file), + perf_data__fd(session->data), tchart, process_header); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 78855916f4b0..7def6947ad61 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2532,7 +2532,7 @@ static int trace__replay(struct trace *trace) const struct perf_evsel_str_handler handlers[] = { { "probe:vfs_getname", trace__vfs_getname, }, }; - struct perf_data_file file = { + struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = trace->force, @@ -2558,7 +2558,7 @@ static int trace__replay(struct trace *trace) /* add tid to output */ trace->multiple_threads = true; - session = perf_session__new(&file, false, &trace->tool); + session = perf_session__new(&data, false, &trace->tool); if (session == NULL) return -1; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 19b0561fd6f6..7536782e8495 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -29,12 +29,12 @@ static int get_temp(char *path) static int session_write_header(char *path) { struct perf_session *session; - struct perf_data_file file = { + struct perf_data data = { .path = path, .mode = PERF_DATA_MODE_WRITE, }; - session = perf_session__new(&file, false, NULL); + session = perf_session__new(&data, false, NULL); TEST_ASSERT_VAL("can't get session", session); session->evlist = perf_evlist__new_default(); @@ -46,7 +46,7 @@ static int session_write_header(char *path) session->header.data_size += DATA_SIZE; TEST_ASSERT_VAL("failed to write header", - !perf_session__write_header(session, session->evlist, file.fd, true)); + !perf_session__write_header(session, session->evlist, data.fd, true)); perf_session__delete(session); @@ -56,13 +56,13 @@ static int session_write_header(char *path) static int check_cpu_topology(char *path, struct cpu_map *map) { struct perf_session *session; - struct perf_data_file file = { + struct perf_data data = { .path = path, .mode = PERF_DATA_MODE_READ, }; int i; - session = perf_session__new(&file, false, NULL); + session = perf_session__new(&data, false, NULL); TEST_ASSERT_VAL("can't get session", session); for (i = 0; i < session->header.env.nr_cpus_avail; i++) { diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 5547457566a7..a33491416400 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -208,7 +208,7 @@ static int auxtrace_queues__grow(struct auxtrace_queues *queues, static void 
*auxtrace_copy_data(u64 size, struct perf_session *session) { - int fd = perf_data_file__fd(session->file); + int fd = perf_data__fd(session->data); void *p; ssize_t ret; @@ -305,7 +305,7 @@ static int auxtrace_queues__add_event_buffer(struct auxtrace_queues *queues, if (session->one_mmap) { buffer->data = buffer->data_offset - session->one_mmap_offset + session->one_mmap_addr; - } else if (perf_data_file__is_pipe(session->file)) { + } else if (perf_data__is_pipe(session->data)) { buffer->data = auxtrace_copy_data(buffer->size, session); if (!buffer->data) return -ENOMEM; diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 2346cecb8ea2..9fdae383a58c 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1577,7 +1577,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, struct perf_data_convert_opts *opts) { struct perf_session *session; - struct perf_data_file file = { + struct perf_data data = { .path = input, .mode = PERF_DATA_MODE_READ, .force = opts->force, @@ -1619,7 +1619,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, err = -1; /* perf.data session */ - session = perf_session__new(&file, 0, &c.tool); + session = perf_session__new(&data, 0, &c.tool); if (!session) goto free_writer; @@ -1650,7 +1650,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, fprintf(stderr, "[ perf data convert: Converted '%s' into CTF data '%s' ]\n", - file.path, path); + data.path, path); fprintf(stderr, "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples", diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 1123b30e3033..a6eea3df4c10 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -21,56 +21,56 @@ #endif #endif -static bool check_pipe(struct perf_data_file *file) +static bool check_pipe(struct perf_data *data) { struct stat st; bool is_pipe = false; - int fd = perf_data_file__is_read(file) ? + int fd = perf_data__is_read(data) ? 
STDIN_FILENO : STDOUT_FILENO; - if (!file->path) { + if (!data->path) { if (!fstat(fd, &st) && S_ISFIFO(st.st_mode)) is_pipe = true; } else { - if (!strcmp(file->path, "-")) + if (!strcmp(data->path, "-")) is_pipe = true; } if (is_pipe) - file->fd = fd; + data->fd = fd; - return file->is_pipe = is_pipe; + return data->is_pipe = is_pipe; } -static int check_backup(struct perf_data_file *file) +static int check_backup(struct perf_data *data) { struct stat st; - if (!stat(file->path, &st) && st.st_size) { + if (!stat(data->path, &st) && st.st_size) { /* TODO check errors properly */ char oldname[PATH_MAX]; snprintf(oldname, sizeof(oldname), "%s.old", - file->path); + data->path); unlink(oldname); - rename(file->path, oldname); + rename(data->path, oldname); } return 0; } -static int open_file_read(struct perf_data_file *file) +static int open_file_read(struct perf_data *data) { struct stat st; int fd; char sbuf[STRERR_BUFSIZE]; - fd = open(file->path, O_RDONLY); + fd = open(data->path, O_RDONLY); if (fd < 0) { int err = errno; - pr_err("failed to open %s: %s", file->path, + pr_err("failed to open %s: %s", data->path, str_error_r(err, sbuf, sizeof(sbuf))); - if (err == ENOENT && !strcmp(file->path, "perf.data")) + if (err == ENOENT && !strcmp(data->path, "perf.data")) pr_err(" (try 'perf record' first)"); pr_err("\n"); return -err; @@ -79,19 +79,19 @@ static int open_file_read(struct perf_data_file *file) if (fstat(fd, &st) < 0) goto out_close; - if (!file->force && st.st_uid && (st.st_uid != geteuid())) { + if (!data->force && st.st_uid && (st.st_uid != geteuid())) { pr_err("File %s not owned by current user or root (use -f to override)\n", - file->path); + data->path); goto out_close; } if (!st.st_size) { - pr_info("zero-sized file (%s), nothing to do!\n", - file->path); + pr_info("zero-sized data (%s), nothing to do!\n", + data->path); goto out_close; } - file->size = st.st_size; + data->size = st.st_size; return fd; out_close: @@ -99,93 +99,93 @@ static int open_file_read(struct perf_data_file *file) return -1; } -static int open_file_write(struct perf_data_file *file) +static int open_file_write(struct perf_data *data) { int fd; char sbuf[STRERR_BUFSIZE]; - if (check_backup(file)) + if (check_backup(data)) return -1; - fd = open(file->path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, + fd = open(data->path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, S_IRUSR|S_IWUSR); if (fd < 0) - pr_err("failed to open %s : %s\n", file->path, + pr_err("failed to open %s : %s\n", data->path, str_error_r(errno, sbuf, sizeof(sbuf))); return fd; } -static int open_file(struct perf_data_file *file) +static int open_file(struct perf_data *data) { int fd; - fd = perf_data_file__is_read(file) ? - open_file_read(file) : open_file_write(file); + fd = perf_data__is_read(data) ? + open_file_read(data) : open_file_write(data); - file->fd = fd; + data->fd = fd; return fd < 0 ? 
-1 : 0; } -int perf_data_file__open(struct perf_data_file *file) +int perf_data__open(struct perf_data *data) { - if (check_pipe(file)) + if (check_pipe(data)) return 0; - if (!file->path) - file->path = "perf.data"; + if (!data->path) + data->path = "perf.data"; - return open_file(file); + return open_file(data); } -void perf_data_file__close(struct perf_data_file *file) +void perf_data__close(struct perf_data *data) { - close(file->fd); + close(data->fd); } -ssize_t perf_data_file__write(struct perf_data_file *file, +ssize_t perf_data__write(struct perf_data *data, void *buf, size_t size) { - return writen(file->fd, buf, size); + return writen(data->fd, buf, size); } -int perf_data_file__switch(struct perf_data_file *file, +int perf_data__switch(struct perf_data *data, const char *postfix, size_t pos, bool at_exit) { char *new_filepath; int ret; - if (check_pipe(file)) + if (check_pipe(data)) return -EINVAL; - if (perf_data_file__is_read(file)) + if (perf_data__is_read(data)) return -EINVAL; - if (asprintf(&new_filepath, "%s.%s", file->path, postfix) < 0) + if (asprintf(&new_filepath, "%s.%s", data->path, postfix) < 0) return -ENOMEM; /* * Only fire a warning, don't return error, continue fill * original file. */ - if (rename(file->path, new_filepath)) - pr_warning("Failed to rename %s to %s\n", file->path, new_filepath); + if (rename(data->path, new_filepath)) + pr_warning("Failed to rename %s to %s\n", data->path, new_filepath); if (!at_exit) { - close(file->fd); - ret = perf_data_file__open(file); + close(data->fd); + ret = perf_data__open(data); if (ret < 0) goto out; - if (lseek(file->fd, pos, SEEK_SET) == (off_t)-1) { + if (lseek(data->fd, pos, SEEK_SET) == (off_t)-1) { ret = -errno; pr_debug("Failed to lseek to %zu: %s", pos, strerror(errno)); goto out; } } - ret = file->fd; + ret = data->fd; out: free(new_filepath); return ret; diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index ae510ce16cb1..a1f9d70426b1 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -8,7 +8,7 @@ enum perf_data_mode { PERF_DATA_MODE_READ, }; -struct perf_data_file { +struct perf_data { const char *path; int fd; bool is_pipe; @@ -17,43 +17,43 @@ struct perf_data_file { enum perf_data_mode mode; }; -static inline bool perf_data_file__is_read(struct perf_data_file *file) +static inline bool perf_data__is_read(struct perf_data *data) { - return file->mode == PERF_DATA_MODE_READ; + return data->mode == PERF_DATA_MODE_READ; } -static inline bool perf_data_file__is_write(struct perf_data_file *file) +static inline bool perf_data__is_write(struct perf_data *data) { - return file->mode == PERF_DATA_MODE_WRITE; + return data->mode == PERF_DATA_MODE_WRITE; } -static inline int perf_data_file__is_pipe(struct perf_data_file *file) +static inline int perf_data__is_pipe(struct perf_data *data) { - return file->is_pipe; + return data->is_pipe; } -static inline int perf_data_file__fd(struct perf_data_file *file) +static inline int perf_data__fd(struct perf_data *data) { - return file->fd; + return data->fd; } -static inline unsigned long perf_data_file__size(struct perf_data_file *file) +static inline unsigned long perf_data__size(struct perf_data *data) { - return file->size; + return data->size; } -int perf_data_file__open(struct perf_data_file *file); -void perf_data_file__close(struct perf_data_file *file); -ssize_t perf_data_file__write(struct perf_data_file *file, +int perf_data__open(struct perf_data *data); +void perf_data__close(struct perf_data *data); +ssize_t 
perf_data__write(struct perf_data *data, void *buf, size_t size); /* * If at_exit is set, only rename current perf.data to - * perf.data., continue write on original file. + * perf.data., continue write on original data. * Set at_exit when flushing the last output. * * Return value is fd of new output. */ -int perf_data_file__switch(struct perf_data_file *file, +int perf_data__switch(struct perf_data *data, const char *postfix, size_t pos, bool at_exit); #endif /* __PERF_DATA_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 605bbd5404fb..d7be552b21c8 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1762,7 +1762,7 @@ process_event_desc(struct feat_fd *ff, void *data __maybe_unused) session = container_of(ff->ph, struct perf_session, header); - if (session->file->is_pipe) { + if (session->data->is_pipe) { /* Save events for reading later by print_event_desc, * since they can't be read again in pipe mode. */ ff->events = events; @@ -1771,7 +1771,7 @@ process_event_desc(struct feat_fd *ff, void *data __maybe_unused) for (evsel = events; evsel->attr.size; evsel++) perf_evlist__set_event_name(session->evlist, evsel); - if (!session->file->is_pipe) + if (!session->data->is_pipe) free_event_desc(events); return 0; @@ -2248,7 +2248,7 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full) { struct header_print_data hd; struct perf_header *header = &session->header; - int fd = perf_data_file__fd(session->file); + int fd = perf_data__fd(session->data); struct stat st; int ret, bit; @@ -2264,7 +2264,7 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full) perf_header__process_sections(header, fd, &hd, perf_file_section__fprintf_info); - if (session->file->is_pipe) + if (session->data->is_pipe) return 0; fprintf(fp, "# missing features: "); @@ -2757,7 +2757,7 @@ static int perf_header__read_pipe(struct perf_session *session) struct perf_pipe_file_header f_header; if (perf_file_header__read_pipe(&f_header, header, - perf_data_file__fd(session->file), + perf_data__fd(session->data), session->repipe) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; @@ -2860,13 +2860,13 @@ static int perf_evlist__prepare_tracepoint_events(struct perf_evlist *evlist, int perf_session__read_header(struct perf_session *session) { - struct perf_data_file *file = session->file; + struct perf_data *data = session->data; struct perf_header *header = &session->header; struct perf_file_header f_header; struct perf_file_attr f_attr; u64 f_id; int nr_attrs, nr_ids, i, j; - int fd = perf_data_file__fd(file); + int fd = perf_data__fd(data); session->evlist = perf_evlist__new(); if (session->evlist == NULL) @@ -2874,7 +2874,7 @@ int perf_session__read_header(struct perf_session *session) session->evlist->env = &header->env; session->machines.host.env = &header->env; - if (perf_data_file__is_pipe(file)) + if (perf_data__is_pipe(data)) return perf_header__read_pipe(session); if (perf_file_header__read(&f_header, header, fd) < 0) @@ -2889,7 +2889,7 @@ int perf_session__read_header(struct perf_session *session) if (f_header.data.size == 0) { pr_warning("WARNING: The %s file's data size field is 0 which is unexpected.\n" "Was the 'perf record' command properly terminated?\n", - file->path); + data->path); } nr_attrs = f_header.attrs.size / f_header.attr_size; @@ -3397,7 +3397,7 @@ int perf_event__process_tracing_data(struct perf_tool *tool __maybe_unused, struct perf_session *session) { ssize_t size_read, padding, size = 
event->tracing_data.size; - int fd = perf_data_file__fd(session->file); + int fd = perf_data__fd(session->data); off_t offset = lseek(fd, 0, SEEK_CUR); char buf[BUFSIZ]; diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 218ee2bac9a5..5325e65f9711 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -500,7 +500,7 @@ static int intel_bts_process_queue(struct intel_bts_queue *btsq, u64 *timestamp) } if (!buffer->data) { - int fd = perf_data_file__fd(btsq->bts->session->file); + int fd = perf_data__fd(btsq->bts->session->data); buffer->data = auxtrace_buffer__get_data(buffer, fd); if (!buffer->data) { @@ -664,10 +664,10 @@ static int intel_bts_process_auxtrace_event(struct perf_session *session, if (!bts->data_queued) { struct auxtrace_buffer *buffer; off_t data_offset; - int fd = perf_data_file__fd(session->file); + int fd = perf_data__fd(session->data); int err; - if (perf_data_file__is_pipe(session->file)) { + if (perf_data__is_pipe(session->data)) { data_offset = 0; } else { data_offset = lseek(fd, 0, SEEK_CUR); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index b58f9fd1e2ee..23f9ba676df0 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -271,7 +271,7 @@ next: ptq->buffer = buffer; if (!buffer->data) { - int fd = perf_data_file__fd(ptq->pt->session->file); + int fd = perf_data__fd(ptq->pt->session->data); buffer->data = auxtrace_buffer__get_data(buffer, fd); if (!buffer->data) @@ -2084,10 +2084,10 @@ static int intel_pt_process_auxtrace_event(struct perf_session *session, if (!pt->data_queued) { struct auxtrace_buffer *buffer; off_t data_offset; - int fd = perf_data_file__fd(session->file); + int fd = perf_data__fd(session->data); int err; - if (perf_data_file__is_pipe(session->file)) { + if (perf_data__is_pipe(session->data)) { data_offset = 0; } else { data_offset = lseek(fd, 0, SEEK_CUR); diff --git a/tools/perf/util/jit.h b/tools/perf/util/jit.h index 3f42ee4d2a0b..961e7a8a0e17 100644 --- a/tools/perf/util/jit.h +++ b/tools/perf/util/jit.h @@ -3,7 +3,7 @@ #include -int jit_process(struct perf_session *session, struct perf_data_file *output, +int jit_process(struct perf_session *session, struct perf_data *output, struct machine *machine, char *filename, pid_t pid, u64 *nbytes); int jit_inject_record(const char *filename); diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 9084930e1757..e7645098a323 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -29,7 +29,7 @@ #include "sane_ctype.h" struct jit_buf_desc { - struct perf_data_file *output; + struct perf_data *output; struct perf_session *session; struct machine *machine; union jr_entry *entry; @@ -60,8 +60,8 @@ struct debug_line_info { struct jit_tool { struct perf_tool tool; - struct perf_data_file output; - struct perf_data_file input; + struct perf_data output; + struct perf_data input; u64 bytes_written; }; @@ -356,7 +356,7 @@ jit_inject_event(struct jit_buf_desc *jd, union perf_event *event) { ssize_t size; - size = perf_data_file__write(jd->output, event, event->header.size); + size = perf_data__write(jd->output, event, event->header.size); if (size < 0) return -1; @@ -751,7 +751,7 @@ jit_detect(char *mmap_name, pid_t pid) int jit_process(struct perf_session *session, - struct perf_data_file *output, + struct perf_data *output, struct machine *machine, char *filename, pid_t pid, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index b3fd62f7e4c9..c09b748ab599 100644 
--- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -32,14 +32,14 @@ static int perf_session__deliver_event(struct perf_session *session, static int perf_session__open(struct perf_session *session) { - struct perf_data_file *file = session->file; + struct perf_data *data = session->data; if (perf_session__read_header(session) < 0) { pr_err("incompatible file format (rerun with -v to learn more)\n"); return -1; } - if (perf_data_file__is_pipe(file)) + if (perf_data__is_pipe(data)) return 0; if (perf_header__has_feat(&session->header, HEADER_STAT)) @@ -120,7 +120,7 @@ static int ordered_events__deliver_event(struct ordered_events *oe, session->tool, event->file_offset); } -struct perf_session *perf_session__new(struct perf_data_file *file, +struct perf_session *perf_session__new(struct perf_data *data, bool repipe, struct perf_tool *tool) { struct perf_session *session = zalloc(sizeof(*session)); @@ -134,13 +134,13 @@ struct perf_session *perf_session__new(struct perf_data_file *file, machines__init(&session->machines); ordered_events__init(&session->ordered_events, ordered_events__deliver_event); - if (file) { - if (perf_data_file__open(file)) + if (data) { + if (perf_data__open(data)) goto out_delete; - session->file = file; + session->data = data; - if (perf_data_file__is_read(file)) { + if (perf_data__is_read(data)) { if (perf_session__open(session) < 0) goto out_close; @@ -148,7 +148,7 @@ struct perf_session *perf_session__new(struct perf_data_file *file, * set session attributes that are present in perf.data * but not in pipe-mode. */ - if (!file->is_pipe) { + if (!data->is_pipe) { perf_session__set_id_hdr_size(session); perf_session__set_comm_exec(session); } @@ -157,7 +157,7 @@ struct perf_session *perf_session__new(struct perf_data_file *file, session->machines.host.env = &perf_env; } - if (!file || perf_data_file__is_write(file)) { + if (!data || perf_data__is_write(data)) { /* * In O_RDONLY mode this will be performed when reading the * kernel MMAP event, in perf_event__process_mmap(). @@ -170,7 +170,7 @@ struct perf_session *perf_session__new(struct perf_data_file *file, * In pipe-mode, evlist is empty until PERF_RECORD_HEADER_ATTR is * processed, so perf_evlist__sample_id_all is not meaningful here. 
*/ - if ((!file || !file->is_pipe) && tool && tool->ordering_requires_timestamps && + if ((!data || !data->is_pipe) && tool && tool->ordering_requires_timestamps && tool->ordered_events && !perf_evlist__sample_id_all(session->evlist)) { dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n"); tool->ordered_events = false; @@ -179,7 +179,7 @@ struct perf_session *perf_session__new(struct perf_data_file *file, return session; out_close: - perf_data_file__close(file); + perf_data__close(data); out_delete: perf_session__delete(session); out: @@ -201,8 +201,8 @@ void perf_session__delete(struct perf_session *session) perf_session__delete_threads(session); perf_env__exit(&session->header.env); machines__exit(&session->machines); - if (session->file) - perf_data_file__close(session->file); + if (session->data) + perf_data__close(session->data); free(session); } @@ -290,8 +290,8 @@ static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused, __maybe_unused) { dump_printf(": unhandled!\n"); - if (perf_data_file__is_pipe(session->file)) - skipn(perf_data_file__fd(session->file), event->auxtrace.size); + if (perf_data__is_pipe(session->data)) + skipn(perf_data__fd(session->data), event->auxtrace.size); return event->auxtrace.size; } @@ -1349,7 +1349,7 @@ static s64 perf_session__process_user_event(struct perf_session *session, { struct ordered_events *oe = &session->ordered_events; struct perf_tool *tool = session->tool; - int fd = perf_data_file__fd(session->file); + int fd = perf_data__fd(session->data); int err; dump_event(session->evlist, event, file_offset, NULL); @@ -1449,10 +1449,10 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset, goto out_parse_sample; } - if (perf_data_file__is_pipe(session->file)) + if (perf_data__is_pipe(session->data)) return -1; - fd = perf_data_file__fd(session->file); + fd = perf_data__fd(session->data); hdr_sz = sizeof(struct perf_event_header); if (buf_sz < hdr_sz) @@ -1687,7 +1687,7 @@ static int __perf_session__process_pipe_events(struct perf_session *session) { struct ordered_events *oe = &session->ordered_events; struct perf_tool *tool = session->tool; - int fd = perf_data_file__fd(session->file); + int fd = perf_data__fd(session->data); union perf_event *event; uint32_t size, cur_size = 0; void *buf = NULL; @@ -1828,7 +1828,7 @@ static int __perf_session__process_events(struct perf_session *session, { struct ordered_events *oe = &session->ordered_events; struct perf_tool *tool = session->tool; - int fd = perf_data_file__fd(session->file); + int fd = perf_data__fd(session->data); u64 head, page_offset, file_offset, file_pos, size; int err, mmap_prot, mmap_flags, map_idx = 0; size_t mmap_size; @@ -1945,13 +1945,13 @@ out_err: int perf_session__process_events(struct perf_session *session) { - u64 size = perf_data_file__size(session->file); + u64 size = perf_data__size(session->data); int err; if (perf_session__register_idle_thread(session) < 0) return -ENOMEM; - if (!perf_data_file__is_pipe(session->file)) + if (!perf_data__is_pipe(session->data)) err = __perf_session__process_events(session, session->header.data_offset, session->header.data_size, size); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 47b5e7dbcb18..cc1c5ea53c39 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -32,13 +32,13 @@ struct perf_session { void *one_mmap_addr; u64 one_mmap_offset; struct ordered_events ordered_events; - struct perf_data_file *file; + struct 
perf_data *data; struct perf_tool *tool; }; struct perf_tool; -struct perf_session *perf_session__new(struct perf_data_file *file, +struct perf_session *perf_session__new(struct perf_data *data, bool repipe, struct perf_tool *tool); void perf_session__delete(struct perf_session *session); From eae8ad8042d82775da1ddf3faa915b32854d9cf4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 23 Jan 2017 22:25:41 +0100 Subject: [PATCH 0744/1085] perf tools: Add struct perf_data_file Add struct perf_data_file to represent a single file within a perf_data struct. Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Changbin Du Cc: David Ahern Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-c3f9p4xzykr845ktqcek6p4t@git.kernel.org [ Fixup recent changes in 'perf script --per-event-dump' ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 4 +-- tools/perf/builtin-buildid-cache.c | 4 +-- tools/perf/builtin-buildid-list.c | 8 +++-- tools/perf/builtin-c2c.c | 4 +-- tools/perf/builtin-diff.c | 12 ++++---- tools/perf/builtin-evlist.c | 8 +++-- tools/perf/builtin-inject.c | 12 ++++---- tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-kvm.c | 8 +++-- tools/perf/builtin-lock.c | 8 +++-- tools/perf/builtin-mem.c | 8 +++-- tools/perf/builtin-record.c | 6 ++-- tools/perf/builtin-report.c | 6 ++-- tools/perf/builtin-sched.c | 16 ++++++---- tools/perf/builtin-script.c | 14 +++++---- tools/perf/builtin-stat.c | 6 ++-- tools/perf/builtin-timechart.c | 8 +++-- tools/perf/builtin-trace.c | 8 +++-- tools/perf/tests/topology.c | 14 +++++---- tools/perf/util/data-convert-bt.c | 8 ++--- tools/perf/util/data.c | 48 +++++++++++++++--------------- tools/perf/util/data.h | 10 +++++-- tools/perf/util/header.c | 2 +- 23 files changed, 127 insertions(+), 97 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 2d06be81a109..2d5c87578f83 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -356,7 +356,7 @@ static int __cmd_annotate(struct perf_annotate *ann) } if (total_nr_samples == 0) { - ui__error("The %s file has no samples!\n", session->data->path); + ui__error("The %s file has no samples!\n", session->data->file.path); goto out; } @@ -482,7 +482,7 @@ int cmd_annotate(int argc, const char **argv) if (quiet) perf_quiet_option(); - data.path = input_name; + data.file.path = input_name; annotate.session = perf_session__new(&data, false, &annotate.tool); if (annotate.session == NULL) diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index 9fceae47a02e..cb2453b29365 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -352,8 +352,8 @@ int cmd_buildid_cache(int argc, const char **argv) nsi = nsinfo__new(ns_id); if (missing_filename) { - data.path = missing_filename; - data.force = force; + data.file.path = missing_filename; + data.force = force; session = perf_session__new(&data, false, NULL); if (session == NULL) diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 72bdc0eba990..00099a830b0d 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -51,9 +51,11 @@ static int perf_session__list_build_ids(bool force, bool with_hits) { struct perf_session *session; struct perf_data data = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, - .force = force, + .file = { + .path = input_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = force, }; 
symbol__elf_init(); diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 87a52d09da29..9590fdcc6484 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2572,8 +2572,8 @@ static int perf_c2c__report(int argc, const char **argv) if (!input_name || !strlen(input_name)) input_name = "perf.data"; - data.path = input_name; - data.force = symbol_conf.force; + data.file.path = input_name; + data.force = symbol_conf.force; err = setup_display(display); if (err) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 5292e3d13cec..67570e6417e5 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -707,7 +707,7 @@ static void data__fprintf(void) data__for_each_file(i, d) fprintf(stdout, "# [%d] %s %s\n", - d->idx, d->data.path, + d->idx, d->data.file.path, !d->idx ? "(Baseline)" : ""); fprintf(stdout, "#\n"); @@ -778,14 +778,14 @@ static int __cmd_diff(void) data__for_each_file(i, d) { d->session = perf_session__new(&d->data, false, &tool); if (!d->session) { - pr_err("Failed to open %s\n", d->data.path); + pr_err("Failed to open %s\n", d->data.file.path); ret = -1; goto out_delete; } ret = perf_session__process_events(d->session); if (ret) { - pr_err("Failed to process %s\n", d->data.path); + pr_err("Failed to process %s\n", d->data.file.path); goto out_delete; } @@ -1288,9 +1288,9 @@ static int data_init(int argc, const char **argv) data__for_each_file(i, d) { struct perf_data *data = &d->data; - data->path = use_default ? defaults[i] : argv[i]; - data->mode = PERF_DATA_MODE_READ, - data->force = force, + data->file.path = use_default ? defaults[i] : argv[i]; + data->mode = PERF_DATA_MODE_READ, + data->force = force, d->idx = i; } diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index cd79c26e16a4..93b85dc857b6 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -22,9 +22,11 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details struct perf_session *session; struct perf_evsel *pos; struct perf_data data = { - .path = file_name, - .mode = PERF_DATA_MODE_READ, - .force = details->force, + .file = { + .path = file_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = details->force, }; bool has_tracepoint = false; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index ac7486f6c46f..91e65093d3c2 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -145,7 +145,7 @@ static s64 perf_event__repipe_auxtrace(struct perf_tool *tool, if (!inject->output.is_pipe) { off_t offset; - offset = lseek(inject->output.fd, 0, SEEK_CUR); + offset = lseek(inject->output.file.fd, 0, SEEK_CUR); if (offset == -1) return -errno; ret = auxtrace_index__auxtrace_event(&session->auxtrace_index, @@ -775,8 +775,10 @@ int cmd_inject(int argc, const char **argv) .input_name = "-", .samples = LIST_HEAD_INIT(inject.samples), .output = { - .path = "-", - .mode = PERF_DATA_MODE_WRITE, + .file = { + .path = "-", + }, + .mode = PERF_DATA_MODE_WRITE, }, }; struct perf_data data = { @@ -789,7 +791,7 @@ int cmd_inject(int argc, const char **argv) "Inject build-ids into the output stream"), OPT_STRING('i', "input", &inject.input_name, "file", "input file name"), - OPT_STRING('o', "output", &inject.output.path, "file", + OPT_STRING('o', "output", &inject.output.file.path, "file", "output file name"), OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat, "Merge sched-stat and sched-switch for getting events " @@ -836,7 +838,7 @@ int cmd_inject(int argc, 
const char **argv) inject.tool.ordered_events = inject.sched_stat; - data.path = inject.input_name; + data.file.path = inject.input_name; inject.session = perf_session__new(&data, true, &inject.tool); if (inject.session == NULL) return -1; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index d45740a3e5f1..abcab75cc5b9 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -1949,7 +1949,7 @@ int cmd_kmem(int argc, const char **argv) return __cmd_record(argc, argv); } - data.path = input_name; + data.file.path = input_name; kmem_session = session = perf_session__new(&data, false, &perf_kmem); if (session == NULL) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 4301fc34f23c..0af4c092b471 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1068,9 +1068,11 @@ static int read_events(struct perf_kvm_stat *kvm) .ordered_events = true, }; struct perf_data file = { - .path = kvm->file_name, - .mode = PERF_DATA_MODE_READ, - .force = kvm->force, + .file = { + .path = kvm->file_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = kvm->force, }; kvm->tool = eops; diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 2e281f7b0fca..81af29400b64 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -865,9 +865,11 @@ static int __cmd_report(bool display_info) .ordered_events = true, }; struct perf_data data = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, - .force = force, + .file = { + .path = input_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = force, }; session = perf_session__new(&data, false, &eops); diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 5a4a6f8e614d..f09fd1a1b813 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -237,9 +237,11 @@ static int process_sample_event(struct perf_tool *tool, static int report_raw_events(struct perf_mem *mem) { struct perf_data data = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, - .force = mem->force, + .file = { + .path = input_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = mem->force, }; int ret; struct perf_session *session = perf_session__new(&data, false, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 0ab7dd0e4f2b..f4d9fc54b382 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -618,7 +618,7 @@ record__switch_output(struct record *rec, bool at_exit) if (!quiet) fprintf(stderr, "[ perf record: Dump %s.%s ]\n", - data->path, timestamp); + data->file.path, timestamp); /* Output tracking events */ if (!at_exit) { @@ -1118,7 +1118,7 @@ out_child: fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n", perf_data__size(data) / 1024.0 / 1024.0, - data->path, postfix, samples); + data->file.path, postfix, samples); } out_delete_session: @@ -1482,7 +1482,7 @@ static struct option __record_options[] = { OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu", "list of cpus to monitor"), OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"), - OPT_STRING('o', "output", &record.data.path, "file", + OPT_STRING('o', "output", &record.data.file.path, "file", "output file name"), OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit, &record.opts.no_inherit_set, diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 0dc323772b5e..3c2d9d4932f3 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -637,7 +637,7 @@ static int 
__cmd_report(struct report *rep) rep->nr_entries += evsel__hists(pos)->nr_entries; if (rep->nr_entries == 0) { - ui__error("The %s file has no samples!\n", data->path); + ui__error("The %s file has no samples!\n", data->file.path); return 0; } @@ -940,8 +940,8 @@ int cmd_report(int argc, const char **argv) input_name = "perf.data"; } - data.path = input_name; - data.force = symbol_conf.force; + data.file.path = input_name; + data.force = symbol_conf.force; repeat: session = perf_session__new(&data, false, &report.tool); diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index cb5410511f69..47e54348b5ed 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1701,9 +1701,11 @@ static int perf_sched__read_events(struct perf_sched *sched) }; struct perf_session *session; struct perf_data data = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, - .force = sched->force, + .file = { + .path = input_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = sched->force, }; int rc = -1; @@ -2903,9 +2905,11 @@ static int perf_sched__timehist(struct perf_sched *sched) { "sched:sched_migrate_task", timehist_migrate_task_event, }, }; struct perf_data data = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, - .force = sched->force, + .file = { + .path = input_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = sched->force, }; struct perf_session *session; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8f8fa952b506..89975e30c0ba 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -222,7 +222,7 @@ static struct perf_evsel_script *perf_evsel_script__new(struct perf_evsel *evsel struct perf_evsel_script *es = malloc(sizeof(*es)); if (es != NULL) { - if (asprintf(&es->filename, "%s.%s.dump", data->path, perf_evsel__name(evsel)) < 0) + if (asprintf(&es->filename, "%s.%s.dump", data->file.path, perf_evsel__name(evsel)) < 0) goto out_free; es->fp = fopen(es->filename, "w"); if (es->fp == NULL) @@ -2591,8 +2591,10 @@ int find_scripts(char **scripts_array, char **scripts_path_array) DIR *scripts_dir, *lang_dir; struct perf_session *session; struct perf_data data = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, + .file = { + .path = input_name, + }, + .mode = PERF_DATA_MODE_READ, }; char *temp; int i = 0; @@ -2982,8 +2984,8 @@ int cmd_script(int argc, const char **argv) argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage, PARSE_OPT_STOP_AT_NON_OPTION); - data.path = input_name; - data.force = symbol_conf.force; + data.file.path = input_name; + data.force = symbol_conf.force; if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) { rec_script_path = get_script_path(argv[1], RECORD_SUFFIX); @@ -3206,7 +3208,7 @@ int cmd_script(int argc, const char **argv) goto out_delete; } - input = open(data.path, O_RDONLY); /* input_name */ + input = open(data.file.path, O_RDONLY); /* input_name */ if (input < 0) { err = -errno; perror("failed to open file"); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 85af6d291c06..fa5896270022 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2412,7 +2412,7 @@ static int __cmd_record(int argc, const char **argv) PARSE_OPT_STOP_AT_NON_OPTION); if (output_name) - data->path = output_name; + data->file.path = output_name; if (run_count != 1 || forever) { pr_err("Cannot use -r option with perf stat record.\n"); @@ -2585,8 +2585,8 @@ static int __cmd_report(int argc, const char **argv) input_name = 
"perf.data"; } - perf_stat.data.path = input_name; - perf_stat.data.mode = PERF_DATA_MODE_READ; + perf_stat.data.file.path = input_name; + perf_stat.data.mode = PERF_DATA_MODE_READ; session = perf_session__new(&perf_stat.data, false, &perf_stat.tool); if (session == NULL) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 0f79ea5e2f0f..813698a9b8c7 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -1602,9 +1602,11 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name) { "syscalls:sys_exit_select", process_exit_poll }, }; struct perf_data data = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, - .force = tchart->force, + .file = { + .path = input_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = tchart->force, }; struct perf_session *session = perf_session__new(&data, false, diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 7def6947ad61..c373f9a3e4a9 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2533,9 +2533,11 @@ static int trace__replay(struct trace *trace) { "probe:vfs_getname", trace__vfs_getname, }, }; struct perf_data data = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, - .force = trace->force, + .file = { + .path = input_name, + }, + .mode = PERF_DATA_MODE_READ, + .force = trace->force, }; struct perf_session *session; struct perf_evsel *evsel; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 7536782e8495..9bbfed51f1d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -30,8 +30,10 @@ static int session_write_header(char *path) { struct perf_session *session; struct perf_data data = { - .path = path, - .mode = PERF_DATA_MODE_WRITE, + .file = { + .path = path, + }, + .mode = PERF_DATA_MODE_WRITE, }; session = perf_session__new(&data, false, NULL); @@ -46,7 +48,7 @@ static int session_write_header(char *path) session->header.data_size += DATA_SIZE; TEST_ASSERT_VAL("failed to write header", - !perf_session__write_header(session, session->evlist, data.fd, true)); + !perf_session__write_header(session, session->evlist, data.file.fd, true)); perf_session__delete(session); @@ -57,8 +59,10 @@ static int check_cpu_topology(char *path, struct cpu_map *map) { struct perf_session *session; struct perf_data data = { - .path = path, - .mode = PERF_DATA_MODE_READ, + .file = { + .path = path, + }, + .mode = PERF_DATA_MODE_READ, }; int i; diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 9fdae383a58c..5744c12641a5 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1578,9 +1578,9 @@ int bt_convert__perf2ctf(const char *input, const char *path, { struct perf_session *session; struct perf_data data = { - .path = input, - .mode = PERF_DATA_MODE_READ, - .force = opts->force, + .file.path = input, + .mode = PERF_DATA_MODE_READ, + .force = opts->force, }; struct convert c = { .tool = { @@ -1650,7 +1650,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, fprintf(stderr, "[ perf data convert: Converted '%s' into CTF data '%s' ]\n", - data.path, path); + data.file.path, path); fprintf(stderr, "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples", diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index a6eea3df4c10..07ef56a4123c 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -28,16 +28,16 @@ static bool check_pipe(struct perf_data *data) int fd = 
perf_data__is_read(data) ? STDIN_FILENO : STDOUT_FILENO; - if (!data->path) { + if (!data->file.path) { if (!fstat(fd, &st) && S_ISFIFO(st.st_mode)) is_pipe = true; } else { - if (!strcmp(data->path, "-")) + if (!strcmp(data->file.path, "-")) is_pipe = true; } if (is_pipe) - data->fd = fd; + data->file.fd = fd; return data->is_pipe = is_pipe; } @@ -46,13 +46,13 @@ static int check_backup(struct perf_data *data) { struct stat st; - if (!stat(data->path, &st) && st.st_size) { + if (!stat(data->file.path, &st) && st.st_size) { /* TODO check errors properly */ char oldname[PATH_MAX]; snprintf(oldname, sizeof(oldname), "%s.old", - data->path); + data->file.path); unlink(oldname); - rename(data->path, oldname); + rename(data->file.path, oldname); } return 0; @@ -64,13 +64,13 @@ static int open_file_read(struct perf_data *data) int fd; char sbuf[STRERR_BUFSIZE]; - fd = open(data->path, O_RDONLY); + fd = open(data->file.path, O_RDONLY); if (fd < 0) { int err = errno; - pr_err("failed to open %s: %s", data->path, + pr_err("failed to open %s: %s", data->file.path, str_error_r(err, sbuf, sizeof(sbuf))); - if (err == ENOENT && !strcmp(data->path, "perf.data")) + if (err == ENOENT && !strcmp(data->file.path, "perf.data")) pr_err(" (try 'perf record' first)"); pr_err("\n"); return -err; @@ -81,13 +81,13 @@ static int open_file_read(struct perf_data *data) if (!data->force && st.st_uid && (st.st_uid != geteuid())) { pr_err("File %s not owned by current user or root (use -f to override)\n", - data->path); + data->file.path); goto out_close; } if (!st.st_size) { pr_info("zero-sized data (%s), nothing to do!\n", - data->path); + data->file.path); goto out_close; } @@ -107,11 +107,11 @@ static int open_file_write(struct perf_data *data) if (check_backup(data)) return -1; - fd = open(data->path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, + fd = open(data->file.path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, S_IRUSR|S_IWUSR); if (fd < 0) - pr_err("failed to open %s : %s\n", data->path, + pr_err("failed to open %s : %s\n", data->file.path, str_error_r(errno, sbuf, sizeof(sbuf))); return fd; @@ -124,7 +124,7 @@ static int open_file(struct perf_data *data) fd = perf_data__is_read(data) ? open_file_read(data) : open_file_write(data); - data->fd = fd; + data->file.fd = fd; return fd < 0 ? -1 : 0; } @@ -133,21 +133,21 @@ int perf_data__open(struct perf_data *data) if (check_pipe(data)) return 0; - if (!data->path) - data->path = "perf.data"; + if (!data->file.path) + data->file.path = "perf.data"; return open_file(data); } void perf_data__close(struct perf_data *data) { - close(data->fd); + close(data->file.fd); } ssize_t perf_data__write(struct perf_data *data, void *buf, size_t size) { - return writen(data->fd, buf, size); + return writen(data->file.fd, buf, size); } int perf_data__switch(struct perf_data *data, @@ -162,30 +162,30 @@ int perf_data__switch(struct perf_data *data, if (perf_data__is_read(data)) return -EINVAL; - if (asprintf(&new_filepath, "%s.%s", data->path, postfix) < 0) + if (asprintf(&new_filepath, "%s.%s", data->file.path, postfix) < 0) return -ENOMEM; /* * Only fire a warning, don't return error, continue fill * original file. 
*/ - if (rename(data->path, new_filepath)) - pr_warning("Failed to rename %s to %s\n", data->path, new_filepath); + if (rename(data->file.path, new_filepath)) + pr_warning("Failed to rename %s to %s\n", data->file.path, new_filepath); if (!at_exit) { - close(data->fd); + close(data->file.fd); ret = perf_data__open(data); if (ret < 0) goto out; - if (lseek(data->fd, pos, SEEK_SET) == (off_t)-1) { + if (lseek(data->file.fd, pos, SEEK_SET) == (off_t)-1) { ret = -errno; pr_debug("Failed to lseek to %zu: %s", pos, strerror(errno)); goto out; } } - ret = data->fd; + ret = data->file.fd; out: free(new_filepath); return ret; diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index a1f9d70426b1..1797bed3aa4b 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -8,9 +8,13 @@ enum perf_data_mode { PERF_DATA_MODE_READ, }; +struct perf_data_file { + const char *path; + int fd; +}; + struct perf_data { - const char *path; - int fd; + struct perf_data_file file; bool is_pipe; bool force; unsigned long size; @@ -34,7 +38,7 @@ static inline int perf_data__is_pipe(struct perf_data *data) static inline int perf_data__fd(struct perf_data *data) { - return data->fd; + return data->file.fd; } static inline unsigned long perf_data__size(struct perf_data *data) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d7be552b21c8..6e59dcca9df2 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2889,7 +2889,7 @@ int perf_session__read_header(struct perf_session *session) if (f_header.data.size == 0) { pr_warning("WARNING: The %s file's data size field is 0 which is unexpected.\n" "Was the 'perf record' command properly terminated?\n", - data->path); + data->file.path); } nr_attrs = f_header.attrs.size / f_header.attr_size; From e268687bfb73bb7bfe65f23f5cba5c8a0e5bb050 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 23 Jan 2017 22:42:56 +0100 Subject: [PATCH 0745/1085] perf tools: Add perf_data_file__write function Adding perf_data_file__write function to provide single file write operation. 
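For illustration, a minimal sketch (not part of the patch itself) of how the new helper composes with the existing perf_data API; the event buffer and its contents are hypothetical and error handling is elided:

	struct perf_data data = {
		.file = { .path = "perf.data" },
		.mode = PERF_DATA_MODE_WRITE,
	};
	char event[64] = { 0 };	/* hypothetical payload */

	if (!perf_data__open(&data)) {
		/* wrapper: forwards to the embedded file's fd */
		perf_data__write(&data, event, sizeof(event));
		/* new helper: write a single perf_data_file directly */
		perf_data_file__write(&data.file, event, sizeof(event));
		perf_data__close(&data);
	}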
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Changbin Du Cc: David Ahern Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-c3f9p4xzykr845ktqcek6p4t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 8 +++++++- tools/perf/util/data.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 07ef56a4123c..f80a23d031d6 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -144,10 +144,16 @@ void perf_data__close(struct perf_data *data) close(data->file.fd); } +ssize_t perf_data_file__write(struct perf_data_file *file, + void *buf, size_t size) +{ + return writen(file->fd, buf, size); +} + ssize_t perf_data__write(struct perf_data *data, void *buf, size_t size) { - return writen(data->file.fd, buf, size); + return perf_data_file__write(&data->file, buf, size); } int perf_data__switch(struct perf_data *data, diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 1797bed3aa4b..000c43bbb7ac 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -50,6 +50,8 @@ int perf_data__open(struct perf_data *data); void perf_data__close(struct perf_data *data); ssize_t perf_data__write(struct perf_data *data, void *buf, size_t size); +ssize_t perf_data_file__write(struct perf_data_file *file, + void *buf, size_t size); /* * If at_exit is set, only rename current perf.data to * perf.data., continue write on original data. From 54830dd0c342525de2ff10f8be7cf0a9f062b896 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 23 Jan 2017 22:42:56 +0100 Subject: [PATCH 0746/1085] perf stat: Move the shadow stats scale computation in perf_stat__update_shadow_stats Move the shadow stats scale computation to the perf_stat__update_shadow_stats() function, so it's centralized and we don't forget to do it. It also saves a few lines of code.
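As a sketch of the effect at a call site (distilled from the hunks below; variable names are illustrative):

	/* before: each caller scaled the count and passed a pointer */
	val = val * counter->scale;
	perf_stat__update_shadow_stats(counter, &val, cpu);

	/* after: pass the raw count; the multiplication by
	 * counter->scale now happens once, inside the function
	 */
	perf_stat__update_shadow_stats(counter, val, cpu);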
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Changbin Du Cc: David Ahern Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-htg7mmyxv6pcrf57qyo6msid@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 3 +-- tools/perf/util/stat-shadow.c | 48 ++++++++++++++++++----------------- tools/perf/util/stat.c | 6 ++--- tools/perf/util/stat.h | 2 +- 4 files changed, 29 insertions(+), 30 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index fa5896270022..59af5a8419e2 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1267,8 +1267,7 @@ static void aggr_update_shadow(void) continue; val += perf_counts(counter->counts, cpu, 0)->val; } - val = val * counter->scale; - perf_stat__update_shadow_stats(counter, &val, + perf_stat__update_shadow_stats(counter, val, first_shadow_cpu(counter, id)); } } diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index a2c12d1ef32a..51ad03a799ec 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -178,58 +178,60 @@ void perf_stat__reset_shadow_stats(void) * more semantic information such as miss/hit ratios, * instruction rates, etc: */ -void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count, +void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 count, int cpu) { int ctx = evsel_context(counter); + count *= counter->scale; + if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) || perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK)) - update_stats(&runtime_nsecs_stats[cpu], count[0]); + update_stats(&runtime_nsecs_stats[cpu], count); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_stats(&runtime_cycles_stats[ctx][cpu], count[0]); + update_stats(&runtime_cycles_stats[ctx][cpu], count); else if (perf_stat_evsel__is(counter, CYCLES_IN_TX)) - update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count[0]); + update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count); else if (perf_stat_evsel__is(counter, TRANSACTION_START)) - update_stats(&runtime_transaction_stats[ctx][cpu], count[0]); + update_stats(&runtime_transaction_stats[ctx][cpu], count); else if (perf_stat_evsel__is(counter, ELISION_START)) - update_stats(&runtime_elision_stats[ctx][cpu], count[0]); + update_stats(&runtime_elision_stats[ctx][cpu], count); else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS)) - update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]); + update_stats(&runtime_topdown_total_slots[ctx][cpu], count); else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED)) - update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]); + update_stats(&runtime_topdown_slots_issued[ctx][cpu], count); else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED)) - update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]); + update_stats(&runtime_topdown_slots_retired[ctx][cpu], count); else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES)) - update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu],count[0]); + update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu], count); else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES)) - update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]); + update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count); else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) - update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]); + 
update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count); else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) - update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]); + update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_stats(&runtime_branches_stats[ctx][cpu], count[0]); + update_stats(&runtime_branches_stats[ctx][cpu], count); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) - update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]); + update_stats(&runtime_cacherefs_stats[ctx][cpu], count); else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) - update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]); + update_stats(&runtime_l1_dcache_stats[ctx][cpu], count); else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) - update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]); + update_stats(&runtime_ll_cache_stats[ctx][cpu], count); else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) - update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]); + update_stats(&runtime_ll_cache_stats[ctx][cpu], count); else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) - update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]); + update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count); else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) - update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]); + update_stats(&runtime_itlb_cache_stats[ctx][cpu], count); else if (perf_stat_evsel__is(counter, SMI_NUM)) - update_stats(&runtime_smi_num_stats[ctx][cpu], count[0]); + update_stats(&runtime_smi_num_stats[ctx][cpu], count); else if (perf_stat_evsel__is(counter, APERF)) - update_stats(&runtime_aperf_stats[ctx][cpu], count[0]); + update_stats(&runtime_aperf_stats[ctx][cpu], count); if (counter->collect_stat) { struct saved_value *v = saved_value_lookup(counter, cpu, true); - update_stats(&v->stats, count[0]); + update_stats(&v->stats, count); } } diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 933de91831fa..ef00c91e2553 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -277,7 +277,7 @@ process_counter_values(struct perf_stat_config *config, struct perf_evsel *evsel perf_evsel__compute_deltas(evsel, cpu, thread, count); perf_counts_values__scale(count, config->scale, NULL); if (config->aggr_mode == AGGR_NONE) - perf_stat__update_shadow_stats(evsel, count->values, cpu); + perf_stat__update_shadow_stats(evsel, count->val, cpu); break; case AGGR_GLOBAL: aggr->val += count->val; @@ -320,7 +320,6 @@ int perf_stat_process_counter(struct perf_stat_config *config, struct perf_counts_values *aggr = &counter->counts->aggr; struct perf_stat_evsel *ps = counter->stats; u64 *count = counter->counts->aggr.values; - u64 val; int i, ret; aggr->val = aggr->ena = aggr->run = 0; @@ -360,8 +359,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, /* * Save the full runtime - to allow normalization during printout: */ - val = counter->scale * *count; - perf_stat__update_shadow_stats(counter, &val, 0); + perf_stat__update_shadow_stats(counter, *count, 0); return 0; } diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 47915df346fb..490b78aa7230 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -82,7 +82,7 @@ typedef void (*new_line_t )(void *ctx); void perf_stat__init_shadow_stats(void); void perf_stat__reset_shadow_stats(void); -void 
perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count, +void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 count, int cpu); struct perf_stat_output_ctx { void *ctx; From 021b462a51de48dd84f12f5046b5b57a362d6506 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Oct 2017 13:21:01 +0200 Subject: [PATCH 0747/1085] perf stat: Make --per-thread update shadow stats to show metrics We should support this because it would make it easy to collect metrics for different threads in applications. Original patch posted by Jin Yao in [1]. 1. Current output, for example: root@skl:/tmp# perf stat --per-thread -p 21623 ^C Performance counter stats for process id '21623': vmstat-21623 0.517479 task-clock (msec) # 0.000 CPUs utilized vmstat-21623 1 context-switches vmstat-21623 0 cpu-migrations vmstat-21623 0 page-faults vmstat-21623 461,306 cycles vmstat-21623 630,724 instructions vmstat-21623 136,265 branches vmstat-21623 2,520 branch-misses 1.444020756 seconds time elapsed root@skl:/tmp# perf stat --per-thread --metrics ipc -p 21623 ^C Performance counter stats for process id '21623': vmstat-21623 631,185 inst_retired.any vmstat-21623 605,893 cpu_clk_unhalted.thread 1.415679293 seconds time elapsed 2. With this patch, the result would be: root@skl:/tmp# perf stat --per-thread -p 21623 ^C Performance counter stats for process id '21623': vmstat-21623 0.533759 task-clock (msec) # 0.000 CPUs utilized vmstat-21623 1 context-switches # 0.002 M/sec vmstat-21623 0 cpu-migrations # 0.000 K/sec vmstat-21623 0 page-faults # 0.000 K/sec vmstat-21623 473,896 cycles # 0.888 GHz vmstat-21623 631,072 instructions # 1.33 insn per cycle vmstat-21623 136,307 branches # 255.372 M/sec vmstat-21623 2,524 branch-misses # 1.85% of all branches 1.544862861 seconds time elapsed root@skl:/tmp# perf stat --per-thread --metrics ipc -p 21623 ^C Performance counter stats for process id '21623': vmstat-21623 1,259,104 inst_retired.any # 1.2 IPC vmstat-21623 1,056,756 cpu_clk_unhalted.thread 2.040954502 seconds time elapsed [1] https://marc.info/?l=linux-kernel&m=150777054620511&w=2 Originally-from: Jin Yao Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Changbin Du Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-tr8ntktxmy4qc5769ajg5u6c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index ef00c91e2553..203f5d8d11d1 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -278,6 +278,8 @@ process_counter_values(struct perf_stat_config *config, struct perf_evsel *evsel perf_counts_values__scale(count, config->scale, NULL); if (config->aggr_mode == AGGR_NONE) perf_stat__update_shadow_stats(evsel, count->val, cpu); + if (config->aggr_mode == AGGR_THREAD) + perf_stat__update_shadow_stats(evsel, count->val, 0); break; case AGGR_GLOBAL: aggr->val += count->val; From 0f295b0650c90362b4111f46d7f9149a0a4191be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 13 Oct 2017 11:54:33 -0600 Subject: [PATCH 0748/1085] rtc: Allow rtc drivers to specify the tv_nsec value for ntp ntp is currently hardwired to try and call the rtc set when wall clock tv_nsec is 0.5 seconds. This historical behaviour works well with certain PC RTCs, but is not universal to all rtc hardware.
Change how this works by introducing the driver-specific concept of set_offset_nsec, the delay between current wall clock time and the target time to set (with a 0 tv_nsecs). For x86-style CMOS set_offset_nsec should be -0.5 s which causes the last second to be written 0.5 s after it has started. For compat with the old rtc_set_ntp_time, the value is defaulted to + 0.5 s, which causes the next second to be written 0.5 s before it starts, as things were before this patch. Testing shows many non-x86 RTCs would like set_offset_nsec ~= 0, so ultimately each RTC driver should set the set_offset_nsec according to its needs, and non-x86 architectures should stop using update_persistent_clock64 in order to access this feature. Future patches will revise the drivers as needed. Since CMOS and RTC now have very different handling they are split into two dedicated code paths, sharing the support code, and ifdefs are replaced with IS_ENABLED. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Jason Gunthorpe Signed-off-by: John Stultz --- drivers/rtc/class.c | 3 + drivers/rtc/systohc.c | 53 ++++++++++---- include/linux/rtc.h | 43 ++++++++++- kernel/time/ntp.c | 166 ++++++++++++++++++++++++++++-------------- 4 files changed, 196 insertions(+), 69 deletions(-) diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 2ed970d61da1..722d683e0b0f 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -161,6 +161,9 @@ static struct rtc_device *rtc_allocate_device(void) device_initialize(&rtc->dev); + /* Drivers can revise this default after allocating the device. */ + rtc->set_offset_nsec = NSEC_PER_SEC / 2; + rtc->irq_freq = 1; rtc->max_user_freq = 64; rtc->dev.class = rtc_class; diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c index b4a68ffcd06b..0c177647ea6c 100644 --- a/drivers/rtc/systohc.c +++ b/drivers/rtc/systohc.c @@ -10,6 +10,7 @@ /** * rtc_set_ntp_time - Save NTP synchronized time to the RTC * @now: Current time of day + * @target_nsec: pointer for desired now->tv_nsec value * * Replacement for the NTP platform function update_persistent_clock64 * that stores time for later retrieval by rtc_hctosys. * @@ -18,30 +19,52 @@ * possible at all, and various other -errno for specific temporary failure * cases. * + * -EPROTO is returned if now.tv_nsec is not close enough to *target_nsec. + * * If temporary failure is indicated the caller should try again 'soon' */ -int rtc_set_ntp_time(struct timespec64 now) +int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) { struct rtc_device *rtc; struct rtc_time tm; + struct timespec64 to_set; int err = -ENODEV; - - if (now.tv_nsec < (NSEC_PER_SEC >> 1)) - rtc_time64_to_tm(now.tv_sec, &tm); - else - rtc_time64_to_tm(now.tv_sec + 1, &tm); + bool ok; rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); - if (rtc) { - /* rtc_hctosys exclusively uses UTC, so we call set_time here, - * not set_mmss. */ - if (rtc->ops && - (rtc->ops->set_time || - rtc->ops->set_mmss64 || - rtc->ops->set_mmss)) - err = rtc_set_time(rtc, &tm); - rtc_class_close(rtc); + if (!rtc) + goto out_err; + + if (!rtc->ops || (!rtc->ops->set_time && !rtc->ops->set_mmss64 && + !rtc->ops->set_mmss)) + goto out_close; + + /* Compute the value of tv_nsec we require the caller to supply in + * now.tv_nsec. This is the value such that (now + + * set_offset_nsec).tv_nsec == 0.
+ */ + set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); + *target_nsec = to_set.tv_nsec; + + /* The ntp code must call this with the correct value in tv_nsec, if + * it does not we update target_nsec and return EPROTO to make the ntp + * code try again later. + */ + ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); + if (!ok) { + err = -EPROTO; + goto out_close; } + rtc_time64_to_tm(to_set.tv_sec, &tm); + + /* rtc_hctosys exclusively uses UTC, so we call set_time here, not + * set_mmss. + */ + err = rtc_set_time(rtc, &tm); + +out_close: + rtc_class_close(rtc); +out_err: return err; } diff --git a/include/linux/rtc.h b/include/linux/rtc.h index e6d0f9c1cafd..5b13fa029fd6 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -135,6 +135,14 @@ struct rtc_device { /* Some hardware can't support UIE mode */ int uie_unsupported; + /* Number of nsec it takes to set the RTC clock. This influences when + * the set ops are called. An offset: + * - of 0.5 s will call RTC set for wall clock time 10.0 s at 9.5 s + * - of 1.5 s will call RTC set for wall clock time 10.0 s at 8.5 s + * - of -0.5 s will call RTC set for wall clock time 10.0 s at 10.5 s + */ + long set_offset_nsec; + bool registered; struct nvmem_config *nvmem_config; @@ -172,7 +180,7 @@ extern void devm_rtc_device_unregister(struct device *dev, extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); -extern int rtc_set_ntp_time(struct timespec64 now); +extern int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); @@ -221,6 +229,39 @@ static inline bool is_leap_year(unsigned int year) return (!(year % 4) && (year % 100)) || !(year % 400); } +/* Determine if we can call to driver to set the time. Drivers can only be + * called to set a second aligned time value, and the field set_offset_nsec + * specifies how far away from the second aligned time to call the driver. + * + * This also computes 'to_set' which is the time we are trying to set, and has + * a zero in tv_nsecs, such that: + * to_set - set_delay_nsec == now +/- FUZZ + * + */ +static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec, + struct timespec64 *to_set, + const struct timespec64 *now) +{ + /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ + const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; + struct timespec64 delay = {.tv_sec = 0, + .tv_nsec = set_offset_nsec}; + + *to_set = timespec64_add(*now, delay); + + if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { + to_set->tv_nsec = 0; + return true; + } + + if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { + to_set->tv_sec++; + to_set->tv_nsec = 0; + return true; + } + return false; +} + #define rtc_register_device(device) \ __rtc_register_device(THIS_MODULE, device) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index edf19cc53140..bc19de1a0683 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -492,6 +492,67 @@ out: return leap; } +static void sync_hw_clock(struct work_struct *work); +static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); + +static void sched_sync_hw_clock(struct timespec64 now, + unsigned long target_nsec, bool fail) + +{ + struct timespec64 next; + + getnstimeofday64(&next); + if (!fail) + next.tv_sec = 659; + else { + /* + * Try again as soon as possible. 
Delaying long periods + * decreases the accuracy of the work queue timer. Due to this + * the algorithm is very likely to require a short-sleep retry + * after the above long sleep to synchronize ts_nsec. + */ + next.tv_sec = 0; + } + + /* Compute the needed delay that will get to tv_nsec == target_nsec */ + next.tv_nsec = target_nsec - next.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + + queue_delayed_work(system_power_efficient_wq, &sync_work, + timespec64_to_jiffies(&next)); +} + +static void sync_rtc_clock(void) +{ + unsigned long target_nsec; + struct timespec64 adjust, now; + int rc; + + if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) + return; + + getnstimeofday64(&now); + + adjust = now; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* + * The current RTC in use will provide the target_nsec it wants to be + * called at, and does rtc_tv_nsec_ok internally. + */ + rc = rtc_set_ntp_time(adjust, &target_nsec); + if (rc == -ENODEV) + return; + + sched_sync_hw_clock(now, target_nsec, rc); +} + #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock(struct timespec now) { @@ -507,77 +568,76 @@ int __weak update_persistent_clock64(struct timespec64 now64) } #endif -#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) -static void sync_cmos_clock(struct work_struct *work); - -static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); - -static void sync_cmos_clock(struct work_struct *work) +static bool sync_cmos_clock(void) { + static bool no_cmos; struct timespec64 now; - struct timespec64 next; - int fail = 1; + struct timespec64 adjust; + int rc = -EPROTO; + long target_nsec = NSEC_PER_SEC / 2; + + if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE)) + return false; + + if (no_cmos) + return false; /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - * We want the clock to be within a couple of ticks from the target. + * Historically update_persistent_clock64() has followed x86 + * semantics, which match the MC146818A/etc RTC. This RTC will store + * 'adjust' and then in .5s it will advance once second. + * + * Architectures are strongly encouraged to use rtclib and not + * implement this legacy API. */ - if (!ntp_synced()) { - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - } - getnstimeofday64(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec64 adjust = now; - - fail = -ENODEV; + if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); -#ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock64(adjust); -#endif - -#ifdef CONFIG_RTC_SYSTOHC - if (fail == -ENODEV) - fail = rtc_set_ntp_time(adjust); -#endif + rc = update_persistent_clock64(adjust); + /* + * The machine does not support update_persistent_clock64 even + * though it defines CONFIG_GENERIC_CMOS_UPDATE. 
+ */ + if (rc == -ENODEV) { + no_cmos = true; + return false; + } } - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; + sched_sync_hw_clock(now, target_nsec, rc); + return true; +} - if (!fail || fail == -ENODEV) - next.tv_sec = 659; - else - next.tv_sec = 0; +/* + * If we have an externally synchronized Linux clock, then update RTC clock + * accordingly every ~11 minutes. Generally RTCs can only store second + * precision, but many RTCs will adjust the phase of their second tick to + * match the moment of update. This infrastructure arranges to call to the RTC + * set at the correct moment to phase synchronize the RTC second tick over + * with the kernel clock. + */ +static void sync_hw_clock(struct work_struct *work) +{ + if (!ntp_synced()) + return; - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } - queue_delayed_work(system_power_efficient_wq, - &sync_cmos_work, timespec64_to_jiffies(&next)); + if (sync_cmos_clock()) + return; + + sync_rtc_clock(); } void ntp_notify_cmos_timer(void) { - queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); + if (!ntp_synced()) + return; + + if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || + IS_ENABLED(CONFIG_RTC_SYSTOHC)) + queue_delayed_work(system_power_efficient_wq, &sync_work, 0); } -#else -void ntp_notify_cmos_timer(void) { } -#endif - - /* * Propagate a new txc->status value into the NTP state: */ From e0956dcc4ba74ec4b17e32fc9a156fcba1ef6610 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:44 +0200 Subject: [PATCH 0749/1085] timekeeping: Consolidate timekeeping_inject_offset code The code to check the adjtimex() or clock_adjtime() arguments is spread out across multiple files for presumably only historic reasons. As a preparatation for a rework to get rid of the use of 'struct timeval' and 'struct timespec' in there, this moves all the portions into kernel/time/timekeeping.c and marks them as 'static'. The warp_clock() function here is not as closely related as the others, but I feel it still makes sense to move it here in order to consolidate all callers of timekeeping_inject_offset(). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann [jstultz: Whitespace fixup] Signed-off-by: John Stultz --- include/linux/time.h | 26 -------- kernel/time/ntp.c | 61 ------------------ kernel/time/ntp_internal.h | 1 - kernel/time/time.c | 36 +---------- kernel/time/timekeeping.c | 123 ++++++++++++++++++++++++++++++++++++- kernel/time/timekeeping.h | 2 +- 6 files changed, 123 insertions(+), 126 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 9bc1f945777c..c0fbad08448f 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -134,32 +134,6 @@ static inline bool timeval_valid(const struct timeval *tv) extern struct timespec timespec_trunc(struct timespec t, unsigned gran); -/* - * Validates if a timespec/timeval used to inject a time offset is valid. - * Offsets can be postive or negative. The value of the timeval/timespec - * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must - * always be non-negative. 
- */ -static inline bool timeval_inject_offset_valid(const struct timeval *tv) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_inject_offset_valid(const struct timespec *ts) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more nanoseconds then a second */ - if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - /* Some architectures do not supply their own clocksource. * This is mainly the case in architectures that get their * inter-tick times by reading the counter on their interval diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bc19de1a0683..90f84582a076 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -713,67 +713,6 @@ static inline void process_adjtimex_modes(struct timex *txc, } - -/** - * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex - */ -int ntp_validate_timex(struct timex *txc) -{ - if (txc->modes & ADJ_ADJTIME) { - /* singleshot must not be used with any other mode bits */ - if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) - return -EINVAL; - if (!(txc->modes & ADJ_OFFSET_READONLY) && - !capable(CAP_SYS_TIME)) - return -EPERM; - } else { - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - /* - * if the quartz is off by more than 10% then - * something is VERY wrong! - */ - if (txc->modes & ADJ_TICK && - (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ)) - return -EINVAL; - } - - if (txc->modes & ADJ_SETOFFSET) { - /* In order to inject time, you gotta be super-user! */ - if (!capable(CAP_SYS_TIME)) - return -EPERM; - - if (txc->modes & ADJ_NANO) { - struct timespec ts; - - ts.tv_sec = txc->time.tv_sec; - ts.tv_nsec = txc->time.tv_usec; - if (!timespec_inject_offset_valid(&ts)) - return -EINVAL; - - } else { - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; - } - } - - /* - * Check for potential multiplication overflows that can - * only happen on 64-bit systems: - */ - if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { - if (LLONG_MIN / PPM_SCALE > txc->freq) - return -EINVAL; - if (LLONG_MAX / PPM_SCALE < txc->freq) - return -EINVAL; - } - - return 0; -} - - /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index d8a7c11fa71a..74b52cd48209 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,7 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 44a8c1402133..04684e294f00 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -157,40 +157,6 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, return 0; } -/* - * Indicates if there is an offset between the system clock and the hardware - * clock/persistent clock/rtc. 
- */ -int persistent_clock_is_local; - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -static inline void warp_clock(void) -{ - if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; - - persistent_clock_is_local = 1; - adjust.tv_sec = sys_tz.tz_minuteswest * 60; - adjust.tv_nsec = 0; - timekeeping_inject_offset(&adjust); - } -} - /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, @@ -224,7 +190,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz if (firsttime) { firsttime = 0; if (!tv) - warp_clock(); + timekeeping_warp_clock(); } } if (tv) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2cafb49aa65e..7d8e0e842484 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1258,13 +1258,39 @@ out: } EXPORT_SYMBOL(do_settimeofday64); +/* + * Validates if a timespec/timeval used to inject a time offset is valid. + * Offsets can be postive or negative. The value of the timeval/timespec + * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must + * always be non-negative. + */ +static inline bool timeval_inject_offset_valid(const struct timeval *tv) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_inject_offset_valid(const struct timespec *ts) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more nanoseconds then a second */ + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -int timekeeping_inject_offset(struct timespec *ts) +static int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; @@ -1303,7 +1329,40 @@ error: /* even if we error out, we forwarded the time, so call update */ return ret; } -EXPORT_SYMBOL(timekeeping_inject_offset); + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... 
+ * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +void timekeeping_warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} /** * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic @@ -2247,6 +2306,66 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, return base; } +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +static int ntp_validate_timex(struct timex *txc) +{ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + } + + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } + } + + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + + return 0; +} + + /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index d0914676d4c5..44aec7893cdd 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -10,7 +10,7 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); -extern int timekeeping_inject_offset(struct timespec *ts); +extern void timekeeping_warp_clock(void); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); From 1572fa03784831b81ec26ec379374cf6bdec04fb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:45 +0200 Subject: [PATCH 0750/1085] timekeeping: Use timespec64 in timekeeping_inject_offset As part of changing all the timekeeping code to use 64-bit time_t consistently, this removes the uses of timeval and timespec as much as possible from do_adjtimex() and timekeeping_inject_offset(). The timeval_inject_offset_valid() and timespec_inject_offset_valid() just complicate this, so I'm folding them into the respective callers. This leaves the actual 'struct timex' definition, which is part of the user-space ABI and should be dealt with separately when we have agreed on the ABI change. 
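For context, a hypothetical userspace sketch of the interface these checks
guard (assuming the glibc adjtimex() wrapper; this program is not part of
the patch). The injected offset is the sum of the two fields, and tv_usec
must stay non-negative and below one second even when the overall offset
is negative, which is exactly the invariant the folded validation keeps:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex txc = { 0 };

	txc.modes = ADJ_SETOFFSET | ADJ_NANO;	/* time.tv_usec carries nanoseconds */
	txc.time.tv_sec = -2;			/* net offset: -2 s + 0.5 s = -1.5 s */
	txc.time.tv_usec = 500000000;		/* must be in [0, one second) */

	if (adjtimex(&txc) == -1)
		perror("adjtimex");		/* EPERM without CAP_SYS_TIME */
	return 0;
}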
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 72 ++++++++++++++------------------------- 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7d8e0e842484..c6a35fb3cf76 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1258,65 +1258,37 @@ out: } EXPORT_SYMBOL(do_settimeofday64); -/* - * Validates if a timespec/timeval used to inject a time offset is valid. - * Offsets can be postive or negative. The value of the timeval/timespec - * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must - * always be non-negative. - */ -static inline bool timeval_inject_offset_valid(const struct timeval *tv) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_inject_offset_valid(const struct timespec *ts) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more nanoseconds then a second */ - if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -static int timekeeping_inject_offset(struct timespec *ts) +static int timekeeping_inject_offset(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec64 ts64, tmp; + struct timespec64 tmp; int ret = 0; - if (!timespec_inject_offset_valid(ts)) + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - ts64 = timespec_to_timespec64(*ts); - raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tk), ts64); - if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || + tmp = timespec64_add(tk_xtime(tk), *ts); + if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || !timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } - tk_xtime_add(tk, &ts64); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1355,7 +1327,7 @@ int persistent_clock_is_local; void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; + struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; @@ -2307,9 +2279,9 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, } /** - * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int ntp_validate_timex(struct timex *txc) +static int timekeeping_validate_timex(struct timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2337,16 +2309,22 @@ static int ntp_validate_timex(struct timex *txc) if 
(!capable(CAP_SYS_TIME)) return -EPERM; + /* + * Validate if a timespec/timeval used to inject a time + * offset is valid. Offsets can be postive or negative, so + * we don't check tv_sec. The value of the timeval/timespec + * is the sum of its fields,but *NOTE*: + * The field tv_usec/tv_nsec must always be non-negative and + * we can't have more nanoseconds/microseconds than a second. + */ + if (txc->time.tv_usec < 0) + return -EINVAL; + if (txc->modes & ADJ_NANO) { - struct timespec ts; - - ts.tv_sec = txc->time.tv_sec; - ts.tv_nsec = txc->time.tv_usec; - if (!timespec_inject_offset_valid(&ts)) + if (txc->time.tv_usec >= NSEC_PER_SEC) return -EINVAL; - } else { - if (!timeval_inject_offset_valid(&txc->time)) + if (txc->time.tv_usec >= USEC_PER_SEC) return -EINVAL; } } @@ -2378,12 +2356,12 @@ int do_adjtimex(struct timex *txc) int ret; /* Validate the data before disabling interrupts */ - ret = ntp_validate_timex(txc); + ret = timekeeping_validate_timex(txc); if (ret) return ret; if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; + struct timespec64 delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) From 85bf19e7df2479140eff2348a4e6a9c19b5c3960 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:46 +0200 Subject: [PATCH 0751/1085] time: Remove unused functions The (slow but) ongoing work on conversion from timespec to timespec64 has led some timespec based helper functions to become unused. No new code should use them, so we can remove the functions entirely. I'm planning to obsolete additional interfaces next and remove more of these. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/time.h | 18 ------------------ include/linux/time64.h | 28 ---------------------------- kernel/time/time.c | 18 ------------------ 3 files changed, 64 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index c0fbad08448f..0e8a80918484 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -39,15 +39,6 @@ static inline int timespec_compare(const struct timespec *lhs, const struct time return lhs->tv_nsec - rhs->tv_nsec; } -static inline int timeval_compare(const struct timeval *lhs, const struct timeval *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_usec - rhs->tv_usec; -} - extern time64_t mktime64(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); @@ -65,15 +56,6 @@ static inline unsigned long mktime(const unsigned int year, extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); -/* - * timespec_add_safe assumes both values are positive and checks - * for overflow. It will return TIME_T_MAX if the reutrn would be - * smaller then either of the arguments. 
- */ -extern struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs); - - static inline struct timespec timespec_add(struct timespec lhs, struct timespec rhs) { diff --git a/include/linux/time64.h b/include/linux/time64.h index 980c71b3001a..402b595c76d2 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -53,16 +53,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) return ts; } -static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64) -{ - return *its64; -} - -static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its) -{ - return *its; -} - # define timespec64_equal timespec_equal # define timespec64_compare timespec_compare # define set_normalized_timespec64 set_normalized_timespec @@ -94,24 +84,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) return ret; } -static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64) -{ - struct itimerspec ret; - - ret.it_interval = timespec64_to_timespec(its64->it_interval); - ret.it_value = timespec64_to_timespec(its64->it_value); - return ret; -} - -static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its) -{ - struct itimerspec64 ret; - - ret.it_interval = timespec_to_timespec64(its->it_interval); - ret.it_value = timespec_to_timespec64(its->it_value); - return ret; -} - static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 04684e294f00..947fb614c78f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -818,24 +818,6 @@ unsigned long nsecs_to_jiffies(u64 n) } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); -/* - * Add two timespec values and do a safety check for overflow. - * It's assumed that both values are valid (>= 0) - */ -struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs) -{ - struct timespec res; - - set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - - if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) - res.tv_sec = TIME_T_MAX; - - return res; -} - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). From 5dbf20127f8cca8588ad0b0e3e8ded587ac7afa0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:47 +0200 Subject: [PATCH 0752/1085] time: Move time_t based interfaces to time32.h Interfaces based on 'struct timespec' or 'struct timeval' should no longer be used for new code, which can use either ktime_t or 'struct timespec64' instead. To make this a little clearer, this moves the various helpers into a new time32.h header. For the moment, this gets included by the normal time.h, but we may be able to separate it entirely when most users of time32.h are gone. Individual helpers in the new file can get removed once they become unused in the future. Since the contents of time32.h look a lot like what's in time64.h, I'm reordering them during the move to make them more similar, and to allow a follow-up patch to redirect the 'timespec' based functions to thei 'timespec64' based counterparts on 64-bit architectures later. 
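To spell out the overflow this split is designed around (plain arithmetic
on the TIME_T_MAX definition carried into time32.h): with a 32-bit time_t,
sizeof(time_t) << 3 is 32, so TIME_T_MAX = (1UL << 31) - 1 = 2,147,483,647
seconds past the 1970 epoch, which is 2038-01-19 03:14:07 UTC; one second
later a signed 32-bit counter wraps to a negative value.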
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann [jstultz: Whitespace & checkpatch fixups] Signed-off-by: John Stultz --- include/linux/time.h | 163 +------------------------------------- include/linux/time32.h | 176 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 162 deletions(-) create mode 100644 include/linux/time32.h diff --git a/include/linux/time.h b/include/linux/time.h index 0e8a80918484..c375f54a678d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -17,105 +17,10 @@ int get_itimerspec64(struct itimerspec64 *it, int put_itimerspec64(const struct itimerspec64 *it, struct itimerspec __user *uit); -#define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) - -static inline int timespec_equal(const struct timespec *a, - const struct timespec *b) -{ - return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); -} - -/* - * lhs < rhs: return <0 - * lhs == rhs: return 0 - * lhs > rhs: return >0 - */ -static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_nsec - rhs->tv_nsec; -} - extern time64_t mktime64(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -/** - * Deprecated. Use mktime64(). - */ -static inline unsigned long mktime(const unsigned int year, - const unsigned int mon, const unsigned int day, - const unsigned int hour, const unsigned int min, - const unsigned int sec) -{ - return mktime64(year, mon, day, hour, min, sec); -} - -extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); - -static inline struct timespec timespec_add(struct timespec lhs, - struct timespec rhs) -{ - struct timespec ts_delta; - set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - return ts_delta; -} - -/* - * sub = lhs - rhs, in normalized form - */ -static inline struct timespec timespec_sub(struct timespec lhs, - struct timespec rhs) -{ - struct timespec ts_delta; - set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, - lhs.tv_nsec - rhs.tv_nsec); - return ts_delta; -} - -/* - * Returns true if the timespec is norm, false if denorm: - */ -static inline bool timespec_valid(const struct timespec *ts) -{ - /* Dates before 1970 are bogus */ - if (ts->tv_sec < 0) - return false; - /* Can't have more nanoseconds then a second */ - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_valid_strict(const struct timespec *ts) -{ - if (!timespec_valid(ts)) - return false; - /* Disallow values that could overflow ktime_t */ - if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) - return false; - return true; -} - -static inline bool timeval_valid(const struct timeval *tv) -{ - /* Dates before 1970 are bogus */ - if (tv->tv_sec < 0) - return false; - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - - return true; -} - -extern struct timespec timespec_trunc(struct timespec t, unsigned gran); - /* Some architectures do not supply their own clocksource. 
* This is mainly the case in architectures that get their * inter-tick times by reading the counter on their interval @@ -164,73 +69,7 @@ struct tm { void time64_to_tm(time64_t totalsecs, int offset, struct tm *result); -/** - * time_to_tm - converts the calendar time to local broken-down time - * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, - * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. - * @result pointer to struct tm variable to receive broken-down time - */ -static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) -{ - time64_to_tm(totalsecs, offset, result); -} - -/** - * timespec_to_ns - Convert timespec to nanoseconds - * @ts: pointer to the timespec variable to be converted - * - * Returns the scalar nanosecond representation of the timespec - * parameter. - */ -static inline s64 timespec_to_ns(const struct timespec *ts) -{ - return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; -} - -/** - * timeval_to_ns - Convert timeval to nanoseconds - * @ts: pointer to the timeval variable to be converted - * - * Returns the scalar nanosecond representation of the timeval - * parameter. - */ -static inline s64 timeval_to_ns(const struct timeval *tv) -{ - return ((s64) tv->tv_sec * NSEC_PER_SEC) + - tv->tv_usec * NSEC_PER_USEC; -} - -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -extern struct timespec ns_to_timespec(const s64 nsec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -extern struct timeval ns_to_timeval(const s64 nsec); - -/** - * timespec_add_ns - Adds nanoseconds to a timespec - * @a: pointer to timespec to be incremented - * @ns: unsigned nanoseconds value to be added - * - * This must always be inlined because its used from the x86-64 vdso, - * which cannot call other kernel functions. - */ -static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) -{ - a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); - a->tv_nsec = ns; -} +# include static inline bool itimerspec64_valid(const struct itimerspec64 *its) { diff --git a/include/linux/time32.h b/include/linux/time32.h new file mode 100644 index 000000000000..9b9c43f0d39b --- /dev/null +++ b/include/linux/time32.h @@ -0,0 +1,176 @@ +#ifndef _LINUX_TIME32_H +#define _LINUX_TIME32_H +/* + * These are all interfaces based on the old time_t definition + * that overflows in 2038 on 32-bit architectures. New code + * should use the replacements based on time64_t and timespec64. + * + * Any interfaces in here that become unused as we migrate + * code to time64_t should get removed. 
+ */ + +#include + +#define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) + +static inline int timespec_equal(const struct timespec *a, + const struct timespec *b) +{ + return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); +} + +/* + * lhs < rhs: return <0 + * lhs == rhs: return 0 + * lhs > rhs: return >0 + */ +static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + +extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); + +static inline struct timespec timespec_add(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + + set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + return ts_delta; +} + +/* + * sub = lhs - rhs, in normalized form + */ +static inline struct timespec timespec_sub(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + + set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, + lhs.tv_nsec - rhs.tv_nsec); + return ts_delta; +} + +/* + * Returns true if the timespec is norm, false if denorm: + */ +static inline bool timespec_valid(const struct timespec *ts) +{ + /* Dates before 1970 are bogus */ + if (ts->tv_sec < 0) + return false; + /* Can't have more nanoseconds then a second */ + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_valid_strict(const struct timespec *ts) +{ + if (!timespec_valid(ts)) + return false; + /* Disallow values that could overflow ktime_t */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return false; + return true; +} + +/** + * timespec_to_ns - Convert timespec to nanoseconds + * @ts: pointer to the timespec variable to be converted + * + * Returns the scalar nanosecond representation of the timespec + * parameter. + */ +static inline s64 timespec_to_ns(const struct timespec *ts) +{ + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; +} + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +extern struct timespec ns_to_timespec(const s64 nsec); + +/** + * timespec_add_ns - Adds nanoseconds to a timespec + * @a: pointer to timespec to be incremented + * @ns: unsigned nanoseconds value to be added + * + * This must always be inlined because its used from the x86-64 vdso, + * which cannot call other kernel functions. + */ +static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) +{ + a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); + a->tv_nsec = ns; +} + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. 
+ * @result pointer to struct tm variable to receive broken-down time + */ +static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + time64_to_tm(totalsecs, offset, result); +} + +static inline unsigned long mktime(const unsigned int year, + const unsigned int mon, const unsigned int day, + const unsigned int hour, const unsigned int min, + const unsigned int sec) +{ + return mktime64(year, mon, day, hour, min, sec); +} + +static inline bool timeval_valid(const struct timeval *tv) +{ + /* Dates before 1970 are bogus */ + if (tv->tv_sec < 0) + return false; + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + + return true; +} + +extern struct timespec timespec_trunc(struct timespec t, unsigned int gran); + +/** + * timeval_to_ns - Convert timeval to nanoseconds + * @ts: pointer to the timeval variable to be converted + * + * Returns the scalar nanosecond representation of the timeval + * parameter. + */ +static inline s64 timeval_to_ns(const struct timeval *tv) +{ + return ((s64) tv->tv_sec * NSEC_PER_SEC) + + tv->tv_usec * NSEC_PER_USEC; +} + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +extern struct timeval ns_to_timeval(const s64 nsec); + +#endif From abc8f96e3eb846fcf6333395ee1f6ed4a734576c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:48 +0200 Subject: [PATCH 0753/1085] time: Move time_t conversion helpers to time32.h On 64-bit architectures, the timespec64 based helpers in linux/time.h are defined as macros pointing to their timespec based counterparts. This made sense when they were first introduced, but as we are migrating away from timespec in general, it's much less intuitive now. This changes the macros to work in the exact opposite way: we always provide the timespec64 based helpers and define the old interfaces as macros for them. Now we can move those macros into linux/time32.h, which already contains the respective helpers for 32-bit architectures. 
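A minimal kernel-context sketch of what the flipped aliasing means for
call sites (illustrative only, not from the patch; the helper names are
the ones in the diff below):

/*
 * On a 64-bit build "timespec64" is #defined to "timespec", and
 * time32.h now maps the legacy helper names onto the timespec64
 * based implementations.
 */
static void example_after_the_flip(void)
{
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 500000000 };
	s64 legacy_ns, new_ns;

	legacy_ns = timespec_to_ns(&ts);	/* now a macro alias ... */
	new_ns = timespec64_to_ns(&ts);		/* ... for this same helper */
}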
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/time32.h | 45 +++++++++++++++++++++++++++++++++++++ include/linux/time64.h | 50 +----------------------------------------- kernel/time/time.c | 5 +++-- 3 files changed, 49 insertions(+), 51 deletions(-) diff --git a/include/linux/time32.h b/include/linux/time32.h index 9b9c43f0d39b..65b1de25198d 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -13,6 +13,49 @@ #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) +#if __BITS_PER_LONG == 64 + +/* timespec64 is defined as timespec here */ +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + return ts64; +} + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + return ts; +} + +# define timespec_equal timespec64_equal +# define timespec_compare timespec64_compare +# define set_normalized_timespec set_normalized_timespec64 +# define timespec_add timespec64_add +# define timespec_sub timespec64_sub +# define timespec_valid timespec64_valid +# define timespec_valid_strict timespec64_valid_strict +# define timespec_to_ns timespec64_to_ns +# define ns_to_timespec ns_to_timespec64 +# define timespec_add_ns timespec64_add_ns + +#else +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + struct timespec ret; + + ret.tv_sec = (time_t)ts64.tv_sec; + ret.tv_nsec = ts64.tv_nsec; + return ret; +} + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + struct timespec64 ret; + + ret.tv_sec = ts.tv_sec; + ret.tv_nsec = ts.tv_nsec; + return ret; +} + static inline int timespec_equal(const struct timespec *a, const struct timespec *b) { @@ -116,6 +159,8 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) a->tv_nsec = ns; } +#endif + /** * time_to_tm - converts the calendar time to local broken-down time * diff --git a/include/linux/time64.h b/include/linux/time64.h index 402b595c76d2..ec1888cf5378 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -7,11 +7,8 @@ typedef __s64 time64_t; typedef __u64 timeu64_t; -/* - * This wants to go into uapi/linux/time.h once we agreed about the - * userspace interfaces. 
- */ #if __BITS_PER_LONG == 64 +/* this trick allows us to optimize out timespec64_to_timespec */ # define timespec64 timespec #define itimerspec64 itimerspec #else @@ -41,49 +38,6 @@ struct itimerspec64 { #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#if __BITS_PER_LONG == 64 - -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - return ts64; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - return ts; -} - -# define timespec64_equal timespec_equal -# define timespec64_compare timespec_compare -# define set_normalized_timespec64 set_normalized_timespec -# define timespec64_add timespec_add -# define timespec64_sub timespec_sub -# define timespec64_valid timespec_valid -# define timespec64_valid_strict timespec_valid_strict -# define timespec64_to_ns timespec_to_ns -# define ns_to_timespec64 ns_to_timespec -# define timespec64_add_ns timespec_add_ns - -#else - -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - struct timespec ret; - - ret.tv_sec = (time_t)ts64.tv_sec; - ret.tv_nsec = ts64.tv_nsec; - return ret; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - struct timespec64 ret; - - ret.tv_sec = ts.tv_sec; - ret.tv_nsec = ts.tv_nsec; - return ret; -} - static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { @@ -185,8 +139,6 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) a->tv_nsec = ns; } -#endif - /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. diff --git a/kernel/time/time.c b/kernel/time/time.c index 947fb614c78f..fe60ebd301cf 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -407,6 +407,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); +#if __BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -467,6 +468,7 @@ struct timespec ns_to_timespec(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec); +#endif /** * ns_to_timeval - Convert nanoseconds to timeval @@ -486,7 +488,6 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); -#if BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -547,7 +548,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec64); -#endif + /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds From 6546911ed369af8d747215ff8b6144618e91c6ab Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:49 +0200 Subject: [PATCH 0754/1085] time: Move old timekeeping interfaces to timekeeping32.h The interfaces based on 'struct timespec' and 'unsigned long' seconds are no longer recommended for new code, and we are trying to migrate to ktime_t based interfaces and other y2038-safe variants. This moves all the legacy interfaces from linux/timekeeping.h into a new timekeeping32.h to better document this. 
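A short before/after sketch of the call sites this split points at
(illustrative only; both helpers appear in the headers shown below):

static void example_read_wall_time(void)
{
	struct timespec ts;		/* legacy type, y2038-unsafe on 32-bit */
	struct timespec64 ts64;		/* preferred replacement */

	getnstimeofday(&ts);		/* still compiles, via timekeeping32.h */
	getnstimeofday64(&ts64);	/* y2038-safe, stays in timekeeping.h */
}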
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/ktime.h | 1 + include/linux/timekeeping.h | 137 +----------------------------- include/linux/timekeeping32.h | 151 ++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 135 deletions(-) create mode 100644 include/linux/timekeeping32.h diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 0c8bd45c8206..5b9fddbaac41 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -270,5 +270,6 @@ static inline ktime_t ms_to_ktime(u64 ms) } # include +# include #endif diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ddc229ff6d1e..405beea4e71b 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -15,27 +15,16 @@ extern void xtime_update(unsigned long ticks); /* * Get and set timeofday */ -extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* * Kernel time accessors */ -unsigned long get_seconds(void); struct timespec64 current_kernel_time64(void); -/* does not take xtime_lock */ -struct timespec __current_kernel_time(void); - -static inline struct timespec current_kernel_time(void) -{ - struct timespec64 now = current_kernel_time64(); - - return timespec64_to_timespec(now); -} /* - * timespec based interfaces + * timespec64 based interfaces */ struct timespec64 get_monotonic_coarse64(void); extern void getrawmonotonic64(struct timespec64 *ts); @@ -47,116 +36,6 @@ extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); extern void getboottime64(struct timespec64 *ts); -#if BITS_PER_LONG == 64 -/** - * Deprecated. Use do_settimeofday64(). - */ -static inline int do_settimeofday(const struct timespec *ts) -{ - return do_settimeofday64(ts); -} - -static inline int __getnstimeofday(struct timespec *ts) -{ - return __getnstimeofday64(ts); -} - -static inline void getnstimeofday(struct timespec *ts) -{ - getnstimeofday64(ts); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - ktime_get_ts64(ts); -} - -static inline void ktime_get_real_ts(struct timespec *ts) -{ - getnstimeofday64(ts); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - getrawmonotonic64(ts); -} - -static inline struct timespec get_monotonic_coarse(void) -{ - return get_monotonic_coarse64(); -} - -static inline void getboottime(struct timespec *ts) -{ - return getboottime64(ts); -} -#else -/** - * Deprecated. Use do_settimeofday64(). 
- */ -static inline int do_settimeofday(const struct timespec *ts) -{ - struct timespec64 ts64; - - ts64 = timespec_to_timespec64(*ts); - return do_settimeofday64(&ts64); -} - -static inline int __getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - int ret = __getnstimeofday64(&ts64); - - *ts = timespec64_to_timespec(ts64); - return ret; -} - -static inline void getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - - getnstimeofday64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void ktime_get_real_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - getnstimeofday64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - struct timespec64 ts64; - - getrawmonotonic64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline struct timespec get_monotonic_coarse(void) -{ - return timespec64_to_timespec(get_monotonic_coarse64()); -} - -static inline void getboottime(struct timespec *ts) -{ - struct timespec64 ts64; - - getboottime64(&ts64); - *ts = timespec64_to_timespec(ts64); -} -#endif - #define ktime_get_real_ts64(ts) getnstimeofday64(ts) /* @@ -241,23 +120,13 @@ extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); /* - * Timespec interfaces utilizing the ktime based ones + * timespec64 interfaces utilizing the ktime based ones */ -static inline void get_monotonic_boottime(struct timespec *ts) -{ - *ts = ktime_to_timespec(ktime_get_boottime()); -} - static inline void get_monotonic_boottime64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } -static inline void timekeeping_clocktai(struct timespec *ts) -{ - *ts = ktime_to_timespec(ktime_get_clocktai()); -} - static inline void timekeeping_clocktai64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); @@ -340,10 +209,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); */ extern int persistent_clock_is_local; -extern void read_persistent_clock(struct timespec *ts); extern void read_persistent_clock64(struct timespec64 *ts); extern void read_boot_clock64(struct timespec64 *ts); -extern int update_persistent_clock(struct timespec now); extern int update_persistent_clock64(struct timespec64 now); diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h new file mode 100644 index 000000000000..af4114d5dc17 --- /dev/null +++ b/include/linux/timekeeping32.h @@ -0,0 +1,151 @@ +#ifndef _LINUX_TIMEKEEPING32_H +#define _LINUX_TIMEKEEPING32_H +/* + * These interfaces are all based on the old timespec type + * and should get replaced with the timespec64 based versions + * over time so we can remove the file here. + */ + +extern void do_gettimeofday(struct timeval *tv); +unsigned long get_seconds(void); + +/* does not take xtime_lock */ +struct timespec __current_kernel_time(void); + +static inline struct timespec current_kernel_time(void) +{ + struct timespec64 now = current_kernel_time64(); + + return timespec64_to_timespec(now); +} + +#if BITS_PER_LONG == 64 +/** + * Deprecated. Use do_settimeofday64(). 
+ */
+static inline void get_monotonic_boottime(struct timespec *ts)
+{
+	*ts = ktime_to_timespec(ktime_get_boottime());
+}
+
+static inline void timekeeping_clocktai(struct timespec *ts)
+{
+	*ts = ktime_to_timespec(ktime_get_clocktai());
+}
+
+/*
+ * Persistent clock related interfaces
+ */
+extern void read_persistent_clock(struct timespec *ts);
+extern int update_persistent_clock(struct timespec now);
+
+#endif
From ed6e26baa7431ebfad890f275e191e32b5a4103c Mon Sep 17 00:00:00 2001
From: Christophe JAILLET
Date: Fri, 20 Oct 2017 21:49:14 +0200
Subject: [PATCH 0755/1085] bug-hunting.rst: Fix an example and a typo in a
 Sphinx tag

- Use the same file name in the explanation and in the example
  (conex.c vs sonixj.c)
- Add a missing ':' in a :ref: tag, which leads to incorrect Sphinx output
- Add some missing ',' and ';'

Signed-off-by: Christophe JAILLET
Signed-off-by: Jonathan Corbet
---
 Documentation/admin-guide/bug-hunting.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst
index 08c4b1308189..f278b289e260 100644
--- a/Documentation/admin-guide/bug-hunting.rst
+++ b/Documentation/admin-guide/bug-hunting.rst
@@ -240,7 +240,7 @@ In order to report it upstream, you should identify the mailing list
 used for the development of the affected code.
This can be done by using the ``get_maintainer.pl`` script. -For example, if you find a bug at the gspca's conex.c file, you can get +For example, if you find a bug at the gspca's sonixj.c file, you can get their maintainers with:: $ ./scripts/get_maintainer.pl -f drivers/media/usb/gspca/sonixj.c @@ -257,7 +257,7 @@ Please notice that it will point to: Tejun and Bhaktipriya (in this specific case, none really envolved on the development of this file); - The driver maintainer (Hans Verkuil); -- The subsystem maintainer (Mauro Carvalho Chehab) +- The subsystem maintainer (Mauro Carvalho Chehab); - The driver and/or subsystem mailing list (linux-media@vger.kernel.org); - the Linux Kernel mailing list (linux-kernel@vger.kernel.org). @@ -274,14 +274,14 @@ Fixing the bug -------------- If you know programming, you could help us by not only reporting the bug, -but also providing us with a solution. After all open source is about +but also providing us with a solution. After all, open source is about sharing what you do and don't you want to be recognised for your genius? If you decide to take this way, once you have worked out a fix please submit it upstream. Please do read -ref:`Documentation/process/submitting-patches.rst ` though +:ref:`Documentation/process/submitting-patches.rst ` though to help your code get accepted. From 16c0890dc66d258fdeccf7b15a133f3930b19143 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 31 Oct 2017 02:46:54 +0100 Subject: [PATCH 0756/1085] irq/work: Don't reinvent the wheel but use existing llist API Use the proper llist APIs instead of open-coded variants of them. Signed-off-by: Byungchul Park Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1509414414-14987-1-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index bcf107ce0854..e2ebe8c71e8f 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -138,11 +138,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - while (llnode != NULL) { - work = llist_entry(llnode, struct irq_work, llnode); - - llnode = llist_next(llnode); - + llist_for_each_entry(work, llnode, llnode) { /* * Clear the PENDING bit, after this point the @work * can be re-used. From b0d40d2b22fe48cfcbbfb137fd198be0a1cd8a85 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Oct 2017 04:18:34 +0100 Subject: [PATCH 0757/1085] sched/isolation: Document isolcpus= boot parameter flags, mark it deprecated Document the latest updates on the isolcpus= boot option. While at it, let's also fix the details about the preferred way to isolate a set of CPUs from the scheduler general domains. Cpusets offer a much better interface to achieve that. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509419914-16179-1-git-send-email-frederic@kernel.org [ Clarified the text some more, marked the boot option deprecated. 
]
Signed-off-by: Ingo Molnar
---
 .../admin-guide/kernel-parameters.txt | 37 +++++++++++++------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6b99c8b77c59..17eb0234d5be 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1727,20 +1727,33 @@
 	isapnp=		[ISAPNP]
 			Format: <RDP>,<reset>,<pci_scan>,<verbosity>

-	isolcpus=	[KNL,SMP] Isolate CPUs from the general scheduler.
-			The argument is a cpu list, as described above.
+	isolcpus=	[KNL,SMP] Isolate a given set of CPUs from disturbance.
+			[Deprecated - use cpusets instead]
+			Format: [flag-list,]<cpu-list>
+
+			Specify one or more CPUs to isolate from disturbances
+			specified in the flag list (default: domain):
+
+			nohz
+			  Disable the tick when a single task runs.
+			domain
+			  Isolate from the general SMP balancing and scheduling
+			  algorithms. Note that performing domain isolation this way
+			  is irreversible: it's not possible to bring back a CPU to
+			  the domains once isolated through isolcpus. It's strongly
+			  advised to use cpusets instead to disable scheduler load
+			  balancing through the "cpuset.sched_load_balance" file.
+			  It offers a much more flexible interface where CPUs can
+			  move in and out of an isolated set anytime.
+
+			  You can move a process onto or off an "isolated" CPU via
+			  the CPU affinity syscalls or cpuset.
+			  <cpu number> begins at 0 and the maximum value is
+			  "number of CPUs in system - 1".
+
+			The format of <cpu-list> is described above.

-			This option can be used to specify one or more CPUs
-			to isolate from the general SMP balancing and scheduling
-			algorithms. You can move a process onto or off an
-			"isolated" CPU via the CPU affinity syscalls or cpuset.
-			<cpu number> begins at 0 and the maximum value is
-			"number of CPUs in system - 1".

-			This option is the preferred way to isolate CPUs. The
-			alternative -- manually setting the CPU mask of all
-			tasks in the system -- can cause problems and
-			suboptimal load balancer performance.

 	iucv=		[HW,NET]
From 6c3b56b1973083e2bb4e87eb90ea5368455706dc Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke
Date: Mon, 30 Oct 2017 12:43:51 -0700
Subject: [PATCH 0758/1085] x86/boot: Disable Clang warnings about GNU extensions

The kernel makes use of several GCC extensions; disable Clang warnings about that in the boot code, as we already do for the rest of the kernel.
This suppresses the following warning when building with clang: ./include/linux/cgroup-defs.h:391:16: warning: field 'cgrp' with variable sized type 'struct cgroup' not at the end of a struct or class is a GNU extension [-Wgnu-variable-sized-type-not-at-end] struct cgroup cgrp; Reported-by: Nick Desaulniers Signed-off-by: Matthias Kaehlcke Acked-by: Thomas Gleixner Cc: Douglas Anderson Cc: Guenter Roeck Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171030194351.122090-1-mka@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8a958274b54c..65a150a7f15c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -35,6 +35,7 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +KBUILD_CFLAGS += $(call cc-disable-warning, gnu) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n From c128dbfa0f879f8ce7b79054037889b0b2240728 Mon Sep 17 00:00:00 2001 From: Gayatri Kammela Date: Mon, 30 Oct 2017 18:20:29 -0700 Subject: [PATCH 0759/1085] x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU features Add a few new SSE/AVX/AVX512 instruction groups/features for enumeration in /proc/cpuinfo: AVX512_VBMI2, GFNI, VAES, VPCLMULQDQ, AVX512_VNNI, AVX512_BITALG. CPUID.(EAX=7,ECX=0):ECX[bit 6] AVX512_VBMI2 CPUID.(EAX=7,ECX=0):ECX[bit 8] GFNI CPUID.(EAX=7,ECX=0):ECX[bit 9] VAES CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG Detailed information of CPUID bits for these features can be found in the Intel Architecture Instruction Set Extensions and Future Features Programming Interface document (refer to Table 1-1. and Table 1-2.). 
A copy of this document is available at
https://bugzilla.kernel.org/show_bug.cgi?id=197239

Signed-off-by: Gayatri Kammela
Acked-by: Thomas Gleixner
Cc: Andi Kleen
Cc: Fenghua Yu
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Ravi Shankar
Cc: Ricardo Neri
Cc: Yang Zhong
Cc: bp@alien8.de
Link: http://lkml.kernel.org/r/1509412829-23380-1-git-send-email-gayatri.kammela@intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/cpufeatures.h | 6 ++++++
 arch/x86/kernel/cpu/cpuid-deps.c | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 401a70992060..b0556f882aa8 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -299,6 +299,12 @@
 #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
 #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */
 #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
 #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
 #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index c1d49842a411..c21f22d836ad 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -50,6 +50,12 @@ const static struct cpuid_dep cpuid_deps[] = {
 { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
 { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
 { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
 { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
 { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
 { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
From 707ce9eac5fc3b68f98c887dddea3911a8fc4f9f Mon Sep 17 00:00:00 2001
From: James Ban
Date: Mon, 30 Oct 2017 11:32:38 +0900
Subject: [PATCH 0760/1085] regulator: da9211: update for supporting da9223/4/5

This is an update to support the additional devices da9223/4/5. Only the device strings are added, because only the package type is different.
Signed-off-by: James Ban Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/da9211.txt | 82 +++++++++++++++++-- drivers/regulator/Kconfig | 2 +- drivers/regulator/da9211-regulator.c | 14 +++- drivers/regulator/da9211-regulator.h | 2 +- include/linux/regulator/da9211.h | 5 +- 5 files changed, 91 insertions(+), 14 deletions(-) diff --git a/Documentation/devicetree/bindings/regulator/da9211.txt b/Documentation/devicetree/bindings/regulator/da9211.txt index 0f2a6f8fcafd..27717e816e71 100644 --- a/Documentation/devicetree/bindings/regulator/da9211.txt +++ b/Documentation/devicetree/bindings/regulator/da9211.txt @@ -1,8 +1,9 @@ -* Dialog Semiconductor DA9211/DA9212/DA9213/DA9214/DA9215 Voltage Regulator +* Dialog Semiconductor DA9211/DA9212/DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 + Voltage Regulator Required properties: -- compatible: "dlg,da9211" or "dlg,da9212" or "dlg,da9213" - or "dlg,da9214" or "dlg,da9215" +- compatible: "dlg,da9211" or "dlg,da9212" or "dlg,da9213" or "dlg,da9223" + or "dlg,da9214" or "dlg,da9224" or "dlg,da9215" or "dlg,da9225" - reg: I2C slave address, usually 0x68. - interrupts: the interrupt outputs of the controller - regulators: A node that houses a sub-node for each regulator within the @@ -16,7 +17,6 @@ Optional properties: - Any optional property defined in regulator.txt Example 1) DA9211 - pmic: da9211@68 { compatible = "dlg,da9211"; reg = <0x68>; @@ -35,7 +35,6 @@ Example 1) DA9211 }; Example 2) DA9212 - pmic: da9212@68 { compatible = "dlg,da9212"; reg = <0x68>; @@ -79,7 +78,25 @@ Example 3) DA9213 }; }; -Example 4) DA9214 +Example 4) DA9223 + pmic: da9223@68 { + compatible = "dlg,da9223"; + reg = <0x68>; + interrupts = <3 27>; + + regulators { + BUCKA { + regulator-name = "VBUCKA"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <3000000>; + regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 27 0>; + }; + }; + }; + +Example 5) DA9214 pmic: da9214@68 { compatible = "dlg,da9214"; reg = <0x68>; @@ -105,7 +122,33 @@ Example 4) DA9214 }; }; -Example 5) DA9215 +Example 6) DA9224 + pmic: da9224@68 { + compatible = "dlg,da9224"; + reg = <0x68>; + interrupts = <3 27>; + + regulators { + BUCKA { + regulator-name = "VBUCKA"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <3000000>; + regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 27 0>; + }; + BUCKB { + regulator-name = "VBUCKB"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <3000000>; + regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 17 0>; + }; + }; + }; + +Example 7) DA9215 pmic: da9215@68 { compatible = "dlg,da9215"; reg = <0x68>; @@ -131,3 +174,28 @@ Example 5) DA9215 }; }; +Example 8) DA9225 + pmic: da9225@68 { + compatible = "dlg,da9225"; + reg = <0x68>; + interrupts = <3 27>; + + regulators { + BUCKA { + regulator-name = "VBUCKA"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <4000000>; + regulator-max-microamp = <7000000>; + enable-gpios = <&gpio 27 0>; + }; + BUCKB { + regulator-name = "VBUCKB"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <4000000>; + regulator-max-microamp = <7000000>; + enable-gpios = <&gpio 17 0>; + }; + }; + }; diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 0fd6195601ba..96cd55f9e3c5 100644 --- a/drivers/regulator/Kconfig +++ 
b/drivers/regulator/Kconfig @@ -244,7 +244,7 @@ config REGULATOR_DA9210 interface. config REGULATOR_DA9211 - tristate "Dialog Semiconductor DA9211/DA9212/DA9213/DA9214/DA9215 regulator" + tristate "Dialog Semiconductor DA9211/DA9212/DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 regulator" depends on I2C select REGMAP_I2C help diff --git a/drivers/regulator/da9211-regulator.c b/drivers/regulator/da9211-regulator.c index aa47280efd32..9b8f47617724 100644 --- a/drivers/regulator/da9211-regulator.c +++ b/drivers/regulator/da9211-regulator.c @@ -1,6 +1,6 @@ /* * da9211-regulator.c - Regulator device driver for DA9211/DA9212 - * /DA9213/DA9214/DA9215 + * /DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 * Copyright (C) 2015 Dialog Semiconductor Ltd. * * This library is free software; you can redistribute it and/or @@ -496,8 +496,11 @@ static const struct i2c_device_id da9211_i2c_id[] = { {"da9211", DA9211}, {"da9212", DA9212}, {"da9213", DA9213}, + {"da9223", DA9223}, {"da9214", DA9214}, + {"da9224", DA9224}, {"da9215", DA9215}, + {"da9225", DA9225}, {}, }; MODULE_DEVICE_TABLE(i2c, da9211_i2c_id); @@ -507,8 +510,11 @@ static const struct of_device_id da9211_dt_ids[] = { { .compatible = "dlg,da9211", .data = &da9211_i2c_id[0] }, { .compatible = "dlg,da9212", .data = &da9211_i2c_id[1] }, { .compatible = "dlg,da9213", .data = &da9211_i2c_id[2] }, - { .compatible = "dlg,da9214", .data = &da9211_i2c_id[3] }, - { .compatible = "dlg,da9215", .data = &da9211_i2c_id[4] }, + { .compatible = "dlg,da9223", .data = &da9211_i2c_id[3] }, + { .compatible = "dlg,da9214", .data = &da9211_i2c_id[4] }, + { .compatible = "dlg,da9224", .data = &da9211_i2c_id[5] }, + { .compatible = "dlg,da9215", .data = &da9211_i2c_id[6] }, + { .compatible = "dlg,da9225", .data = &da9211_i2c_id[7] }, {}, }; MODULE_DEVICE_TABLE(of, da9211_dt_ids); @@ -526,5 +532,5 @@ static struct i2c_driver da9211_regulator_driver = { module_i2c_driver(da9211_regulator_driver); MODULE_AUTHOR("James Ban "); -MODULE_DESCRIPTION("DA9211/DA9212/DA9213/DA9214/DA9215 regulator driver"); +MODULE_DESCRIPTION("DA9211/DA9212/DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 regulator driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/regulator/da9211-regulator.h b/drivers/regulator/da9211-regulator.h index b841bbf330cc..2cb32aab4f82 100644 --- a/drivers/regulator/da9211-regulator.h +++ b/drivers/regulator/da9211-regulator.h @@ -1,6 +1,6 @@ /* * da9211-regulator.h - Regulator definitions for DA9211/DA9212 - * /DA9213/DA9214/DA9215 + * /DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 * Copyright (C) 2015 Dialog Semiconductor Ltd. * * This program is free software; you can redistribute it and/or diff --git a/include/linux/regulator/da9211.h b/include/linux/regulator/da9211.h index 80cb40b7c88d..f2fd2d3bf58f 100644 --- a/include/linux/regulator/da9211.h +++ b/include/linux/regulator/da9211.h @@ -1,6 +1,6 @@ /* * da9211.h - Regulator device driver for DA9211/DA9212 - * /DA9213/DA9214/DA9215 + * /DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 * Copyright (C) 2015 Dialog Semiconductor Ltd. 
* * This program is free software; you can redistribute it and/or @@ -25,8 +25,11 @@ enum da9211_chip_id { DA9211, DA9212, DA9213, + DA9223, DA9214, + DA9224, DA9215, + DA9225, }; struct da9211_pdata { From 67f7b2781fafcc0f52464880154b320fea1ae982 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 30 Oct 2017 11:35:25 +0100 Subject: [PATCH 0761/1085] spi: fix use-after-free at controller deregistration The controller is typically freed as part of device_unregister() so store the bus id before deregistration to avoid use-after-free when the id is later released. Fixes: 9b61e302210e ("spi: Pick spi bus number from Linux idr or spi alias") Signed-off-by: Johan Hovold Signed-off-by: Mark Brown Cc: stable --- drivers/spi/spi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 6e65524cbfd9..759c9725495a 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2240,11 +2240,12 @@ static int __unregister(struct device *dev, void *null) void spi_unregister_controller(struct spi_controller *ctlr) { struct spi_controller *found; + int id = ctlr->bus_num; int dummy; /* First make sure that this controller was ever added */ mutex_lock(&board_lock); - found = idr_find(&spi_master_idr, ctlr->bus_num); + found = idr_find(&spi_master_idr, id); mutex_unlock(&board_lock); if (found != ctlr) { dev_dbg(&ctlr->dev, @@ -2264,7 +2265,7 @@ void spi_unregister_controller(struct spi_controller *ctlr) device_unregister(&ctlr->dev); /* free bus id */ mutex_lock(&board_lock); - idr_remove(&spi_master_idr, ctlr->bus_num); + idr_remove(&spi_master_idr, id); mutex_unlock(&board_lock); } EXPORT_SYMBOL_GPL(spi_unregister_controller); From 68b892f1fdc493d7cd4e4067596879cd097c1f62 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 30 Oct 2017 11:35:26 +0100 Subject: [PATCH 0762/1085] spi: document odd controller reference handling Document the fact that a reference to the controller is dropped as part of deregistration. This is an odd pattern as the reference is typically taken in __spi_alloc_controller() rather than spi_register_controller(). Most controller drivers gets it right these days and notably the device-managed interface relies on this behaviour. Signed-off-by: Johan Hovold Signed-off-by: Mark Brown --- drivers/spi/spi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 6e65524cbfd9..5673cca1d1d0 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2195,7 +2195,7 @@ static void devm_spi_unregister(struct device *dev, void *res) * Context: can sleep * * Register a SPI device as with spi_register_controller() which will - * automatically be unregister + * automatically be unregistered and freed. * * Return: zero on success, else a negative error code. */ @@ -2236,6 +2236,8 @@ static int __unregister(struct device *dev, void *null) * only ones directly touching chip registers. * * This must be called from context that can sleep. + * + * Note that this function also drops a reference to the controller. */ void spi_unregister_controller(struct spi_controller *ctlr) { From 4d5e0689dc9d5640ad46cdfbe1896b74d8df1661 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 30 Oct 2017 11:35:27 +0100 Subject: [PATCH 0763/1085] spi: spi-axi: fix potential use-after-free after deregistration Take an extra reference to the controller before deregistering it to prevent use-after-free in the interrupt handler in case an interrupt fires before the line is disabled. 
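[ Illustration, not from the original changelog: the shape of the fix is the usual reference pattern around teardown, roughly:

	master = spi_master_get(platform_get_drvdata(pdev));	/* pin the controller */
	/* ... deregistration, then free_irq(irq, master) ... */
	spi_master_put(master);					/* safe to drop now */

  so the controller structure stays alive for as long as the interrupt handler may still run. ]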
Fixes: b1353d1c1d45 ("spi: Add Analog Devices AXI SPI Engine controller support") Acked-by: Lars-Peter Clausen Signed-off-by: Johan Hovold Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 6ab4c7700228..68cfc351b47f 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -553,7 +553,7 @@ err_put_master: static int spi_engine_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); + struct spi_master *master = spi_master_get(platform_get_drvdata(pdev)); struct spi_engine *spi_engine = spi_master_get_devdata(master); int irq = platform_get_irq(pdev, 0); @@ -561,6 +561,8 @@ static int spi_engine_remove(struct platform_device *pdev) free_irq(irq, master); + spi_master_put(master); + writel_relaxed(0xff, spi_engine->base + SPI_ENGINE_REG_INT_PENDING); writel_relaxed(0x00, spi_engine->base + SPI_ENGINE_REG_INT_ENABLE); writel_relaxed(0x01, spi_engine->base + SPI_ENGINE_REG_RESET); From 974488e4ce1ed0b39f2c711c13f523c5912128a1 Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Thu, 26 Oct 2017 18:08:39 -0700 Subject: [PATCH 0764/1085] spi: imx: Fix failure path leak on GPIO request error If the code that requests any chip select GPIOs fails, the cleanup of spi_bitbang_start() by calling spi_bitbang_stop() is not done. Fix this by moving spi_bitbang_start() to after the code that requets GPIOs. The GPIOs are dev managed and don't need explicit cleanup. Since spi_bitbang_start() is now the last operation, it doesn't need to be cleaned up in the failure path. CC: Shawn Guo CC: Sascha Hauer CC: Fabio Estevam CC: Mark Brown Reviewed-by: Oleksij Rempel Signed-off-by: Trent Piepho Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index fe35aaea323b..5ddd32ba2521 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1613,11 +1613,6 @@ static int spi_imx_probe(struct platform_device *pdev) spi_imx->devtype_data->intctrl(spi_imx, 0); master->dev.of_node = pdev->dev.of_node; - ret = spi_bitbang_start(&spi_imx->bitbang); - if (ret) { - dev_err(&pdev->dev, "bitbang start failed with %d\n", ret); - goto out_clk_put; - } if (!spi_imx->slave_mode) { if (!master->cs_gpios) { @@ -1641,6 +1636,12 @@ static int spi_imx_probe(struct platform_device *pdev) } } + ret = spi_bitbang_start(&spi_imx->bitbang); + if (ret) { + dev_err(&pdev->dev, "bitbang start failed with %d\n", ret); + goto out_clk_put; + } + dev_info(&pdev->dev, "probed\n"); clk_disable(spi_imx->clk_ipg); From d4d1fc61eb38ff8e5af657e2d2f2290859a277f2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 31 Oct 2017 10:43:39 -0700 Subject: [PATCH 0765/1085] ia64: Update fsyscall gettime to use modern vsyscall_update John Stultz provided the outline for this patch back in May 2014 here: http://patches.linaro.org/patch/30501/ but I let this sit on the shelf for too long and in the intervening years almost every field in "struct timekeeper" was changed. So this is almost completely different from his original. Though the key change in arch/ia64/kernel/fsys.S remains the same. 
The core logic change with the updated vsyscall method is that we preserve the base nanosecond value in shifted nanoseconds, which allows us to avoid truncating and rounding up to the next nanosecond every tick to avoid inconsistencies. Thus the logic moved from nsec = ((cycle_delta * mult)>>shift) + base_nsec; to nsec = ((cycle_delta * mult) + base_snsec) >> shift; Cc: John Stultz Cc: linux-ia64@vger.kernel.org Signed-off-by: Tony Luck --- arch/ia64/Kconfig | 2 +- arch/ia64/kernel/asm-offsets.c | 2 ++ arch/ia64/kernel/fsys.S | 8 +++--- arch/ia64/kernel/fsyscall_gtod_data.h | 10 +++++-- arch/ia64/kernel/time.c | 38 ++++++++++++++------------- 5 files changed, 35 insertions(+), 25 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 6a15083cc366..4d032e7f1637 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -46,7 +46,7 @@ config IA64 select ARCH_TASK_STRUCT_ALLOCATOR select ARCH_THREAD_STACK_ALLOCATOR select ARCH_CLOCKSOURCE_DATA - select GENERIC_TIME_VSYSCALL_OLD + select GENERIC_TIME_VSYSCALL select SYSCTL_ARCH_UNALIGN_NO_WARN select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index 798bdb209d00..c5eecf3e76fc 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -211,6 +211,8 @@ void foo(void) BLANK(); DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); + DEFINE(IA64_TIME_SN_SPEC_SNSEC_OFFSET, + offsetof (struct time_sn_spec, snsec)); DEFINE(CLONE_SETTLS_BIT, 19); #if CLONE_SETTLS != (1<<19) diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index edbf7af95849..0d3c7abb31a8 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -235,9 +235,9 @@ ENTRY(fsys_gettimeofday) MOV_FROM_ITC(p8, p6, r2, r10) // CPU_TIMER. 36 clocks latency!!! (p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. (p13) ld8 r25 = [r19] // get itc_lastcycle value - ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec + ld8 r9 = [r22],IA64_TIME_SN_SPEC_SNSEC_OFFSET // sec ;; - ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec + ld8 r8 = [r22],-IA64_TIME_SN_SPEC_SNSEC_OFFSET // snsec (p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) ;; (p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared @@ -265,9 +265,9 @@ EX(.fail_efault, probe.w.fault r31, 3) mf ;; ld4 r10 = [r20] // gtod_lock.sequence - shr.u r2 = r2,r23 // shift by factor - ;; add r8 = r8,r2 // Add xtime.nsecs + ;; + shr.u r8 = r8,r23 // shift by factor cmp4.ne p7,p0 = r28,r10 (p7) br.cond.dpnt.few .time_redo // sequence number changed, redo // End critical section. 
diff --git a/arch/ia64/kernel/fsyscall_gtod_data.h b/arch/ia64/kernel/fsyscall_gtod_data.h index dcc514917731..28363bfc9f57 100644 --- a/arch/ia64/kernel/fsyscall_gtod_data.h +++ b/arch/ia64/kernel/fsyscall_gtod_data.h @@ -5,10 +5,16 @@ * fsyscall gettimeofday data */ +/* like timespec, but includes "shifted nanoseconds" */ +struct time_sn_spec { + u64 sec; + u64 snsec; +}; + struct fsyscall_gtod_data_t { seqcount_t seq; - struct timespec wall_time; - struct timespec monotonic_time; + struct time_sn_spec wall_time; + struct time_sn_spec monotonic_time; u64 clk_mask; u32 clk_mult; u32 clk_shift; diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index aa7be020a904..c6ecb97151a2 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -430,30 +430,32 @@ void update_vsyscall_tz(void) { } -void update_vsyscall_old(struct timespec *wall, struct timespec *wtm, - struct clocksource *c, u32 mult, u64 cycle_last) +void update_vsyscall(struct timekeeper *tk) { write_seqcount_begin(&fsyscall_gtod_data.seq); - /* copy fsyscall clock data */ - fsyscall_gtod_data.clk_mask = c->mask; - fsyscall_gtod_data.clk_mult = mult; - fsyscall_gtod_data.clk_shift = c->shift; - fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio; - fsyscall_gtod_data.clk_cycle_last = cycle_last; + /* copy vsyscall data */ + fsyscall_gtod_data.clk_mask = tk->tkr_mono.mask; + fsyscall_gtod_data.clk_mult = tk->tkr_mono.mult; + fsyscall_gtod_data.clk_shift = tk->tkr_mono.shift; + fsyscall_gtod_data.clk_fsys_mmio = tk->tkr_mono.clock->archdata.fsys_mmio; + fsyscall_gtod_data.clk_cycle_last = tk->tkr_mono.cycle_last; - /* copy kernel time structures */ - fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec; - fsyscall_gtod_data.wall_time.tv_nsec = wall->tv_nsec; - fsyscall_gtod_data.monotonic_time.tv_sec = wtm->tv_sec - + wall->tv_sec; - fsyscall_gtod_data.monotonic_time.tv_nsec = wtm->tv_nsec - + wall->tv_nsec; + fsyscall_gtod_data.wall_time.sec = tk->xtime_sec; + fsyscall_gtod_data.wall_time.snsec = tk->tkr_mono.xtime_nsec; + + fsyscall_gtod_data.monotonic_time.sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + fsyscall_gtod_data.monotonic_time.snsec = tk->tkr_mono.xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->tkr_mono.shift); /* normalize */ - while (fsyscall_gtod_data.monotonic_time.tv_nsec >= NSEC_PER_SEC) { - fsyscall_gtod_data.monotonic_time.tv_nsec -= NSEC_PER_SEC; - fsyscall_gtod_data.monotonic_time.tv_sec++; + while (fsyscall_gtod_data.monotonic_time.snsec >= + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + fsyscall_gtod_data.monotonic_time.snsec -= + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + fsyscall_gtod_data.monotonic_time.sec++; } write_seqcount_end(&fsyscall_gtod_data.seq); From d6332a176b869df1839abb26c8f80026a66d21d6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Oct 2017 14:15:33 +0900 Subject: [PATCH 0766/1085] perf callchain: Fix double mapping al->addr for children without self period Milian Wolff found a problem he described in [1] and that for him would get fixed: "Note how most of the large offset values are now gone. Most notably, we get proper srcline resolution for the random.h and complex headers." Then Namhyung found the root cause: "I looked into it and found a bug handling cumulative (children) entries. For children entries that have no self period, the al->addr (so he->ip) ends up having an doubly-mapped address. 
It seems to be there from the beginning but only affects entries that have no srclines - finding srcline itself is done using a different address but it will show the invalid address if no srcline was found.

I think we should fix the commit c7405d85d7a3 ("perf tools: Update cpumode for each cumulative entry")."

[1] https://lkml.kernel.org/r/20171018185350.14893-7-milian.wolff@kdab.com

Reported-by: Milian Wolff
Signed-off-by: Namhyung Kim
Tested-by: Milian Wolff
Cc: Jin Yao
Cc: Jiri Olsa
Cc: kernel-team@lge.com
Fixes: c7405d85d7a3 ("perf tools: Update cpumode for each cumulative entry")
Link: https://lkml.kernel.org/r/20171020051533.GA2746@sejong
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/callchain.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 3a3916934a92..837012147c7b 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1091,10 +1091,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *
 al->map = node->map;
 al->sym = node->sym;
 al->srcline = node->srcline;
- if (node->map)
- al->addr = node->map->map_ip(node->map, node->ip);
- else
- al->addr = node->ip;
+ al->addr = node->ip;

 if (al->sym == NULL) {
 if (hide_unresolved)
From 735e215e95e53b857000aaabe1b4707878b10f43 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 31 Oct 2017 10:04:11 -0300
Subject: [PATCH 0767/1085] tools include uapi: Grab a copy of linux/kcmp.h

We will use it to generate tables for beautifying kcmp's 'type' arg.

Cc: Adrian Hunter
Cc: Andrey Vagin
Cc: Cyrill Gorcunov
Cc: David Ahern
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Wang Nan
Link: http://lkml.kernel.org/n/tip-r35zr79invmpinfe1zu57cas@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/include/uapi/linux/kcmp.h | 27 +++++++++++++++++++++++++++
 tools/perf/check-headers.sh | 1 +
 2 files changed, 28 insertions(+)
 create mode 100644 tools/include/uapi/linux/kcmp.h

diff --git a/tools/include/uapi/linux/kcmp.h b/tools/include/uapi/linux/kcmp.h
new file mode 100644
index 000000000000..481e103da78e
--- /dev/null
+++ b/tools/include/uapi/linux/kcmp.h
@@ -0,0 +1,27 @@
+#ifndef _UAPI_LINUX_KCMP_H
+#define _UAPI_LINUX_KCMP_H
+
+#include <linux/types.h>
+
+/* Comparison type */
+enum kcmp_type {
+ KCMP_FILE,
+ KCMP_VM,
+ KCMP_FILES,
+ KCMP_FS,
+ KCMP_SIGHAND,
+ KCMP_IO,
+ KCMP_SYSVSEM,
+ KCMP_EPOLL_TFD,
+
+ KCMP_TYPES,
+};
+
+/* Slot for KCMP_EPOLL_TFD */
+struct kcmp_epoll_slot {
+ __u32 efd; /* epoll file descriptor */
+ __u32 tfd; /* target file number */
+ __u32 toff; /* target offset within same numbered sequence */
+};
+
+#endif /* _UAPI_LINUX_KCMP_H */
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 8d8b37198666..a3a041b0d35e 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -4,6 +4,7 @@ HEADERS='
 include/uapi/drm/drm.h
 include/uapi/drm/i915_drm.h
 include/uapi/linux/fcntl.h
+include/uapi/linux/kcmp.h
 include/uapi/linux/kvm.h
 include/uapi/linux/perf_event.h
 include/uapi/linux/prctl.h
From 0a2f7540abc08e32f89ff273506eeeeb6910794f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 31 Oct 2017 11:30:09 -0300
Subject: [PATCH 0768/1085] perf trace beauty: Implement pid_fd beautifier

One that, given a pid and an fd, will try to get the path for that fd.

Will be used in the upcoming kcmp's KCMP_FILE beautifier.
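[ Illustration, not from the original changelog: the rendered form is the fd number followed by the resolved path in angle brackets, in the style of the existing fd beautifier, e.g. (made-up path):

	271</run/systemd/private>
]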
Cc: Adrian Hunter Cc: Andrey Vagin Cc: Cyrill Gorcunov Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-7ketygp2dvs9h13wuakfncws@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 17 +++++++++++++++++ tools/perf/trace/beauty/beauty.h | 3 +++ 2 files changed, 20 insertions(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index c373f9a3e4a9..0c1461416ef1 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -988,6 +988,23 @@ size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg) return printed; } +size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size) +{ + size_t printed = scnprintf(bf, size, "%d", fd); + struct thread *thread = machine__find_thread(trace->host, pid, pid); + + if (thread) { + const char *path = thread__fd_path(thread, fd, trace); + + if (path) + printed += scnprintf(bf + printed, size - printed, "<%s>", path); + + thread__put(thread); + } + + return printed; +} + static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg) { diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index b29f94ef5d6a..365bb6fecc1c 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -3,6 +3,7 @@ #include #include +#include struct strarray { int offset; @@ -26,6 +27,8 @@ size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const cha struct trace; struct thread; +size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size); + /** * @val: value of syscall argument being formatted * @args: All the args, use syscall_args__val(arg, nth) to access one From 1de3038d0082a1c2edd7a7b1b3381e38f42af0e7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 31 Oct 2017 11:32:23 -0300 Subject: [PATCH 0769/1085] perf trace beauty kcmp: Beautify arguments For some unknown reason there is no entry in tracefs's syscalls for kcmp, i.e. no tracefs/events/syscalls/sys_{enter,exit}_kcmp, so we need to provide a data dictionary for the fields. To beautify the 'type' argument we automatically generate a strarray from tools/include/uapi/kcmp.h, the idx1 and idx2 args, nowadays used only if type == KCMP_FILE, are masked for all the other types and a lookup is made for the thread and fd to show the path, if possible, getting it from the probe:vfs_getname if in place or from procfs, races allowing. 
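[ For reference, per include/uapi/linux/kcmp.h and the kcmp(2) man page, the syscall being decorated is:

	int kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2);

  idx1/idx2 only carry meaning for the file comparison types, which is why the beautifier masks them for everything else. ]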
A system wide strace like tracing session, with callchains shows just one user so far in this fedora 25 machine: # perf trace --max-stack 5 -e kcmp 1502914.400 ( 0.001 ms): systemd/1 kcmp(pid1: 1 (systemd), pid2: 1 (systemd), type: FILE, idx1: 271, idx2: 25) = -1 ENOSYS Function not implemented syscall (/usr/lib64/libc-2.25.so) same_fd (/usr/lib/systemd/libsystemd-shared-233.so) service_add_fd_store (/usr/lib/systemd/systemd) service_notify_message.lto_priv.127 (/usr/lib/systemd/systemd) 1502914.407 ( 0.001 ms): systemd/1 kcmp(pid1: 1 (systemd), pid2: 1 (systemd), type: FILE, idx1: 270, idx2: 25) = -1 ENOSYS Function not implemented syscall (/usr/lib64/libc-2.25.so) same_fd (/usr/lib/systemd/libsystemd-shared-233.so) service_add_fd_store (/usr/lib/systemd/systemd) service_notify_message.lto_priv.127 (/usr/lib/systemd/systemd) The backtraces seem to agree this is really kcmp(), but this system doesn't have the sys_kcmp(), bummer: # uname -a Linux jouet 4.14.0-rc3+ #1 SMP Fri Oct 13 12:21:12 -03 2017 x86_64 x86_64 x86_64 GNU/Linux # grep kcmp /proc/kallsyms ffffffffb60b8890 W sys_kcmp $ grep CONFIG_CHECKPOINT_RESTORE ../build/v4.14.0-rc3+/.config # CONFIG_CHECKPOINT_RESTORE is not set $ So systemd uses it, good fedora kernel config has it: $ grep CONFIG_CHECKPOINT_RESTORE /boot/config-4.13.4-200.fc26.x86_64 CONFIG_CHECKPOINT_RESTORE=y [acme@jouet linux]$ /me goes to rebuild a kernel... Cc: Adrian Hunter Cc: Andrey Vagin Cc: Cyrill Gorcunov Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-gz5fca968viw8m7hryjqvrln@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 9 ++++++ tools/perf/builtin-trace.c | 6 ++++ tools/perf/trace/beauty/Build | 1 + tools/perf/trace/beauty/beauty.h | 6 ++++ tools/perf/trace/beauty/kcmp.c | 44 ++++++++++++++++++++++++++++ tools/perf/trace/beauty/kcmp_type.sh | 10 +++++++ 6 files changed, 76 insertions(+) create mode 100644 tools/perf/trace/beauty/kcmp.c create mode 100755 tools/perf/trace/beauty/kcmp_type.sh diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index c872ca607b39..68cf1360a3f3 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -420,6 +420,13 @@ sndrv_pcm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh $(sndrv_pcm_ioctl_array): $(sndrv_pcm_hdr_dir)/asound.h $(sndrv_pcm_ioctl_tbl) $(Q)$(SHELL) '$(sndrv_pcm_ioctl_tbl)' $(sndrv_pcm_hdr_dir) > $@ +kcmp_type_array := $(beauty_outdir)/kcmp_type_array.c +kcmp_hdr_dir := $(srctree)/tools/include/uapi/linux/ +kcmp_type_tbl := $(srctree)/tools/perf/trace/beauty/kcmp_type.sh + +$(kcmp_type_array): $(kcmp_hdr_dir)/kcmp.h $(kcmp_type_tbl) + $(Q)$(SHELL) '$(kcmp_type_tbl)' $(kcmp_hdr_dir) > $@ + kvm_ioctl_array := $(beauty_ioctl_outdir)/kvm_ioctl_array.c kvm_hdr_dir := $(srctree)/tools/include/uapi/linux kvm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/kvm_ioctl.sh @@ -553,6 +560,7 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioc $(pkey_alloc_access_rights_array) \ $(sndrv_pcm_ioctl_array) \ $(sndrv_ctl_ioctl_array) \ + $(kcmp_type_array) \ $(kvm_ioctl_array) \ $(vhost_virtio_ioctl_array) \ $(madvise_behavior_array) \ @@ -836,6 +844,7 @@ clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea $(OUTPUT)$(sndrv_ctl_ioctl_array) \ $(OUTPUT)$(sndrv_pcm_ioctl_array) \ $(OUTPUT)$(kvm_ioctl_array) \ + $(OUTPUT)$(kcmp_type_array) \ $(OUTPUT)$(vhost_virtio_ioctl_array) \ $(OUTPUT)$(perf_ioctl_array) \ $(OUTPUT)$(prctl_option_array) 
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0c1461416ef1..505b871fdc82 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -633,6 +633,12 @@ static struct syscall_fmt { #else [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, }, #endif + { .name = "kcmp", .nr_args = 5, + .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, }, + [1] = { .name = "pid2", .scnprintf = SCA_PID, }, + [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, }, + [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, }, + [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, }, { .name = "keyctl", .arg = { [0] = STRARRAY(option, keyctl_options), }, }, { .name = "kill", diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build index 2f68b76ec39b..066bbf0f4a74 100644 --- a/tools/perf/trace/beauty/Build +++ b/tools/perf/trace/beauty/Build @@ -3,6 +3,7 @@ libperf-y += fcntl.o ifeq ($(SRCARCH),$(filter $(SRCARCH),x86)) libperf-y += ioctl.o endif +libperf-y += kcmp.o libperf-y += pkey_alloc.o libperf-y += prctl.o libperf-y += statx.o diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 365bb6fecc1c..3f067bdab84f 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -81,6 +81,12 @@ size_t syscall_arg__scnprintf_fcntl_arg(char *bf, size_t size, struct syscall_ar size_t syscall_arg__scnprintf_ioctl_cmd(char *bf, size_t size, struct syscall_arg *arg); #define SCA_IOCTL_CMD syscall_arg__scnprintf_ioctl_cmd +size_t syscall_arg__scnprintf_kcmp_type(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_KCMP_TYPE syscall_arg__scnprintf_kcmp_type + +size_t syscall_arg__scnprintf_kcmp_idx(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_KCMP_IDX syscall_arg__scnprintf_kcmp_idx + size_t syscall_arg__scnprintf_pkey_alloc_access_rights(char *bf, size_t size, struct syscall_arg *arg); #define SCA_PKEY_ALLOC_ACCESS_RIGHTS syscall_arg__scnprintf_pkey_alloc_access_rights diff --git a/tools/perf/trace/beauty/kcmp.c b/tools/perf/trace/beauty/kcmp.c new file mode 100644 index 000000000000..f62040eb9d5c --- /dev/null +++ b/tools/perf/trace/beauty/kcmp.c @@ -0,0 +1,44 @@ +/* + * trace/beauty/kcmp.c + * + * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo + * + * Released under the GPL v2. (and only v2, not any later version) + */ + +#include "trace/beauty/beauty.h" +#include +#include +#include +#include + +#include "trace/beauty/generated/kcmp_type_array.c" + +size_t syscall_arg__scnprintf_kcmp_idx(char *bf, size_t size, struct syscall_arg *arg) +{ + unsigned long fd = arg->val; + int type = syscall_arg__val(arg, 2); + pid_t pid; + + if (type != KCMP_FILE) + return syscall_arg__scnprintf_long(bf, size, arg); + + pid = syscall_arg__val(arg, arg->idx == 3 ? 
0 : 1); /* idx1 -> pid1, idx2 -> pid2 */ + return pid__scnprintf_fd(arg->trace, pid, fd, bf, size); +} + +static size_t kcmp__scnprintf_type(int type, char *bf, size_t size) +{ + static DEFINE_STRARRAY(kcmp_types); + return strarray__scnprintf(&strarray__kcmp_types, bf, size, "%d", type); +} + +size_t syscall_arg__scnprintf_kcmp_type(char *bf, size_t size, struct syscall_arg *arg) +{ + unsigned long type = arg->val; + + if (type != KCMP_FILE) + arg->mask |= (1 << 3) | (1 << 4); /* Ignore idx1 and idx2 */ + + return kcmp__scnprintf_type(type, bf, size); +} diff --git a/tools/perf/trace/beauty/kcmp_type.sh b/tools/perf/trace/beauty/kcmp_type.sh new file mode 100755 index 000000000000..40d063b8c082 --- /dev/null +++ b/tools/perf/trace/beauty/kcmp_type.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +header_dir=$1 + +printf "static const char *kcmp_types[] = {\n" +regex='^[[:space:]]+(KCMP_(\w+)),' +egrep $regex ${header_dir}/kcmp.h | grep -v KCMP_TYPES, | \ + sed -r "s/$regex/\1 \2/g" | \ + xargs printf "\t[%s]\t= \"%s\",\n" +printf "};\n" From a9903f04e0a4ea522d959c2f287cdf0ab029e324 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 30 Oct 2017 11:08:16 -0700 Subject: [PATCH 0770/1085] sched/sysctl: Fix attributes of some extern declarations The definition of sysctl_sched_migration_cost, sysctl_sched_nr_migrate and sysctl_sched_time_avg includes the attribute const_debug. This attribute is not part of the extern declaration of these variables in include/linux/sched/sysctl.h, while it is in kernel/sched/sched.h, and as a result Clang generates warnings like this: kernel/sched/sched.h:1618:33: warning: section attribute is specified on redeclared variable [-Wsection] extern const_debug unsigned int sysctl_sched_time_avg; ^ ./include/linux/sched/sysctl.h:42:21: note: previous declaration is here extern unsigned int sysctl_sched_time_avg; The header only declares the variables when CONFIG_SCHED_DEBUG is defined, therefore it is not necessary to duplicate the definition of const_debug. Instead we can use the attribute __read_mostly, which is the expansion of const_debug when CONFIG_SCHED_DEBUG=y is set. 
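[ For reference, the definition relied upon lives in kernel/sched/sched.h and reads roughly like this (quoted from memory, not part of this patch):

	#ifdef CONFIG_SCHED_DEBUG
	# define const_debug	__read_mostly
	#else
	# define const_debug	const
	#endif

  so with CONFIG_SCHED_DEBUG=y both declaration sites now carry the same __read_mostly attribute and the -Wsection warning goes away. ]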
Signed-off-by: Matthias Kaehlcke Reviewed-by: Nick Desaulniers Cc: Douglas Anderson Cc: Guenter Roeck Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shile Zhang Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171030180816.170850-1-mka@chromium.org Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0f5ecd4d298e..d34c823f3d36 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -37,9 +37,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; #ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_migration_cost; -extern unsigned int sysctl_sched_nr_migrate; -extern unsigned int sysctl_sched_time_avg; +extern __read_mostly unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_time_avg; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, From 10d94ff4d558b96bfc4f55bb0051ae4d938246fe Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 1 Nov 2017 10:14:51 +0600 Subject: [PATCH 0771/1085] irq/core: Fix boot crash when the irqaffinity= boot parameter is passed on CPUMASK_OFFSTACK=y kernels(v1) When the irqaffinity= kernel parameter is passed in a CPUMASK_OFFSTACK=y kernel, it fails to boot, because zalloc_cpumask_var() cannot be used before initializing the slab allocator to allocate a cpumask. So, use alloc_bootmem_cpumask_var() instead. Also do some cleanups while at it: in init_irq_default_affinity() remove an #ifdef via using cpumask_available(). Signed-off-by: Rakib Mullick Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171026045800.27087-1-rakib.mullick@gmail.com Link: http://lkml.kernel.org/r/20171101041451.12581-1-rakib.mullick@gmail.com Signed-off-by: Ingo Molnar --- kernel/irq/irqdesc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 982a3576fb01..f2edcf85780d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) static int __init irq_affinity_setup(char *str) { - zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + alloc_bootmem_cpumask_var(&irq_default_affinity); cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. We don't want to end up with @@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup); static void __init init_irq_default_affinity(void) { -#ifdef CONFIG_CPUMASK_OFFSTACK - if (!irq_default_affinity) + if (!cpumask_available(irq_default_affinity)) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); -#endif if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); } From 8698b9364710e7bac84b3af07dd410e39c8c2e08 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 1 Nov 2017 10:11:55 +0800 Subject: [PATCH 0772/1085] regmap: Add hardware spinlock support On some platforms, when reading or writing some special registers through regmap, we should acquire one hardware spinlock to synchronize between the multiple subsystems. Thus this patch adds the hardware spinlock support for regmap. 
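[ Illustration, not from the original changelog: a driver opts in through the two new regmap_config fields, along these lines (hypothetical values):

	static const struct regmap_config foo_syscon_config = {
		.reg_bits	= 32,
		.val_bits	= 32,
		.hwlock_id	= 1,			/* hypothetical hwspinlock index */
		.hwlock_mode	= HWLOCK_IRQSTATE,	/* lock with IRQ state saved */
	};

  When hwlock_id is set, __regmap_init() requests that hwspinlock and routes map->lock/map->unlock through the hwspin_lock_timeout*() helpers added below, instead of the usual mutex or spinlock. ]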
Signed-off-by: Baolin Wang Signed-off-by: Mark Brown --- drivers/base/regmap/internal.h | 2 + drivers/base/regmap/regmap.c | 101 +++++++++++++++++++++++++++------ include/linux/regmap.h | 6 ++ 3 files changed, 93 insertions(+), 16 deletions(-) diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 2a4435d76028..8641183cac2f 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -157,6 +157,8 @@ struct regmap { struct rb_root range_tree; void *selector_work_buf; /* Scratch buffer used for selector */ + + struct hwspinlock *hwlock; }; struct regcache_ops { diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index b9a779a4a739..999e981a174a 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -413,6 +414,49 @@ static unsigned int regmap_parse_64_native(const void *buf) } #endif +static void regmap_lock_hwlock(void *__map) +{ + struct regmap *map = __map; + + hwspin_lock_timeout(map->hwlock, UINT_MAX); +} + +static void regmap_lock_hwlock_irq(void *__map) +{ + struct regmap *map = __map; + + hwspin_lock_timeout_irq(map->hwlock, UINT_MAX); +} + +static void regmap_lock_hwlock_irqsave(void *__map) +{ + struct regmap *map = __map; + + hwspin_lock_timeout_irqsave(map->hwlock, UINT_MAX, + &map->spinlock_flags); +} + +static void regmap_unlock_hwlock(void *__map) +{ + struct regmap *map = __map; + + hwspin_unlock(map->hwlock); +} + +static void regmap_unlock_hwlock_irq(void *__map) +{ + struct regmap *map = __map; + + hwspin_unlock_irq(map->hwlock); +} + +static void regmap_unlock_hwlock_irqrestore(void *__map) +{ + struct regmap *map = __map; + + hwspin_unlock_irqrestore(map->hwlock, &map->spinlock_flags); +} + static void regmap_lock_mutex(void *__map) { struct regmap *map = __map; @@ -627,6 +671,29 @@ struct regmap *__regmap_init(struct device *dev, map->lock = config->lock; map->unlock = config->unlock; map->lock_arg = config->lock_arg; + } else if (config->hwlock_id) { + map->hwlock = hwspin_lock_request_specific(config->hwlock_id); + if (!map->hwlock) { + ret = -ENXIO; + goto err_map; + } + + switch (config->hwlock_mode) { + case HWLOCK_IRQSTATE: + map->lock = regmap_lock_hwlock_irqsave; + map->unlock = regmap_unlock_hwlock_irqrestore; + break; + case HWLOCK_IRQ: + map->lock = regmap_lock_hwlock_irq; + map->unlock = regmap_unlock_hwlock_irq; + break; + default: + map->lock = regmap_lock_hwlock; + map->unlock = regmap_unlock_hwlock; + break; + } + + map->lock_arg = map; } else { if ((bus && bus->fast_io) || config->fast_io) { @@ -729,7 +796,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_write = regmap_format_2_6_write; break; default: - goto err_map; + goto err_hwlock; } break; @@ -739,7 +806,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_write = regmap_format_4_12_write; break; default: - goto err_map; + goto err_hwlock; } break; @@ -749,7 +816,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_write = regmap_format_7_9_write; break; default: - goto err_map; + goto err_hwlock; } break; @@ -759,7 +826,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_write = regmap_format_10_14_write; break; default: - goto err_map; + goto err_hwlock; } break; @@ -779,13 +846,13 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_reg = regmap_format_16_native; break; default: - goto err_map; + 
goto err_hwlock; } break; case 24: if (reg_endian != REGMAP_ENDIAN_BIG) - goto err_map; + goto err_hwlock; map->format.format_reg = regmap_format_24; break; @@ -801,7 +868,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_reg = regmap_format_32_native; break; default: - goto err_map; + goto err_hwlock; } break; @@ -818,13 +885,13 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_reg = regmap_format_64_native; break; default: - goto err_map; + goto err_hwlock; } break; #endif default: - goto err_map; + goto err_hwlock; } if (val_endian == REGMAP_ENDIAN_NATIVE) @@ -853,12 +920,12 @@ struct regmap *__regmap_init(struct device *dev, map->format.parse_val = regmap_parse_16_native; break; default: - goto err_map; + goto err_hwlock; } break; case 24: if (val_endian != REGMAP_ENDIAN_BIG) - goto err_map; + goto err_hwlock; map->format.format_val = regmap_format_24; map->format.parse_val = regmap_parse_24; break; @@ -879,7 +946,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.parse_val = regmap_parse_32_native; break; default: - goto err_map; + goto err_hwlock; } break; #ifdef CONFIG_64BIT @@ -900,7 +967,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.parse_val = regmap_parse_64_native; break; default: - goto err_map; + goto err_hwlock; } break; #endif @@ -909,18 +976,18 @@ struct regmap *__regmap_init(struct device *dev, if (map->format.format_write) { if ((reg_endian != REGMAP_ENDIAN_BIG) || (val_endian != REGMAP_ENDIAN_BIG)) - goto err_map; + goto err_hwlock; map->use_single_write = true; } if (!map->format.format_write && !(map->format.format_reg && map->format.format_val)) - goto err_map; + goto err_hwlock; map->work_buf = kzalloc(map->format.buf_size, GFP_KERNEL); if (map->work_buf == NULL) { ret = -ENOMEM; - goto err_map; + goto err_hwlock; } if (map->format.format_write) { @@ -1041,6 +1108,8 @@ err_regcache: err_range: regmap_range_exit(map); kfree(map->work_buf); +err_hwlock: + hwspin_lock_free(map->hwlock); err_map: kfree(map); err: diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 978abfbac617..edc32aac84d7 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -273,6 +273,9 @@ typedef void (*regmap_unlock)(void *); * * @ranges: Array of configuration entries for virtual address ranges. * @num_ranges: Number of range configuration entries. + * @hwlock_id: Specify the hardware spinlock id. + * @hwlock_mode: The hardware spinlock mode, should be HWLOCK_IRQSTATE, + * HWLOCK_IRQ or 0. */ struct regmap_config { const char *name; @@ -317,6 +320,9 @@ struct regmap_config { const struct regmap_range_cfg *ranges; unsigned int num_ranges; + + unsigned int hwlock_id; + unsigned int hwlock_mode; }; /** From 2305a18ba9ffa105ad1686f40a7861718c338e3c Mon Sep 17 00:00:00 2001 From: hotran Date: Tue, 31 Oct 2017 13:58:07 -0700 Subject: [PATCH 0773/1085] hwmon: (xgene) Minor clean up of ifdef and acpi_match_table reference This patch removes the un-necessary ifdef CONFIG_ACPI and directly uses the acpi_match_table from the driver pdev. 
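[ For context, this is safe without the #ifdef because acpi_match_device() is stubbed out in <linux/acpi.h> when CONFIG_ACPI is not set, roughly:

	static inline const struct acpi_device_id *acpi_match_device(
			const struct acpi_device_id *ids, const struct device *dev)
	{
		return NULL;
	}

  so the now-unguarded call still compiles everywhere, and this probe path simply fails with -EINVAL where it could never succeed anyway. ]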
Signed-off-by: Hoan Tran
[groeck: Dropped unnecessary initialization]
Signed-off-by: Guenter Roeck
---
 drivers/hwmon/xgene-hwmon.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c
index 57834361fb3c..a3cd91f23267 100644
--- a/drivers/hwmon/xgene-hwmon.c
+++ b/drivers/hwmon/xgene-hwmon.c
@@ -665,16 +665,15 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
 }
 } else {
 struct acpi_pcct_hw_reduced *cppc_ss;
- int version = XGENE_HWMON_V1;
-#ifdef CONFIG_ACPI
 const struct acpi_device_id *acpi_id;
+ int version;

- acpi_id = acpi_match_device(xgene_hwmon_acpi_match, &pdev->dev);
+ acpi_id = acpi_match_device(pdev->dev.driver->acpi_match_table,
+ &pdev->dev);
 if (!acpi_id)
 return -EINVAL;

 version = (int)acpi_id->driver_data;
-#endif

 if (device_property_read_u32(&pdev->dev, "pcc-channel",
 &ctx->mbox_idx)) {
From b7b75a60b291cc699ca9bb2a8517a1b3b08bbeb1 Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Tue, 31 Oct 2017 11:06:53 +0900
Subject: [PATCH 0774/1085] perf srcline: Fix memory leak in addr2inlines()

When libbfd is not used, addr2inlines() executes `addr2line -i` and processes its output line by line. But it resets filename to NULL in the loop, so getline() allocates additional memory every time instead of reusing the buffer via realloc().

Signed-off-by: Namhyung Kim
Acked-by: Jiri Olsa
Cc: Jin Yao
Cc: Milian Wolff
Cc: Peter Zijlstra
Cc: kernel-team@lge.com
Link: http://lkml.kernel.org/r/20171031020654.31163-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/srcline.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index c143c3bc1ef8..51dc49c65476 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -456,20 +456,17 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr,
 while (getline(&filename, &len, fp) != -1) {
 char *srcline;

- if (filename_split(filename, &line_nr) != 1) {
- free(filename);
+ if (filename_split(filename, &line_nr) != 1)
 goto out;
- }

 srcline = srcline_from_fileline(filename, line_nr);
 if (inline_list__append(sym, srcline, node) != 0)
 goto out;
-
- filename = NULL;
 }

 out:
 pclose(fp);
+ free(filename);
 return node;
 }
From 7285cf3325b4a1dfb336d31eebc27dfbc30fb9aa Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Tue, 31 Oct 2017 11:06:54 +0900
Subject: [PATCH 0775/1085] perf srcline: Show correct function name for srcline of callchains

When libbfd is not used, it doesn't show the proper function name and reuses the original symbol of the sample. That's because it passes the original sym to inline_list__append(). As `addr2line -f` returns function names as well, use that to create an inline_sym and pass it to inline_list__append().

For example, the following data shows that the inlined entries of main all have the same name (main).

Before:
 $ perf report -g srcline -q | head
 45.22% inlining libm-2.26.so [.]
__hypot_finite | ---__hypot_finite | |--44.15%--hypot | std::__complex_abs complex:589 (inlined) | std::abs complex:597 (inlined) | std::_Norm_helper::_S_do_it complex:654 (inlined) | std::norm complex:664 (inlined) | main inlining.cpp:14 Signed-off-by: Namhyung Kim Reviewed-by: Jiri Olsa Reviewed-by: Milian Wolff Cc: Jin Yao Cc: Peter Zijlstra Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20171031020654.31163-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/srcline.c | 95 ++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 51dc49c65476..ad1b46f1f2cf 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -10,7 +10,7 @@ #include "util/debug.h" #include "util/callchain.h" #include "srcline.h" - +#include "string2.h" #include "symbol.h" bool srcline_full_filename; @@ -77,6 +77,41 @@ static char *srcline_from_fileline(const char *file, unsigned int line) return srcline; } +static struct symbol *new_inline_sym(struct dso *dso, + struct symbol *base_sym, + const char *funcname) +{ + struct symbol *inline_sym; + char *demangled = NULL; + + if (dso) { + demangled = dso__demangle_sym(dso, 0, funcname); + if (demangled) + funcname = demangled; + } + + if (base_sym && strcmp(funcname, base_sym->name) == 0) { + /* reuse the real, existing symbol */ + inline_sym = base_sym; + /* ensure that we don't alias an inlined symbol, which could + * lead to double frees in inline_node__delete + */ + assert(!base_sym->inlined); + } else { + /* create a fake symbol for the inline frame */ + inline_sym = symbol__new(base_sym ? base_sym->start : 0, + base_sym ? base_sym->end : 0, + base_sym ? base_sym->binding : 0, + funcname); + if (inline_sym) + inline_sym->inlined = 1; + } + + free(demangled); + + return inline_sym; +} + #ifdef HAVE_LIBBFD_SUPPORT /* @@ -219,41 +254,6 @@ static void addr2line_cleanup(struct a2l_data *a2l) #define MAX_INLINE_NEST 1024 -static struct symbol *new_inline_sym(struct dso *dso, - struct symbol *base_sym, - const char *funcname) -{ - struct symbol *inline_sym; - char *demangled = NULL; - - if (dso) { - demangled = dso__demangle_sym(dso, 0, funcname); - if (demangled) - funcname = demangled; - } - - if (base_sym && strcmp(funcname, base_sym->name) == 0) { - /* reuse the real, existing symbol */ - inline_sym = base_sym; - /* ensure that we don't alias an inlined symbol, which could - * lead to double frees in inline_node__delete - */ - assert(!base_sym->inlined); - } else { - /* create a fake symbol for the inline frame */ - inline_sym = symbol__new(base_sym ? base_sym->start : 0, - base_sym ? base_sym->end : 0, - base_sym ? 
base_sym->binding : 0, - funcname); - if (inline_sym) - inline_sym->inlined = 1; - } - - free(demangled); - - return inline_sym; -} - static int inline_list__append_dso_a2l(struct dso *dso, struct inline_node *node, struct symbol *sym) @@ -432,10 +432,11 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, char cmd[PATH_MAX]; struct inline_node *node; char *filename = NULL; - size_t len; + char *funcname = NULL; + size_t filelen, funclen; unsigned int line_nr = 0; - scnprintf(cmd, sizeof(cmd), "addr2line -e %s -i %016"PRIx64, + scnprintf(cmd, sizeof(cmd), "addr2line -e %s -i -f %016"PRIx64, dso_name, addr); fp = popen(cmd, "r"); @@ -453,20 +454,34 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, INIT_LIST_HEAD(&node->val); node->addr = addr; - while (getline(&filename, &len, fp) != -1) { + /* addr2line -f generates two lines for each inlined functions */ + while (getline(&funcname, &funclen, fp) != -1) { char *srcline; + struct symbol *inline_sym; + + rtrim(funcname); + + if (getline(&filename, &filelen, fp) == -1) + goto out; if (filename_split(filename, &line_nr) != 1) goto out; srcline = srcline_from_fileline(filename, line_nr); - if (inline_list__append(sym, srcline, node) != 0) + inline_sym = new_inline_sym(dso, sym, funcname); + + if (inline_list__append(inline_sym, srcline, node) != 0) { + free(srcline); + if (inline_sym && inline_sym->inlined) + symbol__delete(inline_sym); goto out; + } } out: pclose(fp); free(filename); + free(funcname); return node; } From 39c82caff8610d57ffe32157cb3130dfabe12fbe Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Thu, 26 Oct 2017 11:37:22 -0700 Subject: [PATCH 0776/1085] clockevents: Update clockevents device next_event on stop clockevent_device::next_event holds the next timer event of a clock event device. The value is updated in clockevents_program_event(), i.e. when the hardware timer is armed for the next expiry. When there are no software timers armed on a CPU, the corresponding per CPU clockevent device is brought into ONESHOT_STOPPED state, but clockevent_device::next_event is not updated, because clockevents_program_event() is not called. So the content of clockevent_device::next_event is stale, which is not an issue when real hardware is used. But the hrtimer broadcast device relies on that information and the stale value causes spurious wakeups. Update clockevent_device::next_event to KTIME_MAX when it has been brought into ONESHOT_STOPPED state to avoid spurious wakeups. This reflects the proper expiry time of the stopped timer: infinity. [ tglx: Massaged changelog ] Signed-off-by: Prasad Sodagudi Signed-off-by: Thomas Gleixner Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/1509043042-32486-1-git-send-email-psodagud@codeaurora.org --- kernel/time/tick-oneshot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 6b009c207671..c1f518e7aa80 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -33,6 +33,7 @@ int tick_program_event(ktime_t expires, int force) * We don't need the clock event device any more, stop it. 
*/ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + dev->next_event = KTIME_MAX; return 0; } From 00ed87da35e88a7a4d7f41673beab52ef828f12b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 1 Nov 2017 07:32:50 -0700 Subject: [PATCH 0777/1085] timer: Add parenthesis around timer_setup() macro arguments In the case where expressions are passed as macro arguments, the LOCKDEP version of the timer macros need enclosing parenthesis. Reported-by: Stephen Rothwell Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171101143250.GA65266@beast --- include/linux/timer.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 09950482309b..a1af92bac0d5 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -173,11 +173,12 @@ static inline void timer_setup_on_stack(struct timer_list *timer, * do want to keep the inline for argument type checking, though. */ # define timer_setup(timer, callback, flags) \ - __setup_timer(timer, (TIMER_FUNC_TYPE)callback, \ - (TIMER_DATA_TYPE)timer, flags) + __setup_timer((timer), (TIMER_FUNC_TYPE)(callback), \ + (TIMER_DATA_TYPE)(timer), (flags)) # define timer_setup_on_stack(timer, callback, flags) \ - __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback,\ - (TIMER_DATA_TYPE)timer, flags) + __setup_timer_on_stack((timer), \ + (TIMER_FUNC_TYPE)(callback), \ + (TIMER_DATA_TYPE)(timer), (flags)) #endif #define from_timer(var, callback_timer, timer_fieldname) \ From 8c4602f3c147caaecf33adf3aaf0c5bc84c6b409 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Aug 2017 16:26:35 -0700 Subject: [PATCH 0778/1085] scsi: aic7xxx: Convert timers to use timer_setup() stat_timer only ever assigns the same function and data, so consolidate to using timer_setup(), adjust callback, drop everything else used to pass things around, and remove needless typedefs. reset_timer is unused; remove it. Cc: Hannes Reinecke Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen --- drivers/scsi/aic7xxx/aic79xx.h | 5 +---- drivers/scsi/aic7xxx/aic79xx_core.c | 29 ++++++++--------------------- drivers/scsi/aic7xxx/aic79xx_osm.h | 7 ------- 3 files changed, 9 insertions(+), 32 deletions(-) diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h index d47b527b25dd..31f2bb9d7146 100644 --- a/drivers/scsi/aic7xxx/aic79xx.h +++ b/drivers/scsi/aic7xxx/aic79xx.h @@ -1046,8 +1046,6 @@ typedef enum { typedef uint8_t ahd_mode_state; -typedef void ahd_callback_t (void *); - struct ahd_completion { uint16_t tag; @@ -1122,8 +1120,7 @@ struct ahd_softc { /* * Timer handles for timer driven callbacks. */ - ahd_timer_t reset_timer; - ahd_timer_t stat_timer; + struct timer_list stat_timer; /* * Statistics. 
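 * (Editor's note: stat_timer is now a plain struct timer_list; it is
 * armed with timer_setup() in ahd_alloc() and stopped via
 * del_timer_sync() in ahd_shutdown(), as the next hunks show.)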
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c index 95d8f25cbcca..b560f396ee99 100644 --- a/drivers/scsi/aic7xxx/aic79xx_core.c +++ b/drivers/scsi/aic7xxx/aic79xx_core.c @@ -207,7 +207,7 @@ static void ahd_add_scb_to_free_list(struct ahd_softc *ahd, static u_int ahd_rem_wscb(struct ahd_softc *ahd, u_int scbid, u_int prev, u_int next, u_int tid); static void ahd_reset_current_bus(struct ahd_softc *ahd); -static ahd_callback_t ahd_stat_timer; +static void ahd_stat_timer(struct timer_list *t); #ifdef AHD_DUMP_SEQ static void ahd_dumpseq(struct ahd_softc *ahd); #endif @@ -6104,8 +6104,7 @@ ahd_alloc(void *platform_arg, char *name) ahd->bugs = AHD_BUGNONE; ahd->flags = AHD_SPCHK_ENB_A|AHD_RESET_BUS_A|AHD_TERM_ENB_A | AHD_EXTENDED_TRANS_A|AHD_STPWLEVEL_A; - ahd_timer_init(&ahd->reset_timer); - ahd_timer_init(&ahd->stat_timer); + timer_setup(&ahd->stat_timer, ahd_stat_timer, 0); ahd->int_coalescing_timer = AHD_INT_COALESCING_TIMER_DEFAULT; ahd->int_coalescing_maxcmds = AHD_INT_COALESCING_MAXCMDS_DEFAULT; ahd->int_coalescing_mincmds = AHD_INT_COALESCING_MINCMDS_DEFAULT; @@ -6235,8 +6234,7 @@ ahd_shutdown(void *arg) /* * Stop periodic timer callbacks. */ - ahd_timer_stop(&ahd->reset_timer); - ahd_timer_stop(&ahd->stat_timer); + del_timer_sync(&ahd->stat_timer); /* This will reset most registers to 0, but not all */ ahd_reset(ahd, /*reinit*/FALSE); @@ -7039,20 +7037,11 @@ static const char *termstat_strings[] = { }; /***************************** Timer Facilities *******************************/ -#define ahd_timer_init init_timer -#define ahd_timer_stop del_timer_sync -typedef void ahd_linux_callback_t (u_long); - static void -ahd_timer_reset(ahd_timer_t *timer, int usec, ahd_callback_t *func, void *arg) +ahd_timer_reset(struct timer_list *timer, int usec) { - struct ahd_softc *ahd; - - ahd = (struct ahd_softc *)arg; del_timer(timer); - timer->data = (u_long)arg; timer->expires = jiffies + (usec * HZ)/1000000; - timer->function = (ahd_linux_callback_t*)func; add_timer(timer); } @@ -7279,8 +7268,7 @@ ahd_init(struct ahd_softc *ahd) } init_done: ahd_restart(ahd); - ahd_timer_reset(&ahd->stat_timer, AHD_STAT_UPDATE_US, - ahd_stat_timer, ahd); + ahd_timer_reset(&ahd->stat_timer, AHD_STAT_UPDATE_US); return (0); } @@ -8878,9 +8866,9 @@ ahd_reset_channel(struct ahd_softc *ahd, char channel, int initiate_reset) /**************************** Statistics Processing ***************************/ static void -ahd_stat_timer(void *arg) +ahd_stat_timer(struct timer_list *t) { - struct ahd_softc *ahd = arg; + struct ahd_softc *ahd = from_timer(ahd, t, stat_timer); u_long s; int enint_coal; @@ -8907,8 +8895,7 @@ ahd_stat_timer(void *arg) ahd->cmdcmplt_bucket = (ahd->cmdcmplt_bucket+1) & (AHD_STAT_BUCKETS-1); ahd->cmdcmplt_total -= ahd->cmdcmplt_counts[ahd->cmdcmplt_bucket]; ahd->cmdcmplt_counts[ahd->cmdcmplt_bucket] = 0; - ahd_timer_reset(&ahd->stat_timer, AHD_STAT_UPDATE_US, - ahd_stat_timer, ahd); + ahd_timer_reset(&ahd->stat_timer, AHD_STAT_UPDATE_US); ahd_unlock(ahd, &s); } diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.h b/drivers/scsi/aic7xxx/aic79xx_osm.h index 728193a42e6e..8a8b7ae7aed3 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.h +++ b/drivers/scsi/aic7xxx/aic79xx_osm.h @@ -203,9 +203,6 @@ int ahd_dmamap_unload(struct ahd_softc *, bus_dma_tag_t, bus_dmamap_t); */ #define ahd_dmamap_sync(ahd, dma_tag, dmamap, offset, len, op) -/************************** Timer DataStructures ******************************/ -typedef struct timer_list ahd_timer_t; - 
/********************************** Includes **********************************/ #ifdef CONFIG_AIC79XX_REG_PRETTY_PRINT #define AIC_DEBUG_REGISTERS 1 @@ -214,10 +211,6 @@ typedef struct timer_list ahd_timer_t; #endif #include "aic79xx.h" -/***************************** Timer Facilities *******************************/ -#define ahd_timer_init init_timer -#define ahd_timer_stop del_timer_sync - /***************************** SMP support ************************************/ #include From fa60a31b0cf07794d643924328cd7e3a78e99aca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 15:15:33 -0700 Subject: [PATCH 0779/1085] scsi: csiostor: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "Martin K. Petersen" Cc: "James E.J. Bottomley" Cc: Varun Prakash Cc: Johannes Thumshirn Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen --- drivers/scsi/csiostor/csio_hw.c | 15 ++++++--------- drivers/scsi/csiostor/csio_mb.c | 9 +++------ drivers/scsi/csiostor/csio_mb.h | 3 ++- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/drivers/scsi/csiostor/csio_hw.c b/drivers/scsi/csiostor/csio_hw.c index 5be0086142ca..0bd1131b6cc9 100644 --- a/drivers/scsi/csiostor/csio_hw.c +++ b/drivers/scsi/csiostor/csio_hw.c @@ -3347,9 +3347,10 @@ csio_mberr_worker(void *data) * **/ static void -csio_hw_mb_timer(uintptr_t data) +csio_hw_mb_timer(struct timer_list *t) { - struct csio_hw *hw = (struct csio_hw *)data; + struct csio_mbm *mbm = from_timer(mbm, t, timer); + struct csio_hw *hw = mbm->hw; struct csio_mb *mbp = NULL; spin_lock_irq(&hw->lock); @@ -3715,9 +3716,9 @@ csio_mgmt_req_lookup(struct csio_mgmtm *mgmtm, struct csio_ioreq *io_req) * Return - none. 
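 * With timer_setup() the handler recovers its csio_mgmtm via
 * from_timer() on the embedded mgmt_timer instead of an opaque
 * uintptr_t cast.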
*/ static void -csio_mgmt_tmo_handler(uintptr_t data) +csio_mgmt_tmo_handler(struct timer_list *t) { - struct csio_mgmtm *mgmtm = (struct csio_mgmtm *) data; + struct csio_mgmtm *mgmtm = from_timer(mgmtm, t, mgmt_timer); struct list_head *tmp; struct csio_ioreq *io_req; @@ -3797,11 +3798,7 @@ csio_mgmtm_cleanup(struct csio_mgmtm *mgmtm) static int csio_mgmtm_init(struct csio_mgmtm *mgmtm, struct csio_hw *hw) { - struct timer_list *timer = &mgmtm->mgmt_timer; - - init_timer(timer); - timer->function = csio_mgmt_tmo_handler; - timer->data = (unsigned long)mgmtm; + timer_setup(&mgmtm->mgmt_timer, csio_mgmt_tmo_handler, 0); INIT_LIST_HEAD(&mgmtm->active_q); INIT_LIST_HEAD(&mgmtm->cbfn_q); diff --git a/drivers/scsi/csiostor/csio_mb.c b/drivers/scsi/csiostor/csio_mb.c index 9451787ca7f2..abcedfbcecda 100644 --- a/drivers/scsi/csiostor/csio_mb.c +++ b/drivers/scsi/csiostor/csio_mb.c @@ -1644,13 +1644,10 @@ csio_mb_cancel_all(struct csio_hw *hw, struct list_head *cbfn_q) */ int csio_mbm_init(struct csio_mbm *mbm, struct csio_hw *hw, - void (*timer_fn)(uintptr_t)) + void (*timer_fn)(struct timer_list *)) { - struct timer_list *timer = &mbm->timer; - - init_timer(timer); - timer->function = timer_fn; - timer->data = (unsigned long)hw; + mbm->hw = hw; + timer_setup(&mbm->timer, timer_fn, 0); INIT_LIST_HEAD(&mbm->req_q); INIT_LIST_HEAD(&mbm->cbfn_q); diff --git a/drivers/scsi/csiostor/csio_mb.h b/drivers/scsi/csiostor/csio_mb.h index 1bc82d0bc260..a6823df73015 100644 --- a/drivers/scsi/csiostor/csio_mb.h +++ b/drivers/scsi/csiostor/csio_mb.h @@ -137,6 +137,7 @@ struct csio_mbm { uint32_t a_mbox; /* Async mbox num */ uint32_t intr_idx; /* Interrupt index */ struct timer_list timer; /* Mbox timer */ + struct csio_hw *hw; /* Hardware pointer */ struct list_head req_q; /* Mbox request queue */ struct list_head cbfn_q; /* Mbox completion q */ struct csio_mb *mcurrent; /* Current mailbox */ @@ -252,7 +253,7 @@ void csio_mb_process_portparams_rsp(struct csio_hw *hw, struct csio_mb *mbp, /* MB module functions */ int csio_mbm_init(struct csio_mbm *, struct csio_hw *, - void (*)(uintptr_t)); + void (*)(struct timer_list *)); void csio_mbm_exit(struct csio_mbm *); void csio_mb_intr_enable(struct csio_hw *); void csio_mb_intr_disable(struct csio_hw *); From cd07f958e88250e39c3e8b0a93ac17165d483fd3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 28 Aug 2017 10:13:50 -0700 Subject: [PATCH 0780/1085] scsi: cxgbi: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Karen Xie Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. 
Petersen --- drivers/scsi/cxgbi/cxgb3i/cxgb3i.c | 8 ++++---- drivers/scsi/cxgbi/cxgb4i/cxgb4i.c | 8 ++++---- drivers/scsi/cxgbi/libcxgbi.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c index 7b09e7ddf35e..babd79361a46 100644 --- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c @@ -545,10 +545,10 @@ static int act_open_rpl_status_to_errno(int status) } } -static void act_open_retry_timer(unsigned long data) +static void act_open_retry_timer(struct timer_list *t) { + struct cxgbi_sock *csk = from_timer(csk, t, retry_timer); struct sk_buff *skb; - struct cxgbi_sock *csk = (struct cxgbi_sock *)data; log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", @@ -586,8 +586,8 @@ static int do_act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) cxgbi_sock_get(csk); spin_lock_bh(&csk->lock); if (rpl->status == CPL_ERR_CONN_EXIST && - csk->retry_timer.function != act_open_retry_timer) { - csk->retry_timer.function = act_open_retry_timer; + csk->retry_timer.function != (TIMER_FUNC_TYPE)act_open_retry_timer) { + csk->retry_timer.function = (TIMER_FUNC_TYPE)act_open_retry_timer; mod_timer(&csk->retry_timer, jiffies + HZ / 2); } else cxgbi_sock_fail_act_open(csk, diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c index 1d02cf9fe06c..1bef2724eb78 100644 --- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c @@ -872,10 +872,10 @@ static int act_open_rpl_status_to_errno(int status) } } -static void csk_act_open_retry_timer(unsigned long data) +static void csk_act_open_retry_timer(struct timer_list *t) { struct sk_buff *skb = NULL; - struct cxgbi_sock *csk = (struct cxgbi_sock *)data; + struct cxgbi_sock *csk = from_timer(csk, t, retry_timer); struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); void (*send_act_open_func)(struct cxgbi_sock *, struct sk_buff *, struct l2t_entry *); @@ -963,8 +963,8 @@ static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb) spin_lock_bh(&csk->lock); if (status == CPL_ERR_CONN_EXIST && - csk->retry_timer.function != csk_act_open_retry_timer) { - csk->retry_timer.function = csk_act_open_retry_timer; + csk->retry_timer.function != (TIMER_FUNC_TYPE)csk_act_open_retry_timer) { + csk->retry_timer.function = (TIMER_FUNC_TYPE)csk_act_open_retry_timer; mod_timer(&csk->retry_timer, jiffies + HZ / 2); } else cxgbi_sock_fail_act_open(csk, diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c index 512c8f1ea5b0..a61a152136a3 100644 --- a/drivers/scsi/cxgbi/libcxgbi.c +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -572,7 +572,7 @@ static struct cxgbi_sock *cxgbi_sock_create(struct cxgbi_device *cdev) kref_init(&csk->refcnt); skb_queue_head_init(&csk->receive_queue); skb_queue_head_init(&csk->write_queue); - setup_timer(&csk->retry_timer, NULL, (unsigned long)csk); + timer_setup(&csk->retry_timer, NULL, 0); rwlock_init(&csk->callback_lock); csk->cdev = cdev; csk->flags = 0; From 9a5d04fcbabd20aa06e5624b824c86f7d260cc80 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Oct 2017 20:54:48 -0700 Subject: [PATCH 0781/1085] scsi: ibmvscsi: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "Martin K. 
Petersen" Cc: Tyrel Datwyler Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "James E.J. Bottomley" Cc: linux-scsi@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen Acked-by: Tyrel Datwyler --- drivers/scsi/ibmvscsi/ibmvfc.c | 14 ++++++-------- drivers/scsi/ibmvscsi/ibmvscsi.c | 7 +++---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index b491af31a5f8..0d2f7eb3acb6 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -1393,8 +1393,9 @@ static int ibmvfc_map_sg_data(struct scsi_cmnd *scmd, * * Called when an internally generated command times out **/ -static void ibmvfc_timeout(struct ibmvfc_event *evt) +static void ibmvfc_timeout(struct timer_list *t) { + struct ibmvfc_event *evt = from_timer(evt, t, timer); struct ibmvfc_host *vhost = evt->vhost; dev_err(vhost->dev, "Command timed out (%p). Resetting connection\n", evt); ibmvfc_reset_host(vhost); @@ -1424,12 +1425,10 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt, BUG(); list_add_tail(&evt->queue, &vhost->sent); - init_timer(&evt->timer); + timer_setup(&evt->timer, ibmvfc_timeout, 0); if (timeout) { - evt->timer.data = (unsigned long) evt; evt->timer.expires = jiffies + (timeout * HZ); - evt->timer.function = (void (*)(unsigned long))ibmvfc_timeout; add_timer(&evt->timer); } @@ -3692,8 +3691,9 @@ static void ibmvfc_tgt_adisc_cancel_done(struct ibmvfc_event *evt) * out, reset the CRQ. When the ADISC comes back as cancelled, * log back into the target. **/ -static void ibmvfc_adisc_timeout(struct ibmvfc_target *tgt) +static void ibmvfc_adisc_timeout(struct timer_list *t) { + struct ibmvfc_target *tgt = from_timer(tgt, t, timer); struct ibmvfc_host *vhost = tgt->vhost; struct ibmvfc_event *evt; struct ibmvfc_tmf *tmf; @@ -3778,9 +3778,7 @@ static void ibmvfc_tgt_adisc(struct ibmvfc_target *tgt) if (timer_pending(&tgt->timer)) mod_timer(&tgt->timer, jiffies + (IBMVFC_ADISC_TIMEOUT * HZ)); else { - tgt->timer.data = (unsigned long) tgt; tgt->timer.expires = jiffies + (IBMVFC_ADISC_TIMEOUT * HZ); - tgt->timer.function = (void (*)(unsigned long))ibmvfc_adisc_timeout; add_timer(&tgt->timer); } @@ -3912,7 +3910,7 @@ static int ibmvfc_alloc_target(struct ibmvfc_host *vhost, u64 scsi_id) tgt->vhost = vhost; tgt->need_login = 1; tgt->cancel_key = vhost->task_set++; - init_timer(&tgt->timer); + timer_setup(&tgt->timer, ibmvfc_adisc_timeout, 0); kref_init(&tgt->kref); ibmvfc_init_tgt(tgt, ibmvfc_tgt_implicit_logout); spin_lock_irqsave(vhost->host->host_lock, flags); diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index 7d156b161482..17df76f0be3c 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -837,8 +837,9 @@ static void ibmvscsi_reset_host(struct ibmvscsi_host_data *hostdata) * * Called when an internally generated command times out */ -static void ibmvscsi_timeout(struct srp_event_struct *evt_struct) +static void ibmvscsi_timeout(struct timer_list *t) { + struct srp_event_struct *evt_struct = from_timer(evt_struct, t, timer); struct ibmvscsi_host_data *hostdata = evt_struct->hostdata; dev_err(hostdata->dev, "Command timed out (%x). 
Resetting connection\n", @@ -927,11 +928,9 @@ static int ibmvscsi_send_srp_event(struct srp_event_struct *evt_struct, */ list_add_tail(&evt_struct->list, &hostdata->sent); - init_timer(&evt_struct->timer); + timer_setup(&evt_struct->timer, ibmvscsi_timeout, 0); if (timeout) { - evt_struct->timer.data = (unsigned long) evt_struct; evt_struct->timer.expires = jiffies + (timeout * HZ); - evt_struct->timer.function = (void (*)(unsigned long))ibmvscsi_timeout; add_timer(&evt_struct->timer); } From 738c6ec546aaba5acde6a4e1d2b7f56aec249d62 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Aug 2017 16:53:24 -0700 Subject: [PATCH 0782/1085] scsi: ipr: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Brian King Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen --- drivers/scsi/ipr.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index f838bd73befa..d53429371127 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -694,7 +694,7 @@ static void ipr_init_ipr_cmnd(struct ipr_cmnd *ipr_cmd, ipr_cmd->sibling = NULL; ipr_cmd->eh_comp = NULL; ipr_cmd->fast_done = fast_done; - init_timer(&ipr_cmd->timer); + timer_setup(&ipr_cmd->timer, NULL, 0); } /** @@ -990,15 +990,14 @@ static void ipr_send_command(struct ipr_cmnd *ipr_cmd) **/ static void ipr_do_req(struct ipr_cmnd *ipr_cmd, void (*done) (struct ipr_cmnd *), - void (*timeout_func) (struct ipr_cmnd *), u32 timeout) + void (*timeout_func) (struct timer_list *), u32 timeout) { list_add_tail(&ipr_cmd->queue, &ipr_cmd->hrrq->hrrq_pending_q); ipr_cmd->done = done; - ipr_cmd->timer.data = (unsigned long) ipr_cmd; ipr_cmd->timer.expires = jiffies + timeout; - ipr_cmd->timer.function = (void (*)(unsigned long))timeout_func; + ipr_cmd->timer.function = (TIMER_FUNC_TYPE)timeout_func; add_timer(&ipr_cmd->timer); @@ -1080,7 +1079,7 @@ static void ipr_init_ioadl(struct ipr_cmnd *ipr_cmd, dma_addr_t dma_addr, * none **/ static void ipr_send_blocking_cmd(struct ipr_cmnd *ipr_cmd, - void (*timeout_func) (struct ipr_cmnd *ipr_cmd), + void (*timeout_func) (struct timer_list *), u32 timeout) { struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg; @@ -2664,8 +2663,9 @@ static void ipr_process_error(struct ipr_cmnd *ipr_cmd) * Return value: * none **/ -static void ipr_timeout(struct ipr_cmnd *ipr_cmd) +static void ipr_timeout(struct timer_list *t) { + struct ipr_cmnd *ipr_cmd = from_timer(ipr_cmd, t, timer); unsigned long lock_flags = 0; struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg; @@ -2696,8 +2696,9 @@ static void ipr_timeout(struct ipr_cmnd *ipr_cmd) * Return value: * none **/ -static void ipr_oper_timeout(struct ipr_cmnd *ipr_cmd) +static void ipr_oper_timeout(struct timer_list *t) { + struct ipr_cmnd *ipr_cmd = from_timer(ipr_cmd, t, timer); unsigned long lock_flags = 0; struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg; @@ -5449,8 +5450,9 @@ static void ipr_bus_reset_done(struct ipr_cmnd *ipr_cmd) * Return value: * none **/ -static void ipr_abort_timeout(struct ipr_cmnd *ipr_cmd) +static void ipr_abort_timeout(struct timer_list *t) { + struct ipr_cmnd *ipr_cmd = from_timer(ipr_cmd, t, timer); struct ipr_cmnd *reset_cmd; struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg; struct ipr_cmd_pkt *cmd_pkt; @@ -8271,8 
+8273,9 @@ static int ipr_ioafp_identify_hrrq(struct ipr_cmnd *ipr_cmd) * Return value: * none **/ -static void ipr_reset_timer_done(struct ipr_cmnd *ipr_cmd) +static void ipr_reset_timer_done(struct timer_list *t) { + struct ipr_cmnd *ipr_cmd = from_timer(ipr_cmd, t, timer); struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg; unsigned long lock_flags = 0; @@ -8308,9 +8311,8 @@ static void ipr_reset_start_timer(struct ipr_cmnd *ipr_cmd, list_add_tail(&ipr_cmd->queue, &ipr_cmd->hrrq->hrrq_pending_q); ipr_cmd->done = ipr_reset_ioa_job; - ipr_cmd->timer.data = (unsigned long) ipr_cmd; ipr_cmd->timer.expires = jiffies + timeout; - ipr_cmd->timer.function = (void (*)(unsigned long))ipr_reset_timer_done; + ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_reset_timer_done; add_timer(&ipr_cmd->timer); } @@ -8394,9 +8396,8 @@ static int ipr_reset_next_stage(struct ipr_cmnd *ipr_cmd) } } - ipr_cmd->timer.data = (unsigned long) ipr_cmd; ipr_cmd->timer.expires = jiffies + stage_time * HZ; - ipr_cmd->timer.function = (void (*)(unsigned long))ipr_oper_timeout; + ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_oper_timeout; ipr_cmd->done = ipr_reset_ioa_job; add_timer(&ipr_cmd->timer); @@ -8466,9 +8467,8 @@ static int ipr_reset_enable_ioa(struct ipr_cmnd *ipr_cmd) return IPR_RC_JOB_CONTINUE; } - ipr_cmd->timer.data = (unsigned long) ipr_cmd; ipr_cmd->timer.expires = jiffies + (ioa_cfg->transop_timeout * HZ); - ipr_cmd->timer.function = (void (*)(unsigned long))ipr_oper_timeout; + ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_oper_timeout; ipr_cmd->done = ipr_reset_ioa_job; add_timer(&ipr_cmd->timer); list_add_tail(&ipr_cmd->queue, &ipr_cmd->hrrq->hrrq_pending_q); From f22eb4d31c21ac2be001fa7bcd079e2d7d02a8f1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 Sep 2017 20:24:26 -0700 Subject: [PATCH 0783/1085] scsi: lpfc: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: James Smart Cc: Dick Kennedy Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_crtn.h | 16 ++++++------- drivers/scsi/lpfc/lpfc_ct.c | 4 ++-- drivers/scsi/lpfc/lpfc_els.c | 12 +++++----- drivers/scsi/lpfc/lpfc_hbadisc.c | 7 +++--- drivers/scsi/lpfc/lpfc_init.c | 39 +++++++++++++------------------- drivers/scsi/lpfc/lpfc_scsi.c | 4 ++-- drivers/scsi/lpfc/lpfc_sli.c | 8 +++---- 7 files changed, 41 insertions(+), 49 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index 7e300734b345..4e858b38529a 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -113,7 +113,7 @@ void lpfc_disc_list_loopmap(struct lpfc_vport *); void lpfc_disc_start(struct lpfc_vport *); void lpfc_cleanup_discovery_resources(struct lpfc_vport *); void lpfc_cleanup(struct lpfc_vport *); -void lpfc_disc_timeout(unsigned long); +void lpfc_disc_timeout(struct timer_list *); int lpfc_unregister_fcf_prep(struct lpfc_hba *); struct lpfc_nodelist *__lpfc_findnode_rpi(struct lpfc_vport *, uint16_t); @@ -154,7 +154,7 @@ int lpfc_els_rsp_adisc_acc(struct lpfc_vport *, struct lpfc_iocbq *, int lpfc_els_rsp_prli_acc(struct lpfc_vport *, struct lpfc_iocbq *, struct lpfc_nodelist *); void lpfc_cancel_retry_delay_tmo(struct lpfc_vport *, struct lpfc_nodelist *); -void lpfc_els_retry_delay(unsigned long); +void lpfc_els_retry_delay(struct timer_list *); void lpfc_els_retry_delay_handler(struct lpfc_nodelist *); void lpfc_els_unsol_event(struct lpfc_hba *, struct lpfc_sli_ring *, struct lpfc_iocbq *); @@ -165,7 +165,7 @@ void lpfc_els_flush_all_cmd(struct lpfc_hba *); void lpfc_els_flush_cmd(struct lpfc_vport *); int lpfc_els_disc_adisc(struct lpfc_vport *); int lpfc_els_disc_plogi(struct lpfc_vport *); -void lpfc_els_timeout(unsigned long); +void lpfc_els_timeout(struct timer_list *); void lpfc_els_timeout_handler(struct lpfc_vport *); struct lpfc_iocbq *lpfc_prep_els_iocb(struct lpfc_vport *, uint8_t, uint16_t, uint8_t, struct lpfc_nodelist *, @@ -180,7 +180,7 @@ int lpfc_get_gidft_type(struct lpfc_vport *vport, struct lpfc_iocbq *iocbq); int lpfc_ns_cmd(struct lpfc_vport *, int, uint8_t, uint32_t); int lpfc_fdmi_cmd(struct lpfc_vport *, struct lpfc_nodelist *, int, uint32_t); void lpfc_fdmi_num_disc_check(struct lpfc_vport *); -void lpfc_delayed_disc_tmo(unsigned long); +void lpfc_delayed_disc_tmo(struct timer_list *); void lpfc_delayed_disc_timeout_handler(struct lpfc_vport *); int lpfc_config_port_prep(struct lpfc_hba *); @@ -279,9 +279,9 @@ void lpfc_mem_free(struct lpfc_hba *); void lpfc_mem_free_all(struct lpfc_hba *); void lpfc_stop_vport_timers(struct lpfc_vport *); -void lpfc_poll_timeout(unsigned long ptr); +void lpfc_poll_timeout(struct timer_list *t); void lpfc_poll_start_timer(struct lpfc_hba *); -void lpfc_poll_eratt(unsigned long); +void lpfc_poll_eratt(struct timer_list *); int lpfc_sli_handle_fast_ring_event(struct lpfc_hba *, struct lpfc_sli_ring *, uint32_t); @@ -351,7 +351,7 @@ int lpfc_sli_abort_taskmgmt(struct lpfc_vport *, struct lpfc_sli_ring *, uint16_t, uint64_t, lpfc_ctx_cmd); -void lpfc_mbox_timeout(unsigned long); +void lpfc_mbox_timeout(struct timer_list *t); void lpfc_mbox_timeout_handler(struct lpfc_hba *); struct lpfc_nodelist *lpfc_findnode_did(struct lpfc_vport *, uint32_t); @@ -445,7 +445,7 @@ extern unsigned int lpfc_fcp_look_ahead; /* Interface exported by fabric iocb scheduler */ void lpfc_fabric_abort_nport(struct lpfc_nodelist *); void lpfc_fabric_abort_hba(struct lpfc_hba *); -void lpfc_fabric_block_timeout(unsigned long); +void lpfc_fabric_block_timeout(struct timer_list 
*); void lpfc_unblock_fabric_iocbs(struct lpfc_hba *); void lpfc_rampdown_queue_depth(struct lpfc_hba *); void lpfc_ramp_down_queue_handler(struct lpfc_hba *); diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index 33417681f5d4..f77673ab4a84 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -2884,9 +2884,9 @@ fdmi_cmd_exit: * the worker thread. **/ void -lpfc_delayed_disc_tmo(unsigned long ptr) +lpfc_delayed_disc_tmo(struct timer_list *t) { - struct lpfc_vport *vport = (struct lpfc_vport *)ptr; + struct lpfc_vport *vport = from_timer(vport, t, delayed_disc_tmo); struct lpfc_hba *phba = vport->phba; uint32_t tmo_posted; unsigned long iflag; diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 468a66371de9..0dd6c21433fe 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -3131,9 +3131,9 @@ lpfc_cancel_retry_delay_tmo(struct lpfc_vport *vport, struct lpfc_nodelist *nlp) * to the event associated with the ndlp. **/ void -lpfc_els_retry_delay(unsigned long ptr) +lpfc_els_retry_delay(struct timer_list *t) { - struct lpfc_nodelist *ndlp = (struct lpfc_nodelist *) ptr; + struct lpfc_nodelist *ndlp = from_timer(ndlp, t, nlp_delayfunc); struct lpfc_vport *vport = ndlp->vport; struct lpfc_hba *phba = vport->phba; unsigned long flags; @@ -7385,9 +7385,9 @@ lpfc_els_rcv_fan(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb, * lpfc_els_timeout_handler() to work on the posted event WORKER_ELS_TMO. **/ void -lpfc_els_timeout(unsigned long ptr) +lpfc_els_timeout(struct timer_list *t) { - struct lpfc_vport *vport = (struct lpfc_vport *) ptr; + struct lpfc_vport *vport = from_timer(vport, t, els_tmofunc); struct lpfc_hba *phba = vport->phba; uint32_t tmo_posted; unsigned long iflag; @@ -9017,9 +9017,9 @@ lpfc_issue_els_npiv_logo(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) * posted event WORKER_FABRIC_BLOCK_TMO. 
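 * The lpfc_hba is now derived with from_timer(phba, t,
 * fabric_block_timer) rather than from an unsigned long argument.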
**/ void -lpfc_fabric_block_timeout(unsigned long ptr) +lpfc_fabric_block_timeout(struct timer_list *t) { - struct lpfc_hba *phba = (struct lpfc_hba *) ptr; + struct lpfc_hba *phba = from_timer(phba, t, fabric_block_timer); unsigned long iflags; uint32_t tmo_posted; diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 20808349a80e..8d491084eb5d 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -4370,8 +4370,7 @@ lpfc_initialize_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, { INIT_LIST_HEAD(&ndlp->els_retry_evt.evt_listp); INIT_LIST_HEAD(&ndlp->dev_loss_evt.evt_listp); - setup_timer(&ndlp->nlp_delayfunc, lpfc_els_retry_delay, - (unsigned long)ndlp); + timer_setup(&ndlp->nlp_delayfunc, lpfc_els_retry_delay, 0); ndlp->nlp_DID = did; ndlp->vport = vport; ndlp->phba = vport->phba; @@ -5508,9 +5507,9 @@ lpfc_cleanup_discovery_resources(struct lpfc_vport *vport) */ /*****************************************************************************/ void -lpfc_disc_timeout(unsigned long ptr) +lpfc_disc_timeout(struct timer_list *t) { - struct lpfc_vport *vport = (struct lpfc_vport *) ptr; + struct lpfc_vport *vport = from_timer(vport, t, fc_disctmo); struct lpfc_hba *phba = vport->phba; uint32_t tmo_posted; unsigned long flags = 0; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 100bc4c8798d..6a1e28ba9258 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -1138,13 +1138,13 @@ lpfc_hba_down_post(struct lpfc_hba *phba) * be cleared by the worker thread after it has taken the event bitmap out. **/ static void -lpfc_hb_timeout(unsigned long ptr) +lpfc_hb_timeout(struct timer_list *t) { struct lpfc_hba *phba; uint32_t tmo_posted; unsigned long iflag; - phba = (struct lpfc_hba *)ptr; + phba = from_timer(phba, t, hb_tmofunc); /* Check for heart beat timeout conditions */ spin_lock_irqsave(&phba->pport->work_port_lock, iflag); @@ -1172,12 +1172,12 @@ lpfc_hb_timeout(unsigned long ptr) * be cleared by the worker thread after it has taken the event bitmap out. **/ static void -lpfc_rrq_timeout(unsigned long ptr) +lpfc_rrq_timeout(struct timer_list *t) { struct lpfc_hba *phba; unsigned long iflag; - phba = (struct lpfc_hba *)ptr; + phba = from_timer(phba, t, rrq_tmr); spin_lock_irqsave(&phba->pport->work_port_lock, iflag); if (!(phba->pport->load_flag & FC_UNLOADING)) phba->hba_flag |= HBA_RRQ_ACTIVE; @@ -3937,14 +3937,11 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) INIT_LIST_HEAD(&vport->rcv_buffer_list); spin_lock_init(&vport->work_port_lock); - setup_timer(&vport->fc_disctmo, lpfc_disc_timeout, - (unsigned long)vport); + timer_setup(&vport->fc_disctmo, lpfc_disc_timeout, 0); - setup_timer(&vport->els_tmofunc, lpfc_els_timeout, - (unsigned long)vport); + timer_setup(&vport->els_tmofunc, lpfc_els_timeout, 0); - setup_timer(&vport->delayed_disc_tmo, lpfc_delayed_disc_tmo, - (unsigned long)vport); + timer_setup(&vport->delayed_disc_tmo, lpfc_delayed_disc_tmo, 0); error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev); if (error) @@ -4210,9 +4207,9 @@ lpfc_fcf_redisc_wait_start_timer(struct lpfc_hba *phba) * worker thread context. 
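 * from_timer() copes with the nested member here: the timer is
 * embedded at phba->fcf.redisc_wait, not directly at the top of
 * struct lpfc_hba.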
**/ static void -lpfc_sli4_fcf_redisc_wait_tmo(unsigned long ptr) +lpfc_sli4_fcf_redisc_wait_tmo(struct timer_list *t) { - struct lpfc_hba *phba = (struct lpfc_hba *)ptr; + struct lpfc_hba *phba = from_timer(phba, t, fcf.redisc_wait); /* Don't send FCF rediscovery event if timer cancelled */ spin_lock_irq(&phba->hbalock); @@ -5624,15 +5621,13 @@ lpfc_setup_driver_resource_phase1(struct lpfc_hba *phba) INIT_LIST_HEAD(&phba->luns); /* MBOX heartbeat timer */ - setup_timer(&psli->mbox_tmo, lpfc_mbox_timeout, (unsigned long)phba); + timer_setup(&psli->mbox_tmo, lpfc_mbox_timeout, 0); /* Fabric block timer */ - setup_timer(&phba->fabric_block_timer, lpfc_fabric_block_timeout, - (unsigned long)phba); + timer_setup(&phba->fabric_block_timer, lpfc_fabric_block_timeout, 0); /* EA polling mode timer */ - setup_timer(&phba->eratt_poll, lpfc_poll_eratt, - (unsigned long)phba); + timer_setup(&phba->eratt_poll, lpfc_poll_eratt, 0); /* Heartbeat timer */ - setup_timer(&phba->hb_tmofunc, lpfc_hb_timeout, (unsigned long)phba); + timer_setup(&phba->hb_tmofunc, lpfc_hb_timeout, 0); return 0; } @@ -5658,8 +5653,7 @@ lpfc_sli_driver_resource_setup(struct lpfc_hba *phba) */ /* FCP polling mode timer */ - setup_timer(&phba->fcp_poll_timer, lpfc_poll_timeout, - (unsigned long)phba); + timer_setup(&phba->fcp_poll_timer, lpfc_poll_timeout, 0); /* Host attention work mask setup */ phba->work_ha_mask = (HA_ERATT | HA_MBATT | HA_LATT); @@ -5829,11 +5823,10 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) * Initialize timers used by driver */ - setup_timer(&phba->rrq_tmr, lpfc_rrq_timeout, (unsigned long)phba); + timer_setup(&phba->rrq_tmr, lpfc_rrq_timeout, 0); /* FCF rediscover timer */ - setup_timer(&phba->fcf.redisc_wait, lpfc_sli4_fcf_redisc_wait_tmo, - (unsigned long)phba); + timer_setup(&phba->fcf.redisc_wait, lpfc_sli4_fcf_redisc_wait_tmo, 0); /* * Control structure for handling external multi-buffer mailbox diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 1a6f122bb25d..c0cdaef4db24 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -4501,9 +4501,9 @@ void lpfc_poll_start_timer(struct lpfc_hba * phba) * and FCP Ring interrupt is disable. **/ -void lpfc_poll_timeout(unsigned long ptr) +void lpfc_poll_timeout(struct timer_list *t) { - struct lpfc_hba *phba = (struct lpfc_hba *) ptr; + struct lpfc_hba *phba = from_timer(phba, t, fcp_poll_timer); if (phba->cfg_poll & ENABLE_FCP_RING_POLLING) { lpfc_sli_handle_fast_ring_event(phba, diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 8b119f87b51d..4edb81073409 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -3004,13 +3004,13 @@ lpfc_sli_rsp_pointers_error(struct lpfc_hba *phba, struct lpfc_sli_ring *pring) * and wake up worker thread to process it. Otherwise, it will set up the * Error Attention polling timer for the next poll. **/ -void lpfc_poll_eratt(unsigned long ptr) +void lpfc_poll_eratt(struct timer_list *t) { struct lpfc_hba *phba; uint32_t eratt = 0; uint64_t sli_intr, cnt; - phba = (struct lpfc_hba *)ptr; + phba = from_timer(phba, t, eratt_poll); /* Here we will also keep track of interrupts per sec of the hba */ sli_intr = phba->sli.slistat.sli_intr; @@ -7167,9 +7167,9 @@ out_free_mbox: * done by the worker thread function lpfc_mbox_timeout_handler. 
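 * The timer lives inside the SLI layer, so the owning lpfc_hba is
 * recovered with from_timer(phba, t, sli.mbox_tmo).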
**/ void -lpfc_mbox_timeout(unsigned long ptr) +lpfc_mbox_timeout(struct timer_list *t) { - struct lpfc_hba *phba = (struct lpfc_hba *) ptr; + struct lpfc_hba *phba = from_timer(phba, t, sli.mbox_tmo); unsigned long iflag; uint32_t tmo_posted; From c251a7be46b0ee64d02b321081e24ea51fae2cfe Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 15:30:04 -0700 Subject: [PATCH 0784/1085] scsi: megaraid: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Also consolidates the timer setup functions arguments, which are all identical, and corrects on-stack timer usage. Cc: Kashyap Desai Cc: Sumit Saxena Cc: Shivasharan S Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: megaraidlinux.pdl@broadcom.com Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_ioctl.h | 6 ++++ drivers/scsi/megaraid/megaraid_mbox.c | 26 ++++++++-------- drivers/scsi/megaraid/megaraid_mm.c | 27 ++++++++--------- drivers/scsi/megaraid/megaraid_sas_base.c | 33 ++++++++------------- drivers/scsi/megaraid/megaraid_sas_fusion.c | 15 ++-------- 5 files changed, 46 insertions(+), 61 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_ioctl.h b/drivers/scsi/megaraid/megaraid_ioctl.h index 05f6e4ec3453..eedcbde46459 100644 --- a/drivers/scsi/megaraid/megaraid_ioctl.h +++ b/drivers/scsi/megaraid/megaraid_ioctl.h @@ -19,6 +19,7 @@ #include #include +#include #include "mbox_defs.h" @@ -153,6 +154,11 @@ typedef struct uioc { } __attribute__ ((aligned(1024),packed)) uioc_t; +/* For on-stack uioc timers. */ +struct uioc_timeout { + struct timer_list timer; + uioc_t *uioc; +}; /** * struct mraid_hba_info - information about the controller diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c index ec3c43854978..530358cdcb39 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.c +++ b/drivers/scsi/megaraid/megaraid_mbox.c @@ -3904,19 +3904,19 @@ megaraid_sysfs_get_ldmap_done(uioc_t *uioc) wake_up(&raid_dev->sysfs_wait_q); } - /** * megaraid_sysfs_get_ldmap_timeout - timeout handling for get ldmap - * @data : timed out packet + * @t : timed out timer * * Timeout routine to recover and return to application, in case the adapter * has stopped responding. A timeout of 60 seconds for this command seems like * a good value. 
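 * The uioc packet now arrives wrapped in struct uioc_timeout, so the
 * handler digs it out via from_timer() and the uioc back-pointer.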
*/ static void -megaraid_sysfs_get_ldmap_timeout(unsigned long data) +megaraid_sysfs_get_ldmap_timeout(struct timer_list *t) { - uioc_t *uioc = (uioc_t *)data; + struct uioc_timeout *timeout = from_timer(timeout, t, timer); + uioc_t *uioc = timeout->uioc; adapter_t *adapter = (adapter_t *)uioc->buf_vaddr; mraid_device_t *raid_dev = ADAP2RAIDDEV(adapter); @@ -3951,8 +3951,7 @@ megaraid_sysfs_get_ldmap(adapter_t *adapter) mbox64_t *mbox64; mbox_t *mbox; char *raw_mbox; - struct timer_list sysfs_timer; - struct timer_list *timerp; + struct uioc_timeout timeout; caddr_t ldmap; int rval = 0; @@ -3988,14 +3987,12 @@ megaraid_sysfs_get_ldmap(adapter_t *adapter) /* * Setup a timer to recover from a non-responding controller */ - timerp = &sysfs_timer; - init_timer(timerp); + timeout.uioc = uioc; + timer_setup_on_stack(&timeout.timer, + megaraid_sysfs_get_ldmap_timeout, 0); - timerp->function = megaraid_sysfs_get_ldmap_timeout; - timerp->data = (unsigned long)uioc; - timerp->expires = jiffies + 60 * HZ; - - add_timer(timerp); + timeout.timer.expires = jiffies + 60 * HZ; + add_timer(&timeout.timer); /* * Send the command to the firmware @@ -4033,7 +4030,8 @@ megaraid_sysfs_get_ldmap(adapter_t *adapter) } - del_timer_sync(timerp); + del_timer_sync(&timeout.timer); + destroy_timer_on_stack(&timeout.timer); mutex_unlock(&raid_dev->sysfs_mtx); diff --git a/drivers/scsi/megaraid/megaraid_mm.c b/drivers/scsi/megaraid/megaraid_mm.c index 65b6f6ace3a5..bb802b0c12b8 100644 --- a/drivers/scsi/megaraid/megaraid_mm.c +++ b/drivers/scsi/megaraid/megaraid_mm.c @@ -35,7 +35,7 @@ static int kioc_to_mimd(uioc_t *, mimd_t __user *); static int handle_drvrcmd(void __user *, uint8_t, int *); static int lld_ioctl(mraid_mmadp_t *, uioc_t *); static void ioctl_done(uioc_t *); -static void lld_timedout(unsigned long); +static void lld_timedout(struct timer_list *); static void hinfo_to_cinfo(mraid_hba_info_t *, mcontroller_t *); static mraid_mmadp_t *mraid_mm_get_adapter(mimd_t __user *, int *); static uioc_t *mraid_mm_alloc_kioc(mraid_mmadp_t *); @@ -686,8 +686,7 @@ static int lld_ioctl(mraid_mmadp_t *adp, uioc_t *kioc) { int rval; - struct timer_list timer; - struct timer_list *tp = NULL; + struct uioc_timeout timeout = { }; kioc->status = -ENODATA; rval = adp->issue_uioc(adp->drvr_data, kioc, IOCTL_ISSUE); @@ -698,14 +697,12 @@ lld_ioctl(mraid_mmadp_t *adp, uioc_t *kioc) * Start the timer */ if (adp->timeout > 0) { - tp = &timer; - init_timer(tp); + timeout.uioc = kioc; + timer_setup_on_stack(&timeout.timer, lld_timedout, 0); - tp->function = lld_timedout; - tp->data = (unsigned long)kioc; - tp->expires = jiffies + adp->timeout * HZ; + timeout.timer.expires = jiffies + adp->timeout * HZ; - add_timer(tp); + add_timer(&timeout.timer); } /* @@ -713,8 +710,9 @@ lld_ioctl(mraid_mmadp_t *adp, uioc_t *kioc) * call, the ioctl either completed successfully or timedout. 
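 * An armed on-stack timer is recognizable by a non-NULL
 * timer.function and must be released with destroy_timer_on_stack()
 * before the frame goes away.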
*/ wait_event(wait_q, (kioc->status != -ENODATA)); - if (tp) { - del_timer_sync(tp); + if (timeout.timer.function) { + del_timer_sync(&timeout.timer); + destroy_timer_on_stack(&timeout.timer); } /* @@ -783,12 +781,13 @@ ioctl_done(uioc_t *kioc) /** * lld_timedout - callback from the expired timer - * @ptr : ioctl packet that timed out + * @t : timer that timed out */ static void -lld_timedout(unsigned long ptr) +lld_timedout(struct timer_list *t) { - uioc_t *kioc = (uioc_t *)ptr; + struct uioc_timeout *timeout = from_timer(timeout, t, timer); + uioc_t *kioc = timeout->uioc; kioc->status = -ETIME; kioc->timedout = 1; diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index e518dadc8161..a36e18156e49 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -2114,22 +2114,19 @@ static void megasas_complete_cmd_dpc(unsigned long instance_addr) megasas_check_and_restore_queue_depth(instance); } +static void megasas_sriov_heartbeat_handler(struct timer_list *t); + /** - * megasas_start_timer - Initializes a timer object + * megasas_start_timer - Initializes sriov heartbeat timer object * @instance: Adapter soft state - * @timer: timer object to be initialized - * @fn: timer function - * @interval: time interval between timer function call * */ -void megasas_start_timer(struct megasas_instance *instance, - struct timer_list *timer, - void *fn, unsigned long interval) +void megasas_start_timer(struct megasas_instance *instance) { - init_timer(timer); - timer->expires = jiffies + interval; - timer->data = (unsigned long)instance; - timer->function = fn; + struct timer_list *timer = &instance->sriov_heartbeat_timer; + + timer_setup(timer, megasas_sriov_heartbeat_handler, 0); + timer->expires = jiffies + MEGASAS_SRIOV_HEARTBEAT_INTERVAL_VF; add_timer(timer); } @@ -2515,10 +2512,10 @@ out: } /* Handler for SR-IOV heartbeat */ -void megasas_sriov_heartbeat_handler(unsigned long instance_addr) +static void megasas_sriov_heartbeat_handler(struct timer_list *t) { struct megasas_instance *instance = - (struct megasas_instance *)instance_addr; + from_timer(instance, t, sriov_heartbeat_timer); if (instance->hb_host_mem->HB.fwCounter != instance->hb_host_mem->HB.driverCounter) { @@ -5493,10 +5490,7 @@ static int megasas_init_fw(struct megasas_instance *instance) /* Launch SR-IOV heartbeat timer */ if (instance->requestorId) { if (!megasas_sriov_start_heartbeat(instance, 1)) - megasas_start_timer(instance, - &instance->sriov_heartbeat_timer, - megasas_sriov_heartbeat_handler, - MEGASAS_SRIOV_HEARTBEAT_INTERVAL_VF); + megasas_start_timer(instance); else instance->skip_heartbeat_timer_del = 1; } @@ -6507,10 +6501,7 @@ megasas_resume(struct pci_dev *pdev) /* Re-launch SR-IOV heartbeat timer */ if (instance->requestorId) { if (!megasas_sriov_start_heartbeat(instance, 0)) - megasas_start_timer(instance, - &instance->sriov_heartbeat_timer, - megasas_sriov_heartbeat_handler, - MEGASAS_SRIOV_HEARTBEAT_INTERVAL_VF); + megasas_start_timer(instance); else { instance->skip_heartbeat_timer_del = 1; goto fail_init_mfi; diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index 11bd2e698b84..3c399e7b3fe1 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -85,12 +85,9 @@ int megasas_transition_to_ready(struct megasas_instance *instance, int ocr); void megaraid_sas_kill_hba(struct megasas_instance *instance); extern u32 
megasas_dbg_lvl; -void megasas_sriov_heartbeat_handler(unsigned long instance_addr); int megasas_sriov_start_heartbeat(struct megasas_instance *instance, int initial); -void megasas_start_timer(struct megasas_instance *instance, - struct timer_list *timer, - void *fn, unsigned long interval); +void megasas_start_timer(struct megasas_instance *instance); extern struct megasas_mgmt_info megasas_mgmt_info; extern unsigned int resetwaittime; extern unsigned int dual_qdepth_disable; @@ -4369,10 +4366,7 @@ transition_to_ready: /* Restart SR-IOV heartbeat */ if (instance->requestorId) { if (!megasas_sriov_start_heartbeat(instance, 0)) - megasas_start_timer(instance, - &instance->sriov_heartbeat_timer, - megasas_sriov_heartbeat_handler, - MEGASAS_SRIOV_HEARTBEAT_INTERVAL_VF); + megasas_start_timer(instance); else instance->skip_heartbeat_timer_del = 1; } @@ -4404,10 +4398,7 @@ fail_kill_adapter: } else { /* For VF: Restart HB timer if we didn't OCR */ if (instance->requestorId) { - megasas_start_timer(instance, - &instance->sriov_heartbeat_timer, - megasas_sriov_heartbeat_handler, - MEGASAS_SRIOV_HEARTBEAT_INTERVAL_VF); + megasas_start_timer(instance); } clear_bit(MEGASAS_FUSION_IN_RESET, &instance->reset_flags); instance->instancet->enable_intr(instance); From 242b56579ee853bcbdd7c7f9fcd62890103c52e8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Aug 2017 12:48:29 -0700 Subject: [PATCH 0785/1085] scsi: pmcraid: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. 
Petersen --- drivers/scsi/pmcraid.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index b4d6cd8cd1ad..4f9f115fb6a0 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -348,7 +348,7 @@ static void pmcraid_init_cmdblk(struct pmcraid_cmd *cmd, int index) cmd->sense_buffer = NULL; cmd->sense_buffer_dma = 0; cmd->dma_handle = 0; - init_timer(&cmd->timer); + timer_setup(&cmd->timer, NULL, 0); } /** @@ -557,8 +557,9 @@ static void pmcraid_reset_type(struct pmcraid_instance *pinstance) static void pmcraid_ioa_reset(struct pmcraid_cmd *); -static void pmcraid_bist_done(struct pmcraid_cmd *cmd) +static void pmcraid_bist_done(struct timer_list *t) { + struct pmcraid_cmd *cmd = from_timer(cmd, t, timer); struct pmcraid_instance *pinstance = cmd->drv_inst; unsigned long lock_flags; int rc; @@ -572,9 +573,6 @@ static void pmcraid_bist_done(struct pmcraid_cmd *cmd) pmcraid_info("BIST not complete, waiting another 2 secs\n"); cmd->timer.expires = jiffies + cmd->time_left; cmd->time_left = 0; - cmd->timer.data = (unsigned long)cmd; - cmd->timer.function = - (void (*)(unsigned long))pmcraid_bist_done; add_timer(&cmd->timer); } else { cmd->time_left = 0; @@ -605,9 +603,8 @@ static void pmcraid_start_bist(struct pmcraid_cmd *cmd) doorbells, intrs); cmd->time_left = msecs_to_jiffies(PMCRAID_BIST_TIMEOUT); - cmd->timer.data = (unsigned long)cmd; cmd->timer.expires = jiffies + msecs_to_jiffies(PMCRAID_BIST_TIMEOUT); - cmd->timer.function = (void (*)(unsigned long))pmcraid_bist_done; + cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_bist_done; add_timer(&cmd->timer); } @@ -617,8 +614,9 @@ static void pmcraid_start_bist(struct pmcraid_cmd *cmd) * Return value * None */ -static void pmcraid_reset_alert_done(struct pmcraid_cmd *cmd) +static void pmcraid_reset_alert_done(struct timer_list *t) { + struct pmcraid_cmd *cmd = from_timer(cmd, t, timer); struct pmcraid_instance *pinstance = cmd->drv_inst; u32 status = ioread32(pinstance->ioa_status); unsigned long lock_flags; @@ -637,10 +635,8 @@ static void pmcraid_reset_alert_done(struct pmcraid_cmd *cmd) pmcraid_info("critical op is not yet reset waiting again\n"); /* restart timer if some more time is available to wait */ cmd->time_left -= PMCRAID_CHECK_FOR_RESET_TIMEOUT; - cmd->timer.data = (unsigned long)cmd; cmd->timer.expires = jiffies + PMCRAID_CHECK_FOR_RESET_TIMEOUT; - cmd->timer.function = - (void (*)(unsigned long))pmcraid_reset_alert_done; + cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_reset_alert_done; add_timer(&cmd->timer); } } @@ -676,10 +672,8 @@ static void pmcraid_reset_alert(struct pmcraid_cmd *cmd) * bit to be reset. 
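 * The timer.data assignment is gone: pmcraid_reset_alert_done() now
 * locates its pmcraid_cmd with from_timer() on the embedded timer.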
*/ cmd->time_left = PMCRAID_RESET_TIMEOUT; - cmd->timer.data = (unsigned long)cmd; cmd->timer.expires = jiffies + PMCRAID_CHECK_FOR_RESET_TIMEOUT; - cmd->timer.function = - (void (*)(unsigned long))pmcraid_reset_alert_done; + cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_reset_alert_done; add_timer(&cmd->timer); iowrite32(DOORBELL_IOA_RESET_ALERT, @@ -704,8 +698,9 @@ static void pmcraid_reset_alert(struct pmcraid_cmd *cmd) * Return value: * None */ -static void pmcraid_timeout_handler(struct pmcraid_cmd *cmd) +static void pmcraid_timeout_handler(struct timer_list *t) { + struct pmcraid_cmd *cmd = from_timer(cmd, t, timer); struct pmcraid_instance *pinstance = cmd->drv_inst; unsigned long lock_flags; @@ -919,7 +914,7 @@ static void pmcraid_send_cmd( struct pmcraid_cmd *cmd, void (*cmd_done) (struct pmcraid_cmd *), unsigned long timeout, - void (*timeout_func) (struct pmcraid_cmd *) + void (*timeout_func) (struct timer_list *) ) { /* initialize done function */ @@ -927,9 +922,8 @@ static void pmcraid_send_cmd( if (timeout_func) { /* setup timeout handler */ - cmd->timer.data = (unsigned long)cmd; cmd->timer.expires = jiffies + timeout; - cmd->timer.function = (void (*)(unsigned long))timeout_func; + cmd->timer.function = (TIMER_FUNC_TYPE)timeout_func; add_timer(&cmd->timer); } @@ -1955,10 +1949,9 @@ static void pmcraid_soft_reset(struct pmcraid_cmd *cmd) * would re-initiate a reset */ cmd->cmd_done = pmcraid_ioa_reset; - cmd->timer.data = (unsigned long)cmd; cmd->timer.expires = jiffies + msecs_to_jiffies(PMCRAID_TRANSOP_TIMEOUT); - cmd->timer.function = (void (*)(unsigned long))pmcraid_timeout_handler; + cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_timeout_handler; if (!timer_pending(&cmd->timer)) add_timer(&cmd->timer); From 77570eedd92adfcf69fdde31183a56324f82ca5c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Aug 2017 16:05:14 -0700 Subject: [PATCH 0786/1085] scsi: sas: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This requires adding a pointer to hold the timer's target task, as there isn't a link back from slow_task. Cc: John Garry Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Jack Wang Cc: lindar_liu@usish.com Cc: Jens Axboe Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Benjamin Block Cc: Baoyou Xie Cc: Wei Yongjun Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. 
Petersen Acked-by: John Garry # for hisi_sas part Tested-by: John Garry # basic sanity test for hisi_sas Reviewed-by: Jack Wang --- drivers/scsi/hisi_sas/hisi_sas.h | 1 - drivers/scsi/hisi_sas/hisi_sas_main.c | 14 ++++++-------- drivers/scsi/hisi_sas/hisi_sas_v1_hw.c | 6 +++--- drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 24 +++++++++++------------- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 2 +- drivers/scsi/libsas/sas_expander.c | 8 ++++---- drivers/scsi/libsas/sas_init.c | 3 ++- drivers/scsi/libsas/sas_scsi_host.c | 2 +- drivers/scsi/mvsas/mv_init.c | 3 +-- drivers/scsi/mvsas/mv_sas.c | 15 +++++++-------- drivers/scsi/mvsas/mv_sas.h | 1 - drivers/scsi/pm8001/pm8001_sas.c | 11 +++++------ include/scsi/libsas.h | 1 + 13 files changed, 42 insertions(+), 49 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index 07f4a4cfbec1..15692ea05ced 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -103,7 +103,6 @@ struct hisi_sas_phy { struct hisi_sas_port *port; struct asd_sas_phy sas_phy; struct sas_identify identify; - struct timer_list timer; struct work_struct phyup_ws; u64 port_id; /* from hw */ u64 dev_sas_addr; diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 16664f2e15fb..37c838be4757 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -627,7 +627,6 @@ static void hisi_sas_phy_init(struct hisi_hba *hisi_hba, int phy_no) phy->hisi_hba = hisi_hba; phy->port = NULL; - init_timer(&phy->timer); sas_phy->enabled = (phy_no < hisi_hba->n_phy) ? 1 : 0; sas_phy->class = SAS; sas_phy->iproto = SAS_PROTOCOL_ALL; @@ -792,9 +791,10 @@ static void hisi_sas_task_done(struct sas_task *task) complete(&task->slow_task->completion); } -static void hisi_sas_tmf_timedout(unsigned long data) +static void hisi_sas_tmf_timedout(struct timer_list *t) { - struct sas_task *task = (struct sas_task *)data; + struct sas_task_slow *slow = from_timer(slow, t, timer); + struct sas_task *task = slow->task; unsigned long flags; spin_lock_irqsave(&task->task_state_lock, flags); @@ -833,8 +833,7 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, } task->task_done = hisi_sas_task_done; - task->slow_task->timer.data = (unsigned long) task; - task->slow_task->timer.function = hisi_sas_tmf_timedout; + task->slow_task->timer.function = (TIMER_FUNC_TYPE)hisi_sas_tmf_timedout; task->slow_task->timer.expires = jiffies + TASK_TIMEOUT*HZ; add_timer(&task->slow_task->timer); @@ -1447,8 +1446,7 @@ hisi_sas_internal_task_abort(struct hisi_hba *hisi_hba, task->dev = device; task->task_proto = device->tproto; task->task_done = hisi_sas_task_done; - task->slow_task->timer.data = (unsigned long)task; - task->slow_task->timer.function = hisi_sas_tmf_timedout; + task->slow_task->timer.function = (TIMER_FUNC_TYPE)hisi_sas_tmf_timedout; task->slow_task->timer.expires = jiffies + msecs_to_jiffies(110); add_timer(&task->slow_task->timer); @@ -1877,7 +1875,7 @@ static struct Scsi_Host *hisi_sas_shost_alloc(struct platform_device *pdev, hisi_hba->shost = shost; SHOST_TO_SAS_HA(shost) = &hisi_hba->sha; - init_timer(&hisi_hba->timer); + timer_setup(&hisi_hba->timer, NULL, 0); if (hisi_sas_get_fw_info(hisi_hba) < 0) goto err_out; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c index 08eca20b0b81..9385554e43a6 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c @@ -807,9 +807,9 
@@ static void phy_hard_reset_v1_hw(struct hisi_hba *hisi_hba, int phy_no) start_phy_v1_hw(hisi_hba, phy_no); } -static void start_phys_v1_hw(unsigned long data) +static void start_phys_v1_hw(struct timer_list *t) { - struct hisi_hba *hisi_hba = (struct hisi_hba *)data; + struct hisi_hba *hisi_hba = from_timer(hisi_hba, t, timer); int i; for (i = 0; i < hisi_hba->n_phy; i++) { @@ -828,7 +828,7 @@ static void phys_init_v1_hw(struct hisi_hba *hisi_hba) hisi_sas_phy_read32(hisi_hba, i, CHL_INT2_MSK); } - setup_timer(timer, start_phys_v1_hw, (unsigned long)hisi_hba); + timer_setup(timer, start_phys_v1_hw, 0); mod_timer(timer, jiffies + HZ); } diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 779af979b6db..b1f097dabd01 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -728,7 +728,7 @@ enum { #define ERR_ON_RX_PHASE(err_phase) (err_phase == 0x10 || \ err_phase == 0x20 || err_phase == 0x40) -static void link_timeout_disable_link(unsigned long data); +static void link_timeout_disable_link(struct timer_list *t); static u32 hisi_sas_read32(struct hisi_hba *hisi_hba, u32 off) { @@ -1270,9 +1270,9 @@ static void init_reg_v2_hw(struct hisi_hba *hisi_hba) upper_32_bits(hisi_hba->initial_fis_dma)); } -static void link_timeout_enable_link(unsigned long data) +static void link_timeout_enable_link(struct timer_list *t) { - struct hisi_hba *hisi_hba = (struct hisi_hba *)data; + struct hisi_hba *hisi_hba = from_timer(hisi_hba, t, timer); int i, reg_val; for (i = 0; i < hisi_hba->n_phy; i++) { @@ -1287,13 +1287,13 @@ static void link_timeout_enable_link(unsigned long data) } } - hisi_hba->timer.function = link_timeout_disable_link; + hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_disable_link; mod_timer(&hisi_hba->timer, jiffies + msecs_to_jiffies(900)); } -static void link_timeout_disable_link(unsigned long data) +static void link_timeout_disable_link(struct timer_list *t) { - struct hisi_hba *hisi_hba = (struct hisi_hba *)data; + struct hisi_hba *hisi_hba = from_timer(hisi_hba, t, timer); int i, reg_val; reg_val = hisi_sas_read32(hisi_hba, PHY_STATE); @@ -1308,14 +1308,13 @@ static void link_timeout_disable_link(unsigned long data) } } - hisi_hba->timer.function = link_timeout_enable_link; + hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_enable_link; mod_timer(&hisi_hba->timer, jiffies + msecs_to_jiffies(100)); } static void set_link_timer_quirk(struct hisi_hba *hisi_hba) { - hisi_hba->timer.data = (unsigned long)hisi_hba; - hisi_hba->timer.function = link_timeout_disable_link; + hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_disable_link; hisi_hba->timer.expires = jiffies + msecs_to_jiffies(1000); add_timer(&hisi_hba->timer); } @@ -2574,9 +2573,9 @@ static int prep_ata_v2_hw(struct hisi_hba *hisi_hba, return 0; } -static void hisi_sas_internal_abort_quirk_timeout(unsigned long data) +static void hisi_sas_internal_abort_quirk_timeout(struct timer_list *t) { - struct hisi_sas_slot *slot = (struct hisi_sas_slot *)data; + struct hisi_sas_slot *slot = from_timer(slot, t, internal_abort_timer); struct hisi_sas_port *port = slot->port; struct asd_sas_port *asd_sas_port; struct asd_sas_phy *sas_phy; @@ -2619,8 +2618,7 @@ static int prep_abort_v2_hw(struct hisi_hba *hisi_hba, struct timer_list *timer = &slot->internal_abort_timer; /* setup the quirk timer */ - setup_timer(timer, hisi_sas_internal_abort_quirk_timeout, - (unsigned long)slot); + timer_setup(timer, 
hisi_sas_internal_abort_quirk_timeout, 0); /* Set the timeout to 10ms less than internal abort timeout */ mod_timer(timer, jiffies + msecs_to_jiffies(100)); diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 2e5fa9717be8..3f2f0baf2a5e 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -1823,7 +1823,7 @@ hisi_sas_shost_alloc_pci(struct pci_dev *pdev) hisi_hba->shost = shost; SHOST_TO_SAS_HA(shost) = &hisi_hba->sha; - init_timer(&hisi_hba->timer); + timer_setup(&hisi_hba->timer, NULL, 0); if (hisi_sas_get_fw_info(hisi_hba) < 0) goto err_out; diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 6b4fd2375178..174e5eff6155 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -41,9 +41,10 @@ static int sas_disable_routing(struct domain_device *dev, u8 *sas_addr); /* ---------- SMP task management ---------- */ -static void smp_task_timedout(unsigned long _task) +static void smp_task_timedout(struct timer_list *t) { - struct sas_task *task = (void *) _task; + struct sas_task_slow *slow = from_timer(slow, t, timer); + struct sas_task *task = slow->task; unsigned long flags; spin_lock_irqsave(&task->task_state_lock, flags); @@ -91,8 +92,7 @@ static int smp_execute_task_sg(struct domain_device *dev, task->task_done = smp_task_done; - task->slow_task->timer.data = (unsigned long) task; - task->slow_task->timer.function = smp_task_timedout; + task->slow_task->timer.function = (TIMER_FUNC_TYPE)smp_task_timedout; task->slow_task->timer.expires = jiffies + SMP_TIMEOUT*HZ; add_timer(&task->slow_task->timer); diff --git a/drivers/scsi/libsas/sas_init.c b/drivers/scsi/libsas/sas_init.c index 64e9cdda1c3c..681fcb837354 100644 --- a/drivers/scsi/libsas/sas_init.c +++ b/drivers/scsi/libsas/sas_init.c @@ -66,7 +66,8 @@ struct sas_task *sas_alloc_slow_task(gfp_t flags) } task->slow_task = slow; - init_timer(&slow->timer); + slow->task = task; + timer_setup(&slow->timer, NULL, 0); init_completion(&slow->completion); return task; diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index ea8ad06ff582..91795eb56206 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -919,7 +919,7 @@ void sas_task_abort(struct sas_task *task) return; if (!del_timer(&slow->timer)) return; - slow->timer.function(slow->timer.data); + slow->timer.function((TIMER_DATA_TYPE)&slow->timer); return; } diff --git a/drivers/scsi/mvsas/mv_init.c b/drivers/scsi/mvsas/mv_init.c index 718c88de328b..8c91637cd598 100644 --- a/drivers/scsi/mvsas/mv_init.c +++ b/drivers/scsi/mvsas/mv_init.c @@ -95,7 +95,7 @@ static void mvs_phy_init(struct mvs_info *mvi, int phy_id) phy->mvi = mvi; phy->port = NULL; - init_timer(&phy->timer); + timer_setup(&phy->timer, NULL, 0); sas_phy->enabled = (phy_id < mvi->chip->n_phy) ? 
1 : 0; sas_phy->class = SAS; sas_phy->iproto = SAS_PROTOCOL_ALL; @@ -248,7 +248,6 @@ static int mvs_alloc(struct mvs_info *mvi, struct Scsi_Host *shost) mvi->devices[i].dev_type = SAS_PHY_UNUSED; mvi->devices[i].device_id = i; mvi->devices[i].dev_status = MVS_DEV_NORMAL; - init_timer(&mvi->devices[i].timer); } /* diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index ee81d10252e0..cff1c37b8d2e 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -1283,9 +1283,10 @@ static void mvs_task_done(struct sas_task *task) complete(&task->slow_task->completion); } -static void mvs_tmf_timedout(unsigned long data) +static void mvs_tmf_timedout(struct timer_list *t) { - struct sas_task *task = (struct sas_task *)data; + struct sas_task_slow *slow = from_timer(slow, t, timer); + struct sas_task *task = slow->task; task->task_state_flags |= SAS_TASK_STATE_ABORTED; complete(&task->slow_task->completion); @@ -1309,8 +1310,7 @@ static int mvs_exec_internal_tmf_task(struct domain_device *dev, memcpy(&task->ssp_task, parameter, para_len); task->task_done = mvs_task_done; - task->slow_task->timer.data = (unsigned long) task; - task->slow_task->timer.function = mvs_tmf_timedout; + task->slow_task->timer.function = (TIMER_FUNC_TYPE)mvs_tmf_timedout; task->slow_task->timer.expires = jiffies + MVS_TASK_TIMEOUT*HZ; add_timer(&task->slow_task->timer); @@ -1954,9 +1954,9 @@ static int mvs_handle_event(struct mvs_info *mvi, void *data, int handler) return ret; } -static void mvs_sig_time_out(unsigned long tphy) +static void mvs_sig_time_out(struct timer_list *t) { - struct mvs_phy *phy = (struct mvs_phy *)tphy; + struct mvs_phy *phy = from_timer(phy, t, timer); struct mvs_info *mvi = phy->mvi; u8 phy_no; @@ -2020,8 +2020,7 @@ void mvs_int_port(struct mvs_info *mvi, int phy_no, u32 events) MVS_CHIP_DISP->write_port_irq_mask(mvi, phy_no, tmp | PHYEV_SIG_FIS); if (phy->timer.function == NULL) { - phy->timer.data = (unsigned long)phy; - phy->timer.function = mvs_sig_time_out; + phy->timer.function = (TIMER_FUNC_TYPE)mvs_sig_time_out; phy->timer.expires = jiffies + 5*HZ; add_timer(&phy->timer); } diff --git a/drivers/scsi/mvsas/mv_sas.h b/drivers/scsi/mvsas/mv_sas.h index f9afd4cdd4c4..080676c1c9e5 100644 --- a/drivers/scsi/mvsas/mv_sas.h +++ b/drivers/scsi/mvsas/mv_sas.h @@ -247,7 +247,6 @@ struct mvs_device { enum sas_device_type dev_type; struct mvs_info *mvi_info; struct domain_device *sas_device; - struct timer_list timer; u32 attached_phy; u32 device_id; u32 running_req; diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index ce584c31d36e..7b2f92ae9866 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -656,9 +656,10 @@ void pm8001_task_done(struct sas_task *task) complete(&task->slow_task->completion); } -static void pm8001_tmf_timedout(unsigned long data) +static void pm8001_tmf_timedout(struct timer_list *t) { - struct sas_task *task = (struct sas_task *)data; + struct sas_task_slow *slow = from_timer(slow, t, timer); + struct sas_task *task = slow->task; task->task_state_flags |= SAS_TASK_STATE_ABORTED; complete(&task->slow_task->completion); @@ -694,8 +695,7 @@ static int pm8001_exec_internal_tmf_task(struct domain_device *dev, task->task_proto = dev->tproto; memcpy(&task->ssp_task, parameter, para_len); task->task_done = pm8001_task_done; - task->slow_task->timer.data = (unsigned long)task; - task->slow_task->timer.function = pm8001_tmf_timedout; + task->slow_task->timer.function = 
(TIMER_FUNC_TYPE)pm8001_tmf_timedout; task->slow_task->timer.expires = jiffies + PM8001_TASK_TIMEOUT*HZ; add_timer(&task->slow_task->timer); @@ -781,8 +781,7 @@ pm8001_exec_internal_task_abort(struct pm8001_hba_info *pm8001_ha, task->dev = dev; task->task_proto = dev->tproto; task->task_done = pm8001_task_done; - task->slow_task->timer.data = (unsigned long)task; - task->slow_task->timer.function = pm8001_tmf_timedout; + task->slow_task->timer.function = (TIMER_FUNC_TYPE)pm8001_tmf_timedout; task->slow_task->timer.expires = jiffies + PM8001_TASK_TIMEOUT * HZ; add_timer(&task->slow_task->timer); diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index 6c0dc6155ee7..388aaf72b480 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -629,6 +629,7 @@ struct sas_task_slow { */ struct timer_list timer; struct completion completion; + struct sas_task *task; }; #define SAS_TASK_STATE_PENDING 1 From d744644ad7088ec04c0e1ad7c6c1adcec849c51c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 21 Oct 2017 08:39:39 -0700 Subject: [PATCH 0787/1085] scsi: qla4xxx: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: QLogic-Storage-Upstream@qlogic.com Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K. Petersen Acked-by: Manish Rangankar --- drivers/scsi/qla4xxx/ql4_os.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 64c6fa563fdb..2b8a8ce2a431 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -3955,16 +3955,15 @@ exit_session_conn_param: /* * Timer routines */ +static void qla4xxx_timer(struct timer_list *t); -static void qla4xxx_start_timer(struct scsi_qla_host *ha, void *func, +static void qla4xxx_start_timer(struct scsi_qla_host *ha, unsigned long interval) { DEBUG(printk("scsi: %s: Starting timer thread for adapter %d\n", __func__, ha->host->host_no)); - init_timer(&ha->timer); + timer_setup(&ha->timer, qla4xxx_timer, 0); ha->timer.expires = jiffies + interval * HZ; - ha->timer.data = (unsigned long)ha; - ha->timer.function = (void (*)(unsigned long))func; add_timer(&ha->timer); ha->timer_active = 1; } @@ -4508,8 +4507,9 @@ static void qla4xxx_check_relogin_flash_ddb(struct iscsi_cls_session *cls_sess) * qla4xxx_timer - checks every second for work to do. * @ha: Pointer to host adapter structure. **/ -static void qla4xxx_timer(struct scsi_qla_host *ha) +static void qla4xxx_timer(struct timer_list *t) { + struct scsi_qla_host *ha = from_timer(ha, t, timer); int start_dpc = 0; uint16_t w; @@ -8805,7 +8805,7 @@ skip_retry_init: ha->isp_ops->enable_intrs(ha); /* Start timer thread. */ - qla4xxx_start_timer(ha, qla4xxx_timer, 1); + qla4xxx_start_timer(ha, 1); set_bit(AF_INIT_DONE, &ha->flags); From 8a47aa9dc636db851254615ea79ba91a52cf9206 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 23 May 2017 16:48:50 -0700 Subject: [PATCH 0788/1085] target/iscsi: Simplify timer manipulation code Move timer initialization from before add_timer() to the context where the containing object is initialized. Use setup_timer() and mod_timer() instead of open coding these. Use 'jiffies' instead of get_jiffies_64() when calculating expiry times because expiry times have type unsigned long, just like 'jiffies'. 
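
Reduced to a minimal sketch, the simplification this commit describes looks like
the following. All demo_* names are invented for illustration and appear nowhere
in the patch; setup_timer(), mod_timer(), init_timer() and get_jiffies_64() are
the real kernel interfaces of this era.

	/* Hypothetical object embedding a timer; for illustration only. */
	struct demo_session {
		struct timer_list retain_timer;
		unsigned int retain_secs;
	};

	static void demo_retain_timeout(unsigned long data)
	{
		struct demo_session *sess = (struct demo_session *)data;

		/* ... tear the session down ... */
	}

	/* Before: every arming site re-initializes the whole timer and
	 * computes the expiry with get_jiffies_64(), a u64, even though
	 * timer.expires is an unsigned long. */
	static void demo_start_retain_old(struct demo_session *sess)
	{
		init_timer(&sess->retain_timer);
		sess->retain_timer.expires = get_jiffies_64() +
					     sess->retain_secs * HZ;
		sess->retain_timer.data = (unsigned long)sess;
		sess->retain_timer.function = demo_retain_timeout;
		add_timer(&sess->retain_timer);
	}

	/* After: callback and cookie are bound once, where the object is
	 * created, and arming sites only (re)arm with a jiffies expiry. */
	static void demo_session_init(struct demo_session *sess)
	{
		setup_timer(&sess->retain_timer, demo_retain_timeout,
			    (unsigned long)sess);
	}

	static void demo_start_retain(struct demo_session *sess)
	{
		mod_timer(&sess->retain_timer,
			  jiffies + sess->retain_secs * HZ);
	}

The collapse is safe because mod_timer() on an inactive timer simply arms it,
so the open-coded init_timer()/add_timer() sequence is not needed at the
arming sites.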
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Cc: Andy Grover Cc: David Disseldorp Signed-off-by: Kees Cook --- drivers/target/iscsi/iscsi_target.c | 3 +++ drivers/target/iscsi/iscsi_target_erl0.c | 10 +++------ drivers/target/iscsi/iscsi_target_erl0.h | 1 + drivers/target/iscsi/iscsi_target_erl1.c | 8 ++----- drivers/target/iscsi/iscsi_target_erl1.h | 1 + drivers/target/iscsi/iscsi_target_login.c | 16 ++++++++------ drivers/target/iscsi/iscsi_target_login.h | 1 + drivers/target/iscsi/iscsi_target_nego.c | 8 +++---- drivers/target/iscsi/iscsi_target_util.c | 26 +++++++---------------- drivers/target/iscsi/iscsi_target_util.h | 2 ++ 10 files changed, 34 insertions(+), 42 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 5001261f5d69..7d619160b3d3 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -372,6 +372,9 @@ struct iscsi_np *iscsit_add_np( init_completion(&np->np_restart_comp); INIT_LIST_HEAD(&np->np_list); + setup_timer(&np->np_login_timer, iscsi_handle_login_thread_timeout, + (unsigned long)np); + ret = iscsi_target_setup_login_socket(np, sockaddr); if (ret != 0) { kfree(np); diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c index 7fe2aa73cff6..f164098594e9 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -749,7 +749,7 @@ int iscsit_check_post_dataout( } } -static void iscsit_handle_time2retain_timeout(unsigned long data) +void iscsit_handle_time2retain_timeout(unsigned long data) { struct iscsi_session *sess = (struct iscsi_session *) data; struct iscsi_portal_group *tpg = sess->tpg; @@ -809,14 +809,10 @@ void iscsit_start_time2retain_handler(struct iscsi_session *sess) pr_debug("Starting Time2Retain timer for %u seconds on" " SID: %u\n", sess->sess_ops->DefaultTime2Retain, sess->sid); - init_timer(&sess->time2retain_timer); - sess->time2retain_timer.expires = - (get_jiffies_64() + sess->sess_ops->DefaultTime2Retain * HZ); - sess->time2retain_timer.data = (unsigned long)sess; - sess->time2retain_timer.function = iscsit_handle_time2retain_timeout; sess->time2retain_timer_flags &= ~ISCSI_TF_STOP; sess->time2retain_timer_flags |= ISCSI_TF_RUNNING; - add_timer(&sess->time2retain_timer); + mod_timer(&sess->time2retain_timer, + jiffies + sess->sess_ops->DefaultTime2Retain * HZ); } /* diff --git a/drivers/target/iscsi/iscsi_target_erl0.h b/drivers/target/iscsi/iscsi_target_erl0.h index 3822d9cd1230..97e4ae728926 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.h +++ b/drivers/target/iscsi/iscsi_target_erl0.h @@ -11,6 +11,7 @@ extern void iscsit_set_dataout_sequence_values(struct iscsi_cmd *); extern int iscsit_check_pre_dataout(struct iscsi_cmd *, unsigned char *); extern int iscsit_check_post_dataout(struct iscsi_cmd *, unsigned char *, u8); extern void iscsit_start_time2retain_handler(struct iscsi_session *); +extern void iscsit_handle_time2retain_timeout(unsigned long data); extern int iscsit_stop_time2retain_timer(struct iscsi_session *); extern void iscsit_connection_reinstatement_rcfr(struct iscsi_conn *); extern void iscsit_cause_connection_reinstatement(struct iscsi_conn *, int); diff --git a/drivers/target/iscsi/iscsi_target_erl1.c b/drivers/target/iscsi/iscsi_target_erl1.c index fe9b7f1e44ac..19b28255b687 100644 --- a/drivers/target/iscsi/iscsi_target_erl1.c +++ b/drivers/target/iscsi/iscsi_target_erl1.c @@ -1148,7 +1148,7 @@ static int 
iscsit_set_dataout_timeout_values( /* * NOTE: Called from interrupt (timer) context. */ -static void iscsit_handle_dataout_timeout(unsigned long data) +void iscsit_handle_dataout_timeout(unsigned long data) { u32 pdu_length = 0, pdu_offset = 0; u32 r2t_length = 0, r2t_offset = 0; @@ -1264,13 +1264,9 @@ void iscsit_start_dataout_timer( pr_debug("Starting DataOUT timer for ITT: 0x%08x on" " CID: %hu.\n", cmd->init_task_tag, conn->cid); - init_timer(&cmd->dataout_timer); - cmd->dataout_timer.expires = (get_jiffies_64() + na->dataout_timeout * HZ); - cmd->dataout_timer.data = (unsigned long)cmd; - cmd->dataout_timer.function = iscsit_handle_dataout_timeout; cmd->dataout_timer_flags &= ~ISCSI_TF_STOP; cmd->dataout_timer_flags |= ISCSI_TF_RUNNING; - add_timer(&cmd->dataout_timer); + mod_timer(&cmd->dataout_timer, jiffies + na->dataout_timeout * HZ); } void iscsit_stop_dataout_timer(struct iscsi_cmd *cmd) diff --git a/drivers/target/iscsi/iscsi_target_erl1.h b/drivers/target/iscsi/iscsi_target_erl1.h index 54d36bd25bea..0ff6e310ca36 100644 --- a/drivers/target/iscsi/iscsi_target_erl1.h +++ b/drivers/target/iscsi/iscsi_target_erl1.h @@ -29,6 +29,7 @@ extern int iscsit_execute_ooo_cmdsns(struct iscsi_session *); extern int iscsit_execute_cmd(struct iscsi_cmd *, int); extern int iscsit_handle_ooo_cmdsn(struct iscsi_session *, struct iscsi_cmd *, u32); extern void iscsit_remove_ooo_cmdsn(struct iscsi_session *, struct iscsi_ooo_cmdsn *); +extern void iscsit_handle_dataout_timeout(unsigned long data); extern void iscsit_mod_dataout_timer(struct iscsi_cmd *); extern void iscsit_start_dataout_timer(struct iscsi_cmd *, struct iscsi_conn *); extern void iscsit_stop_dataout_timer(struct iscsi_cmd *); diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index dc13afbd4c88..8be9221b6b9d 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -333,6 +333,9 @@ static int iscsi_login_zero_tsih_s1( spin_lock_init(&sess->session_usage_lock); spin_lock_init(&sess->ttt_lock); + setup_timer(&sess->time2retain_timer, iscsit_handle_time2retain_timeout, + (unsigned long)sess); + idr_preload(GFP_KERNEL); spin_lock_bh(&sess_idr_lock); ret = idr_alloc(&sess_idr, NULL, 0, 0, GFP_NOWAIT); @@ -839,7 +842,7 @@ void iscsi_post_login_handler( iscsit_dec_conn_usage_count(conn); } -static void iscsi_handle_login_thread_timeout(unsigned long data) +void iscsi_handle_login_thread_timeout(unsigned long data) { struct iscsi_np *np = (struct iscsi_np *) data; @@ -866,13 +869,9 @@ static void iscsi_start_login_thread_timer(struct iscsi_np *np) * point we do not have access to ISCSI_TPG_ATTRIB(tpg)->login_timeout */ spin_lock_bh(&np->np_thread_lock); - init_timer(&np->np_login_timer); - np->np_login_timer.expires = (get_jiffies_64() + TA_LOGIN_TIMEOUT * HZ); - np->np_login_timer.data = (unsigned long)np; - np->np_login_timer.function = iscsi_handle_login_thread_timeout; np->np_login_timer_flags &= ~ISCSI_TF_STOP; np->np_login_timer_flags |= ISCSI_TF_RUNNING; - add_timer(&np->np_login_timer); + mod_timer(&np->np_login_timer, jiffies + TA_LOGIN_TIMEOUT * HZ); pr_debug("Added timeout timer to iSCSI login request for" " %u seconds.\n", TA_LOGIN_TIMEOUT); @@ -1266,6 +1265,11 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) pr_debug("Moving to TARG_CONN_STATE_FREE.\n"); conn->conn_state = TARG_CONN_STATE_FREE; + setup_timer(&conn->nopin_response_timer, + iscsit_handle_nopin_response_timeout, (unsigned long)conn); + 
setup_timer(&conn->nopin_timer, iscsit_handle_nopin_timeout, + (unsigned long)conn); + if (iscsit_conn_set_transport(conn, np->np_transport) < 0) { kfree(conn); return 1; diff --git a/drivers/target/iscsi/iscsi_target_login.h b/drivers/target/iscsi/iscsi_target_login.h index 0e1fd6cedd54..f77850bf3e58 100644 --- a/drivers/target/iscsi/iscsi_target_login.h +++ b/drivers/target/iscsi/iscsi_target_login.h @@ -24,5 +24,6 @@ extern void iscsi_post_login_handler(struct iscsi_np *, struct iscsi_conn *, u8) extern void iscsi_target_login_sess_out(struct iscsi_conn *, struct iscsi_np *, bool, bool); extern int iscsi_target_login_thread(void *); +extern void iscsi_handle_login_thread_timeout(unsigned long data); #endif /*** ISCSI_TARGET_LOGIN_H ***/ diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c index 7a6751fecd32..c851bd9f3175 100644 --- a/drivers/target/iscsi/iscsi_target_nego.c +++ b/drivers/target/iscsi/iscsi_target_nego.c @@ -618,11 +618,9 @@ static void iscsi_target_do_login_rx(struct work_struct *work) conn->login_kworker = current; allow_signal(SIGINT); - init_timer(&login_timer); - login_timer.expires = (get_jiffies_64() + TA_LOGIN_TIMEOUT * HZ); - login_timer.data = (unsigned long)conn; - login_timer.function = iscsi_target_login_timeout; - add_timer(&login_timer); + setup_timer_on_stack(&login_timer, iscsi_target_login_timeout, + (unsigned long)conn); + mod_timer(&login_timer, jiffies + TA_LOGIN_TIMEOUT * HZ); pr_debug("Starting login_timer for %s/%d\n", current->comm, current->pid); rc = conn->conn_transport->iscsit_get_login_rx(conn, login); diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 1e36f83b5961..505dad5dab7f 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -176,6 +176,8 @@ struct iscsi_cmd *iscsit_allocate_cmd(struct iscsi_conn *conn, int state) spin_lock_init(&cmd->istate_lock); spin_lock_init(&cmd->error_lock); spin_lock_init(&cmd->r2t_lock); + setup_timer(&cmd->dataout_timer, iscsit_handle_dataout_timeout, + (unsigned long)cmd); return cmd; } @@ -880,7 +882,7 @@ static int iscsit_add_nopin(struct iscsi_conn *conn, int want_response) return 0; } -static void iscsit_handle_nopin_response_timeout(unsigned long data) +void iscsit_handle_nopin_response_timeout(unsigned long data) { struct iscsi_conn *conn = (struct iscsi_conn *) data; @@ -949,14 +951,10 @@ void iscsit_start_nopin_response_timer(struct iscsi_conn *conn) return; } - init_timer(&conn->nopin_response_timer); - conn->nopin_response_timer.expires = - (get_jiffies_64() + na->nopin_response_timeout * HZ); - conn->nopin_response_timer.data = (unsigned long)conn; - conn->nopin_response_timer.function = iscsit_handle_nopin_response_timeout; conn->nopin_response_timer_flags &= ~ISCSI_TF_STOP; conn->nopin_response_timer_flags |= ISCSI_TF_RUNNING; - add_timer(&conn->nopin_response_timer); + mod_timer(&conn->nopin_response_timer, + jiffies + na->nopin_response_timeout * HZ); pr_debug("Started NOPIN Response Timer on CID: %d to %u" " seconds\n", conn->cid, na->nopin_response_timeout); @@ -980,7 +978,7 @@ void iscsit_stop_nopin_response_timer(struct iscsi_conn *conn) spin_unlock_bh(&conn->nopin_timer_lock); } -static void iscsit_handle_nopin_timeout(unsigned long data) +void iscsit_handle_nopin_timeout(unsigned long data) { struct iscsi_conn *conn = (struct iscsi_conn *) data; @@ -1015,13 +1013,9 @@ void __iscsit_start_nopin_timer(struct iscsi_conn *conn) if 
(conn->nopin_timer_flags & ISCSI_TF_RUNNING) return; - init_timer(&conn->nopin_timer); - conn->nopin_timer.expires = (get_jiffies_64() + na->nopin_timeout * HZ); - conn->nopin_timer.data = (unsigned long)conn; - conn->nopin_timer.function = iscsit_handle_nopin_timeout; conn->nopin_timer_flags &= ~ISCSI_TF_STOP; conn->nopin_timer_flags |= ISCSI_TF_RUNNING; - add_timer(&conn->nopin_timer); + mod_timer(&conn->nopin_timer, jiffies + na->nopin_timeout * HZ); pr_debug("Started NOPIN Timer on CID: %d at %u second" " interval\n", conn->cid, na->nopin_timeout); @@ -1043,13 +1037,9 @@ void iscsit_start_nopin_timer(struct iscsi_conn *conn) return; } - init_timer(&conn->nopin_timer); - conn->nopin_timer.expires = (get_jiffies_64() + na->nopin_timeout * HZ); - conn->nopin_timer.data = (unsigned long)conn; - conn->nopin_timer.function = iscsit_handle_nopin_timeout; conn->nopin_timer_flags &= ~ISCSI_TF_STOP; conn->nopin_timer_flags |= ISCSI_TF_RUNNING; - add_timer(&conn->nopin_timer); + mod_timer(&conn->nopin_timer, jiffies + na->nopin_timeout * HZ); pr_debug("Started NOPIN Timer on CID: %d at %u second" " interval\n", conn->cid, na->nopin_timeout); diff --git a/drivers/target/iscsi/iscsi_target_util.h b/drivers/target/iscsi/iscsi_target_util.h index 425160565d0c..78b3b9d20963 100644 --- a/drivers/target/iscsi/iscsi_target_util.h +++ b/drivers/target/iscsi/iscsi_target_util.h @@ -47,9 +47,11 @@ extern struct iscsi_conn *iscsit_get_conn_from_cid_rcfr(struct iscsi_session *, extern void iscsit_check_conn_usage_count(struct iscsi_conn *); extern void iscsit_dec_conn_usage_count(struct iscsi_conn *); extern void iscsit_inc_conn_usage_count(struct iscsi_conn *); +extern void iscsit_handle_nopin_response_timeout(unsigned long data); extern void iscsit_mod_nopin_response_timer(struct iscsi_conn *); extern void iscsit_start_nopin_response_timer(struct iscsi_conn *); extern void iscsit_stop_nopin_response_timer(struct iscsi_conn *); +extern void iscsit_handle_nopin_timeout(unsigned long data); extern void __iscsit_start_nopin_timer(struct iscsi_conn *); extern void iscsit_start_nopin_timer(struct iscsi_conn *); extern void iscsit_stop_nopin_timer(struct iscsi_conn *); From f7c9564a7cfa3ff69be74bcb1867c898772748b6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 14:58:45 -0700 Subject: [PATCH 0789/1085] target/iscsi: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Includes a fix for correcting an on-stack timer usage. Cc: "Nicholas A. 
Bellinger" Cc: Bart Van Assche Cc: Jiang Yi Cc: Varun Prakash Cc: linux-scsi@vger.kernel.org Cc: target-devel@vger.kernel.org Reviewed-and-Tested-by: Bart Van Assche Signed-off-by: Kees Cook --- drivers/target/iscsi/iscsi_target.c | 3 +-- drivers/target/iscsi/iscsi_target_erl0.c | 4 ++-- drivers/target/iscsi/iscsi_target_erl0.h | 2 +- drivers/target/iscsi/iscsi_target_erl1.c | 4 ++-- drivers/target/iscsi/iscsi_target_erl1.h | 2 +- drivers/target/iscsi/iscsi_target_login.c | 15 +++++++-------- drivers/target/iscsi/iscsi_target_login.h | 2 +- drivers/target/iscsi/iscsi_target_nego.c | 23 +++++++++++++++-------- drivers/target/iscsi/iscsi_target_util.c | 11 +++++------ drivers/target/iscsi/iscsi_target_util.h | 4 ++-- 10 files changed, 37 insertions(+), 33 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 7d619160b3d3..9e67c7678c86 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -372,8 +372,7 @@ struct iscsi_np *iscsit_add_np( init_completion(&np->np_restart_comp); INIT_LIST_HEAD(&np->np_list); - setup_timer(&np->np_login_timer, iscsi_handle_login_thread_timeout, - (unsigned long)np); + timer_setup(&np->np_login_timer, iscsi_handle_login_thread_timeout, 0); ret = iscsi_target_setup_login_socket(np, sockaddr); if (ret != 0) { diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c index f164098594e9..718fe9a1b709 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -749,9 +749,9 @@ int iscsit_check_post_dataout( } } -void iscsit_handle_time2retain_timeout(unsigned long data) +void iscsit_handle_time2retain_timeout(struct timer_list *t) { - struct iscsi_session *sess = (struct iscsi_session *) data; + struct iscsi_session *sess = from_timer(sess, t, time2retain_timer); struct iscsi_portal_group *tpg = sess->tpg; struct se_portal_group *se_tpg = &tpg->tpg_se_tpg; diff --git a/drivers/target/iscsi/iscsi_target_erl0.h b/drivers/target/iscsi/iscsi_target_erl0.h index 97e4ae728926..dba5e7826574 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.h +++ b/drivers/target/iscsi/iscsi_target_erl0.h @@ -11,7 +11,7 @@ extern void iscsit_set_dataout_sequence_values(struct iscsi_cmd *); extern int iscsit_check_pre_dataout(struct iscsi_cmd *, unsigned char *); extern int iscsit_check_post_dataout(struct iscsi_cmd *, unsigned char *, u8); extern void iscsit_start_time2retain_handler(struct iscsi_session *); -extern void iscsit_handle_time2retain_timeout(unsigned long data); +extern void iscsit_handle_time2retain_timeout(struct timer_list *t); extern int iscsit_stop_time2retain_timer(struct iscsi_session *); extern void iscsit_connection_reinstatement_rcfr(struct iscsi_conn *); extern void iscsit_cause_connection_reinstatement(struct iscsi_conn *, int); diff --git a/drivers/target/iscsi/iscsi_target_erl1.c b/drivers/target/iscsi/iscsi_target_erl1.c index 19b28255b687..76184094a0cf 100644 --- a/drivers/target/iscsi/iscsi_target_erl1.c +++ b/drivers/target/iscsi/iscsi_target_erl1.c @@ -1148,11 +1148,11 @@ static int iscsit_set_dataout_timeout_values( /* * NOTE: Called from interrupt (timer) context. 
*/ -void iscsit_handle_dataout_timeout(unsigned long data) +void iscsit_handle_dataout_timeout(struct timer_list *t) { u32 pdu_length = 0, pdu_offset = 0; u32 r2t_length = 0, r2t_offset = 0; - struct iscsi_cmd *cmd = (struct iscsi_cmd *) data; + struct iscsi_cmd *cmd = from_timer(cmd, t, dataout_timer); struct iscsi_conn *conn = cmd->conn; struct iscsi_session *sess = NULL; struct iscsi_node_attrib *na; diff --git a/drivers/target/iscsi/iscsi_target_erl1.h b/drivers/target/iscsi/iscsi_target_erl1.h index 0ff6e310ca36..6a6e182674de 100644 --- a/drivers/target/iscsi/iscsi_target_erl1.h +++ b/drivers/target/iscsi/iscsi_target_erl1.h @@ -29,7 +29,7 @@ extern int iscsit_execute_ooo_cmdsns(struct iscsi_session *); extern int iscsit_execute_cmd(struct iscsi_cmd *, int); extern int iscsit_handle_ooo_cmdsn(struct iscsi_session *, struct iscsi_cmd *, u32); extern void iscsit_remove_ooo_cmdsn(struct iscsi_session *, struct iscsi_ooo_cmdsn *); -extern void iscsit_handle_dataout_timeout(unsigned long data); +extern void iscsit_handle_dataout_timeout(struct timer_list *t); extern void iscsit_mod_dataout_timer(struct iscsi_cmd *); extern void iscsit_start_dataout_timer(struct iscsi_cmd *, struct iscsi_conn *); extern void iscsit_stop_dataout_timer(struct iscsi_cmd *); diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 8be9221b6b9d..64c5a57b92e4 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -333,8 +333,8 @@ static int iscsi_login_zero_tsih_s1( spin_lock_init(&sess->session_usage_lock); spin_lock_init(&sess->ttt_lock); - setup_timer(&sess->time2retain_timer, iscsit_handle_time2retain_timeout, - (unsigned long)sess); + timer_setup(&sess->time2retain_timer, + iscsit_handle_time2retain_timeout, 0); idr_preload(GFP_KERNEL); spin_lock_bh(&sess_idr_lock); @@ -842,9 +842,9 @@ void iscsi_post_login_handler( iscsit_dec_conn_usage_count(conn); } -void iscsi_handle_login_thread_timeout(unsigned long data) +void iscsi_handle_login_thread_timeout(struct timer_list *t) { - struct iscsi_np *np = (struct iscsi_np *) data; + struct iscsi_np *np = from_timer(np, t, np_login_timer); spin_lock_bh(&np->np_thread_lock); pr_err("iSCSI Login timeout on Network Portal %pISpc\n", @@ -1265,10 +1265,9 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) pr_debug("Moving to TARG_CONN_STATE_FREE.\n"); conn->conn_state = TARG_CONN_STATE_FREE; - setup_timer(&conn->nopin_response_timer, - iscsit_handle_nopin_response_timeout, (unsigned long)conn); - setup_timer(&conn->nopin_timer, iscsit_handle_nopin_timeout, - (unsigned long)conn); + timer_setup(&conn->nopin_response_timer, + iscsit_handle_nopin_response_timeout, 0); + timer_setup(&conn->nopin_timer, iscsit_handle_nopin_timeout, 0); if (iscsit_conn_set_transport(conn, np->np_transport) < 0) { kfree(conn); diff --git a/drivers/target/iscsi/iscsi_target_login.h b/drivers/target/iscsi/iscsi_target_login.h index f77850bf3e58..f3c4e7bf5763 100644 --- a/drivers/target/iscsi/iscsi_target_login.h +++ b/drivers/target/iscsi/iscsi_target_login.h @@ -24,6 +24,6 @@ extern void iscsi_post_login_handler(struct iscsi_np *, struct iscsi_conn *, u8) extern void iscsi_target_login_sess_out(struct iscsi_conn *, struct iscsi_np *, bool, bool); extern int iscsi_target_login_thread(void *); -extern void iscsi_handle_login_thread_timeout(unsigned long data); +extern void iscsi_handle_login_thread_timeout(struct timer_list *t); #endif /*** ISCSI_TARGET_LOGIN_H ***/ diff --git 
a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c index c851bd9f3175..b686e2ce9c0e 100644 --- a/drivers/target/iscsi/iscsi_target_nego.c +++ b/drivers/target/iscsi/iscsi_target_nego.c @@ -559,9 +559,15 @@ static void iscsi_target_login_drop(struct iscsi_conn *conn, struct iscsi_login iscsi_target_login_sess_out(conn, np, zero_tsih, true); } -static void iscsi_target_login_timeout(unsigned long data) +struct conn_timeout { + struct timer_list timer; + struct iscsi_conn *conn; +}; + +static void iscsi_target_login_timeout(struct timer_list *t) { - struct iscsi_conn *conn = (struct iscsi_conn *)data; + struct conn_timeout *timeout = from_timer(timeout, t, timer); + struct iscsi_conn *conn = timeout->conn; pr_debug("Entering iscsi_target_login_timeout >>>>>>>>>>>>>>>>>>>\n"); @@ -580,7 +586,7 @@ static void iscsi_target_do_login_rx(struct work_struct *work) struct iscsi_np *np = login->np; struct iscsi_portal_group *tpg = conn->tpg; struct iscsi_tpg_np *tpg_np = conn->tpg_np; - struct timer_list login_timer; + struct conn_timeout timeout; int rc, zero_tsih = login->zero_tsih; bool state; @@ -618,13 +624,14 @@ static void iscsi_target_do_login_rx(struct work_struct *work) conn->login_kworker = current; allow_signal(SIGINT); - setup_timer_on_stack(&login_timer, iscsi_target_login_timeout, - (unsigned long)conn); - mod_timer(&login_timer, jiffies + TA_LOGIN_TIMEOUT * HZ); - pr_debug("Starting login_timer for %s/%d\n", current->comm, current->pid); + timeout.conn = conn; + timer_setup_on_stack(&timeout.timer, iscsi_target_login_timeout, 0); + mod_timer(&timeout.timer, jiffies + TA_LOGIN_TIMEOUT * HZ); + pr_debug("Starting login timer for %s/%d\n", current->comm, current->pid); rc = conn->conn_transport->iscsit_get_login_rx(conn, login); - del_timer_sync(&login_timer); + del_timer_sync(&timeout.timer); + destroy_timer_on_stack(&timeout.timer); flush_signals(current); conn->login_kworker = NULL; diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 505dad5dab7f..54f20f184dd6 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -176,8 +176,7 @@ struct iscsi_cmd *iscsit_allocate_cmd(struct iscsi_conn *conn, int state) spin_lock_init(&cmd->istate_lock); spin_lock_init(&cmd->error_lock); spin_lock_init(&cmd->r2t_lock); - setup_timer(&cmd->dataout_timer, iscsit_handle_dataout_timeout, - (unsigned long)cmd); + timer_setup(&cmd->dataout_timer, iscsit_handle_dataout_timeout, 0); return cmd; } @@ -882,9 +881,9 @@ static int iscsit_add_nopin(struct iscsi_conn *conn, int want_response) return 0; } -void iscsit_handle_nopin_response_timeout(unsigned long data) +void iscsit_handle_nopin_response_timeout(struct timer_list *t) { - struct iscsi_conn *conn = (struct iscsi_conn *) data; + struct iscsi_conn *conn = from_timer(conn, t, nopin_response_timer); iscsit_inc_conn_usage_count(conn); @@ -978,9 +977,9 @@ void iscsit_stop_nopin_response_timer(struct iscsi_conn *conn) spin_unlock_bh(&conn->nopin_timer_lock); } -void iscsit_handle_nopin_timeout(unsigned long data) +void iscsit_handle_nopin_timeout(struct timer_list *t) { - struct iscsi_conn *conn = (struct iscsi_conn *) data; + struct iscsi_conn *conn = from_timer(conn, t, nopin_timer); iscsit_inc_conn_usage_count(conn); diff --git a/drivers/target/iscsi/iscsi_target_util.h b/drivers/target/iscsi/iscsi_target_util.h index 78b3b9d20963..5a5f27877f49 100644 --- a/drivers/target/iscsi/iscsi_target_util.h +++ 
b/drivers/target/iscsi/iscsi_target_util.h @@ -47,11 +47,11 @@ extern struct iscsi_conn *iscsit_get_conn_from_cid_rcfr(struct iscsi_session *, extern void iscsit_check_conn_usage_count(struct iscsi_conn *); extern void iscsit_dec_conn_usage_count(struct iscsi_conn *); extern void iscsit_inc_conn_usage_count(struct iscsi_conn *); -extern void iscsit_handle_nopin_response_timeout(unsigned long data); +extern void iscsit_handle_nopin_response_timeout(struct timer_list *t); extern void iscsit_mod_nopin_response_timer(struct iscsi_conn *); extern void iscsit_start_nopin_response_timer(struct iscsi_conn *); extern void iscsit_stop_nopin_response_timer(struct iscsi_conn *); -extern void iscsit_handle_nopin_timeout(unsigned long data); +extern void iscsit_handle_nopin_timeout(struct timer_list *t); extern void __iscsit_start_nopin_timer(struct iscsi_conn *); extern void iscsit_start_nopin_timer(struct iscsi_conn *); extern void iscsit_stop_nopin_timer(struct iscsi_conn *); From 254db5bd075427b0f71ea112b30c924fb22bd0fd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 21 Oct 2017 08:37:11 -0700 Subject: [PATCH 0790/1085] RAS/CEC: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Christophe JAILLET Cc: Nicolas Iooss Cc: Ingo Molnar Signed-off-by: Kees Cook Reviewed-by: Borislav Petkov --- drivers/ras/cec.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index d0e5d6ee882c..4c586d731c48 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -168,11 +168,9 @@ static void cec_mod_timer(struct timer_list *t, unsigned long interval) mod_timer(t, round_jiffies(iv)); } -static void cec_timer_fn(unsigned long data) +static void cec_timer_fn(struct timer_list *unused) { - struct ce_array *ca = (struct ce_array *)data; - - do_spring_cleaning(ca); + do_spring_cleaning(&ce_arr); cec_mod_timer(&cec_timer, timer_interval); } @@ -509,7 +507,7 @@ void __init cec_init(void) if (create_debugfs_nodes()) return; - setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr); + timer_setup(&cec_timer, cec_timer_fn, 0); cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL); pr_info("Correctable Errors collector initialized.\n"); From 7d221856add016ef6cc0f4c0cad8e5d2612feedb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 17:57:22 -0700 Subject: [PATCH 0791/1085] usb: usbtest: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Also adds missing call to destroy_timer_on_stack(); Cc: Greg Kroah-Hartman Cc: Felipe Balbi Cc: Alan Stern Cc: "Gustavo A. R. 
Silva" Cc: linux-usb@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Greg Kroah-Hartman --- drivers/usb/misc/usbtest.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c index eee82ca55b7b..b1ad69edf839 100644 --- a/drivers/usb/misc/usbtest.c +++ b/drivers/usb/misc/usbtest.c @@ -575,11 +575,16 @@ alloc_sglist(int nents, int max, int vary, struct usbtest_dev *dev, int pipe) return sg; } -static void sg_timeout(unsigned long _req) -{ - struct usb_sg_request *req = (struct usb_sg_request *) _req; +struct sg_timeout { + struct timer_list timer; + struct usb_sg_request *req; +}; - usb_sg_cancel(req); +static void sg_timeout(struct timer_list *t) +{ + struct sg_timeout *timeout = from_timer(timeout, t, timer); + + usb_sg_cancel(timeout->req); } static int perform_sglist( @@ -593,9 +598,11 @@ static int perform_sglist( { struct usb_device *udev = testdev_to_usbdev(tdev); int retval = 0; - struct timer_list sg_timer; + struct sg_timeout timeout = { + .req = req, + }; - setup_timer_on_stack(&sg_timer, sg_timeout, (unsigned long) req); + timer_setup_on_stack(&timeout.timer, sg_timeout, 0); while (retval == 0 && iterations-- > 0) { retval = usb_sg_init(req, udev, pipe, @@ -606,13 +613,14 @@ static int perform_sglist( if (retval) break; - mod_timer(&sg_timer, jiffies + + mod_timer(&timeout.timer, jiffies + msecs_to_jiffies(SIMPLE_IO_TIMEOUT)); usb_sg_wait(req); - if (!del_timer_sync(&sg_timer)) + if (!del_timer_sync(&timeout.timer)) retval = -ETIMEDOUT; else retval = req->status; + destroy_timer_on_stack(&timeout.timer); /* FIXME check resulting data pattern */ From 856ec53fcab37f52b184b0b2e3757702005455ff Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 17:02:56 -0700 Subject: [PATCH 0792/1085] drm: gma500: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
Cc: Patrik Jakobsson Cc: David Airlie Cc: dri-devel@lists.freedesktop.org Signed-off-by: Kees Cook Acked-by: Daniel Vetter --- drivers/gpu/drm/gma500/psb_lid.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/gma500/psb_lid.c b/drivers/gpu/drm/gma500/psb_lid.c index 1d2ebb5e530f..be6dda58fcae 100644 --- a/drivers/gpu/drm/gma500/psb_lid.c +++ b/drivers/gpu/drm/gma500/psb_lid.c @@ -23,9 +23,9 @@ #include "psb_intel_reg.h" #include -static void psb_lid_timer_func(unsigned long data) +static void psb_lid_timer_func(struct timer_list *t) { - struct drm_psb_private * dev_priv = (struct drm_psb_private *)data; + struct drm_psb_private *dev_priv = from_timer(dev_priv, t, lid_timer); struct drm_device *dev = (struct drm_device *)dev_priv->dev; struct timer_list *lid_timer = &dev_priv->lid_timer; unsigned long irq_flags; @@ -77,10 +77,8 @@ void psb_lid_timer_init(struct drm_psb_private *dev_priv) spin_lock_init(&dev_priv->lid_lock); spin_lock_irqsave(&dev_priv->lid_lock, irq_flags); - init_timer(lid_timer); + timer_setup(lid_timer, psb_lid_timer_func, 0); - lid_timer->data = (unsigned long)dev_priv; - lid_timer->function = psb_lid_timer_func; lid_timer->expires = jiffies + PSB_LID_DELAY; add_timer(lid_timer); From 1067f030994c69ca1fba8c607437c8895dcf8509 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:28 -0700 Subject: [PATCH 0793/1085] x86/mm: Relocate page fault error codes to traps.h Up to this point, only fault.c used the definitions of the page fault error codes. Thus, it made sense to keep them within such file. Other portions of code might be interested in those definitions too. For instance, the User- Mode Instruction Prevention emulation code will use such definitions to emulate a page fault when it is unable to successfully copy the results of the emulated instructions to user space. While relocating the error code enumeration, the prefix X86_ is used to make it consistent with the rest of the definitions in traps.h. Of course, code using the enumeration had to be updated as well. No functional changes were performed. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Andy Lutomirski Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Paul Gortmaker Cc: Huang Rui Cc: Shuah Khan Cc: Jonathan Corbet Cc: Jiri Slaby Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Josh Poimboeuf Cc: Chen Yucong Cc: Vlastimil Babka Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: "Kirill A. 
Shutemov" Link: https://lkml.kernel.org/r/1509135945-13762-2-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/traps.h | 18 ++++++++ arch/x86/mm/fault.c | 88 ++++++++++++++---------------------- 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 5545f6459bf5..da3c3a3674a5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -144,4 +144,22 @@ enum { X86_TRAP_IRET = 32, /* 32, IRET Exception */ }; +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access + */ +enum x86_pf_error_code { + X86_PF_PROT = 1 << 0, + X86_PF_WRITE = 1 << 1, + X86_PF_USER = 1 << 2, + X86_PF_RSVD = 1 << 3, + X86_PF_INSTR = 1 << 4, + X86_PF_PK = 1 << 5, +}; #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e2baeaa053a5..db71c73530bd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -28,26 +28,6 @@ #define CREATE_TRACE_POINTS #include -/* - * Page fault error code bits: - * - * bit 0 == 0: no page found 1: protection fault - * bit 1 == 0: read access 1: write access - * bit 2 == 0: kernel-mode access 1: user-mode access - * bit 3 == 1: use of reserved bit detected - * bit 4 == 1: fault was an instruction fetch - * bit 5 == 1: protection keys block access - */ -enum x86_pf_error_code { - - PF_PROT = 1 << 0, - PF_WRITE = 1 << 1, - PF_USER = 1 << 2, - PF_RSVD = 1 << 3, - PF_INSTR = 1 << 4, - PF_PK = 1 << 5, -}; - /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: @@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ - if (error_code & PF_INSTR) + if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); @@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * siginfo so userspace can discover which protection key was set * on the PTE. * - * If we get here, we know that the hardware signaled a PF_PK + * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. @@ -204,7 +184,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) /* * force_sig_info_fault() is called from a number of * contexts, some of which have a VMA and some of which - * do not. The PF_PK handing happens after we have a + * do not. The X86_PF_PK handing happens after we have a * valid VMA, so we should never reach this without a * valid VMA. 
*/ @@ -697,7 +677,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (!oops_may_print()) return; - if (error_code & PF_INSTR) { + if (error_code & X86_PF_INSTR) { unsigned int level; pgd_t *pgd; pte_t *pte; @@ -779,7 +759,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, */ if (current->thread.sig_on_uaccess_err && signal) { tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code | PF_USER; + tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; /* XXX: hwpoison faults will set the wrong code. */ @@ -897,7 +877,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { + if (error_code & X86_PF_USER) { /* * It's possible to have interrupts off here: */ @@ -918,7 +898,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * Instruction fetch faults in the vsyscall page might need * emulation. */ - if (unlikely((error_code & PF_INSTR) && + if (unlikely((error_code & X86_PF_INSTR) && ((address & ~0xfff) == VSYSCALL_ADDR))) { if (emulate_vsyscall(regs, address)) return; @@ -931,7 +911,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * are always protection faults. */ if (address >= TASK_SIZE_MAX) - error_code |= PF_PROT; + error_code |= X86_PF_PROT; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -992,11 +972,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return false; - if (error_code & PF_PK) + if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), - (error_code & PF_INSTR), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) return true; return false; } @@ -1024,7 +1004,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, int code = BUS_ADRERR; /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) { + if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -1052,14 +1032,14 @@ static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 *pkey, unsigned int fault) { - if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { no_context(regs, error_code, address, 0, 0); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) { + if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); return; @@ -1084,16 +1064,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, static int spurious_fault_check(unsigned long error_code, pte_t *pte) { - if ((error_code & PF_WRITE) && !pte_write(*pte)) + if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) + if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; /* * Note: We do not do lazy flushing on protection key - * changes, so no spurious fault will ever set PF_PK. + * changes, so no spurious fault will ever set X86_PF_PK. 
*/ - if ((error_code & PF_PK)) + if ((error_code & X86_PF_PK)) return 1; return 1; @@ -1139,8 +1119,8 @@ spurious_fault(unsigned long error_code, unsigned long address) * change, so user accesses are not expected to cause spurious * faults. */ - if (error_code != (PF_WRITE | PF_PROT) - && error_code != (PF_INSTR | PF_PROT)) + if (error_code != (X86_PF_WRITE | X86_PF_PROT) && + error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); @@ -1200,19 +1180,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ - if (error_code & PF_PK) + if (error_code & X86_PF_PK) return 1; /* * Make sure to check the VMA so that we do not perform - * faults just to hit a PF_PK as soon as we fill in a + * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), - (error_code & PF_INSTR), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) return 1; - if (error_code & PF_WRITE) { + if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -1220,7 +1200,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) } /* read, present: */ - if (unlikely(error_code & PF_PROT)) + if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ @@ -1243,7 +1223,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) if (!static_cpu_has(X86_FEATURE_SMAP)) return false; - if (error_code & PF_USER) + if (error_code & X86_PF_USER) return false; if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) @@ -1296,7 +1276,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * protection error (error_code & 9) == 0. 
*/ if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; @@ -1324,7 +1304,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (unlikely(kprobes_fault(regs))) return; - if (unlikely(error_code & PF_RSVD)) + if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); if (unlikely(smap_violation(error_code, regs))) { @@ -1350,7 +1330,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, */ if (user_mode(regs)) { local_irq_enable(); - error_code |= PF_USER; + error_code |= X86_PF_USER; flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1359,9 +1339,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (error_code & PF_WRITE) + if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (error_code & PF_INSTR) + if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* @@ -1381,7 +1361,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if ((error_code & PF_USER) == 0 && + if (!(error_code & X86_PF_USER) && !search_exception_tables(regs->ip)) { bad_area_nosemaphore(regs, error_code, address, NULL); return; @@ -1408,7 +1388,7 @@ retry: bad_area(regs, error_code, address); return; } - if (error_code & PF_USER) { + if (error_code & X86_PF_USER) { /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter From b0ce5b8c95c83a7b98c679b117e3d6ae6f97154b Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:29 -0700 Subject: [PATCH 0794/1085] x86/boot: Relocate definition of the initial state of CR0 Both head_32.S and head_64.S utilize the same value to initialize the control register CR0. Also, other parts of the kernel might want to access this initial definition (e.g., emulation code for User-Mode Instruction Prevention uses this state to provide a sane dummy value for CR0 when emulating the smsw instruction). Thus, relocate this definition to a header file from which it can be conveniently accessed. Suggested-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Andy Lutomirski Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: linux-mm@kvack.org Cc: Paul Gortmaker Cc: Huang Rui Cc: Shuah Khan Cc: linux-arch@vger.kernel.org Cc: Jonathan Corbet Cc: Jiri Slaby Cc: "Ravi V. 
Shankar" Cc: Denys Vlasenko Cc: Chris Metcalf Cc: Brian Gerst Cc: Josh Poimboeuf Cc: Chen Yucong Cc: Vlastimil Babka Cc: Dave Hansen Cc: Andy Lutomirski Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Linus Torvalds Link: https://lkml.kernel.org/r/1509135945-13762-3-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/uapi/asm/processor-flags.h | 3 +++ arch/x86/kernel/head_32.S | 3 --- arch/x86/kernel/head_64.S | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 185f3d10c194..39946d0a1d41 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -151,5 +151,8 @@ #define CX86_ARR_BASE 0xc4 #define CX86_RCR_BASE 0xdc +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9ed3074d0d27..c3cfc655f551 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -211,9 +211,6 @@ ENTRY(startup_32_smp) #endif .Ldefault_entry: -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $(CR0_STATE & ~X86_CR0_PG),%eax movl %eax,%cr0 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 513cbb012ecc..5e1bfdd86b5b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -149,9 +149,6 @@ ENTRY(secondary_startup_64) 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 From ed40a10431701d683bfd59f7ca01a8c97408cf67 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:31 -0700 Subject: [PATCH 0795/1085] uprobes/x86: Use existing definitions for segment override prefixes Rather than using hard-coded values of the segment override prefixes, leverage the existing definitions provided in inat.h. Suggested-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Srikar Dronamraju Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Paul Gortmaker Cc: Huang Rui Cc: Shuah Khan Cc: Jonathan Corbet Cc: Jiri Slaby Cc: "Ravi V. 
Shankar" Cc: Denys Vlasenko Cc: Chris Metcalf Cc: Brian Gerst Cc: Andy Lutomirski Cc: Chen Yucong Cc: Vlastimil Babka Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Link: https://lkml.kernel.org/r/1509135945-13762-5-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/kernel/uprobes.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 495c776de4b4..a3755d293a48 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -271,12 +271,15 @@ static bool is_prefix_bad(struct insn *insn) int i; for (i = 0; i < insn->prefixes.nbytes; i++) { - switch (insn->prefixes.bytes[i]) { - case 0x26: /* INAT_PFX_ES */ - case 0x2E: /* INAT_PFX_CS */ - case 0x36: /* INAT_PFX_DS */ - case 0x3E: /* INAT_PFX_SS */ - case 0xF0: /* INAT_PFX_LOCK */ + insn_attr_t attr; + + attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + switch (attr) { + case INAT_MAKE_PREFIX(INAT_PFX_ES): + case INAT_MAKE_PREFIX(INAT_PFX_CS): + case INAT_MAKE_PREFIX(INAT_PFX_DS): + case INAT_MAKE_PREFIX(INAT_PFX_SS): + case INAT_MAKE_PREFIX(INAT_PFX_LOCK): return true; } } From b15d70df6e685912be8bbcb7557d277d48aa942c Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:32 -0700 Subject: [PATCH 0796/1085] x86/mpx: Simplify handling of errors when computing linear addresses When errors occur in the computation of the linear address, -1L is returned. Rather than having a separate return path for errors, the variable used to return the computed linear address can be initialized with the error value. Hence, only one return path is needed. This makes the function easier to read. While here, ensure that the error value is -1L, a 64-bit value, rather than -1, a 32-bit value. Suggested-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Adan Hawthorn Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Nathan Howard Cc: "Ravi V. 
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Joe Perches Cc: Paolo Bonzini Cc: Andrew Morton Link: https://lkml.kernel.org/r/1509135945-13762-6-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/mm/mpx.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 9ceaa955d2ba..f4c48a01c906 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -138,7 +138,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, */ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) { - unsigned long addr, base, indx; + unsigned long addr = -1L, base, indx; int addr_offset, base_offset, indx_offset; insn_byte_t sib; @@ -149,17 +149,17 @@ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) if (X86_MODRM_MOD(insn->modrm.value) == 3) { addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); if (addr_offset < 0) - goto out_err; + goto out; addr = regs_get_register(regs, addr_offset); } else { if (insn->sib.nbytes) { base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); if (base_offset < 0) - goto out_err; + goto out; indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); if (indx_offset < 0) - goto out_err; + goto out; base = regs_get_register(regs, base_offset); indx = regs_get_register(regs, indx_offset); @@ -167,14 +167,13 @@ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) } else { addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); if (addr_offset < 0) - goto out_err; + goto out; addr = regs_get_register(regs, addr_offset); } addr += insn->displacement.value; } +out: return (void __user *)addr; -out_err: - return (void __user *)-1; } static int mpx_insn_decode(struct insn *insn, From e27c310af5c05cf876d9cad006928076c27f54d4 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:30 -0700 Subject: [PATCH 0797/1085] ptrace,x86: Make user_64bit_mode() available to 32-bit builds In its current form, user_64bit_mode() can only be used when CONFIG_X86_64 is selected. This implies that code built with CONFIG_X86_64=n cannot use it. If a piece of code needs to be built for both CONFIG_X86_64=y and CONFIG_X86_64=n and wants to use this function, it needs to wrap it in an #ifdef/#endif; potentially, in multiple places. This can be easily avoided with a single #ifdef/#endif pair within user_64bit_mode() itself. Suggested-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. 
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-4-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/ptrace.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 91c04c8e67fa..e2afbf689309 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -135,9 +135,9 @@ static inline int v8086_mode(struct pt_regs *regs) #endif } -#ifdef CONFIG_X86_64 static inline bool user_64bit_mode(struct pt_regs *regs) { +#ifdef CONFIG_X86_64 #ifndef CONFIG_PARAVIRT /* * On non-paravirt systems, this is the only long mode CPL 3 @@ -148,8 +148,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs) /* Headers are too twisted for this to go in paravirt.h. */ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif +#else /* !CONFIG_X86_64 */ + return false; +#endif } +#ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp #endif From b8d2eff3b1c6e46238a5fb3f56843e9974b4889f Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:33 -0700 Subject: [PATCH 0798/1085] x86/mpx: Use signed variables to compute effective addresses Even though memory addresses are unsigned, the operands used to compute the effective address do have a sign. This is true for ModRM.rm, SIB.base, SIB.index as well as the displacement bytes. Thus, signed variables shall be used when computing the effective address from these operands. Once the signed effective address has been computed, it is casted to an unsigned long to determine the linear address. Variables are renamed to better reflect the type of address being computed. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Adan Hawthorn Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Nathan Howard Cc: "Ravi V. 
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Joe Perches Cc: Paolo Bonzini Cc: Andrew Morton Link: https://lkml.kernel.org/r/1509135945-13762-7-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/mm/mpx.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index f4c48a01c906..57e5bf58ea34 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -138,8 +138,9 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, */ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) { - unsigned long addr = -1L, base, indx; int addr_offset, base_offset, indx_offset; + unsigned long linear_addr = -1L; + long eff_addr, base, indx; insn_byte_t sib; insn_get_modrm(insn); @@ -150,7 +151,8 @@ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); if (addr_offset < 0) goto out; - addr = regs_get_register(regs, addr_offset); + + eff_addr = regs_get_register(regs, addr_offset); } else { if (insn->sib.nbytes) { base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); @@ -163,17 +165,23 @@ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) base = regs_get_register(regs, base_offset); indx = regs_get_register(regs, indx_offset); - addr = base + indx * (1 << X86_SIB_SCALE(sib)); + + eff_addr = base + indx * (1 << X86_SIB_SCALE(sib)); } else { addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); if (addr_offset < 0) goto out; - addr = regs_get_register(regs, addr_offset); + + eff_addr = regs_get_register(regs, addr_offset); } - addr += insn->displacement.value; + + eff_addr += insn->displacement.value; } + + linear_addr = (unsigned long)eff_addr; + out: - return (void __user *)addr; + return (void __user *)linear_addr; } static int mpx_insn_decode(struct insn *insn, From ff9d78025c519046cfbc212b34f09116685402fc Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:34 -0700 Subject: [PATCH 0799/1085] x86/mpx: Do not use SIB.index if its value is 100b and ModRM.mod is not 11b Section 2.2.1.2 of the Intel 64 and IA-32 Architectures Software Developer's Manual volume 2A states that when ModRM.mod !=11b and ModRM.rm = 100b indexed register-indirect addressing is used. In other words, a SIB byte follows the ModRM byte. In the specific case of SIB.index = 100b, the scale*index portion of the computation of the effective address is null. To signal callers of this particular situation, get_reg_offset() can return -EDOM (-EINVAL continues to indicate that an error when decoding the SIB byte). An example of this situation can be the following instruction: 8b 4c 23 80 mov -0x80(%rbx,%riz,1),%rcx ModRM: 0x4c [mod:1b][reg:1b][rm:100b] SIB: 0x23 [scale:0b][index:100b][base:11b] Displacement: 0x80 (1-byte, as per ModRM.mod = 1b) The %riz 'register' indicates a null index. In long mode, a REX prefix may be used. When a REX prefix is present, REX.X adds a fourth bit to the register selection of SIB.index. This gives the ability to refer to all the 16 general purpose registers. When REX.X is 1b and SIB.index is 100b, the index is indicated in %r12. 
In our example, this would look like: 42 8b 4c 23 80 mov -0x80(%rbx,%r12,1),%rcx REX: 0x42 [W:0b][R:0b][X:1b][B:0b] ModRM: 0x4c [mod:1b][reg:1b][rm:100b] SIB: 0x23 [scale:0b][.X: 1b, index:100b][.B:0b, base:11b] Displacement: 0x80 (1-byte, as per ModRM.mod = 1b) %r12 is a valid register to use in the scale*index part of the effective address computation. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Adan Hawthorn Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Nathan Howard Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Joe Perches Cc: Paolo Bonzini Cc: Andrew Morton Link: https://lkml.kernel.org/r/1509135945-13762-8-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/mm/mpx.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 57e5bf58ea34..2ad1d4a42fcc 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -110,6 +110,15 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, regno = X86_SIB_INDEX(insn->sib.value); if (X86_REX_X(insn->rex_prefix.value)) regno += 8; + + /* + * If ModRM.mod != 3 and SIB.index = 4 the scale*index + * portion of the address computation is null. This is + * true only if REX.X is 0. In such a case, the SIB index + * is used in the address computation. + */ + if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4) + return -EDOM; break; case REG_TYPE_BASE: @@ -160,11 +169,19 @@ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) goto out; indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); - if (indx_offset < 0) + /* + * A negative offset generally means a error, except + * -EDOM, which means that the contents of the register + * should not be used as index. + */ + if (indx_offset == -EDOM) + indx = 0; + else if (indx_offset < 0) goto out; + else + indx = regs_get_register(regs, indx_offset); base = regs_get_register(regs, base_offset); - indx = regs_get_register(regs, indx_offset); eff_addr = base + indx * (1 << X86_SIB_SCALE(sib)); } else { From 4578f06fc93fb73c9c644ed838f4cdabbfdc4df1 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:35 -0700 Subject: [PATCH 0800/1085] x86/mpx: Do not use SIB.base if its value is 101b and ModRM.mod = 0 Section 2.2.1.2 of the Intel 64 and IA-32 Architectures Software Developer's Manual volume 2A states that if a SIB byte is used and SIB.base is 101b and ModRM.mod is zero, then the base part of the effective address computation is null. To signal this situation, a -EDOM error is returned to tell callers to ignore the base value present in the register operand. In this scenario, a 32-bit displacement follows the SIB byte. Displacement is obtained when the instruction decoder parses the operands. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Adan Hawthorn Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Nathan Howard Cc: "Ravi V.
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Joe Perches Cc: Paolo Bonzini Cc: Andrew Morton Link: https://lkml.kernel.org/r/1509135945-13762-9-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/mm/mpx.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 2ad1d4a42fcc..581a960a4f09 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -123,6 +123,14 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, case REG_TYPE_BASE: regno = X86_SIB_BASE(insn->sib.value); + /* + * If ModRM.mod is 0 and SIB.base == 5, the base of the + * register-indirect addressing is 0. In this case, a + * 32-bit displacement follows the SIB byte. + */ + if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5) + return -EDOM; + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; @@ -164,16 +172,22 @@ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) eff_addr = regs_get_register(regs, addr_offset); } else { if (insn->sib.nbytes) { + /* + * Negative values in the base and index offset means + * an error when decoding the SIB byte. Except -EDOM, + * which means that the registers should not be used + * in the address computation. + */ base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); - if (base_offset < 0) + if (base_offset == -EDOM) + base = 0; + else if (base_offset < 0) goto out; + else + base = regs_get_register(regs, base_offset); indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); - /* - * A negative offset generally means a error, except - * -EDOM, which means that the contents of the register - * should not be used as index. - */ + if (indx_offset == -EDOM) indx = 0; else if (indx_offset < 0) @@ -181,8 +195,6 @@ static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) else indx = regs_get_register(regs, indx_offset); - base = regs_get_register(regs, base_offset); - eff_addr = base + indx * (1 << X86_SIB_SCALE(sib)); } else { addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); From 32542ee295bec38e5e1608f8c9d6d28e5a7e6112 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:36 -0700 Subject: [PATCH 0801/1085] x86/mpx, x86/insn: Relocate insn util functions to a new insn-eval file Other kernel submodules can benefit from using the utility functions defined in mpx.c to obtain the addresses and values of operands contained in the general purpose registers. An instance of this is the emulation code used for instructions protected by the Intel User-Mode Instruction Prevention feature. Thus, these functions are relocated to a new insn-eval.c file. The reason to not relocate these utilities into insn.c is that the latter solely analyses instructions given by a struct insn without any knowledge of the meaning of the values of instruction operands. This new utility insn- eval.c aims to be used to resolve userspace linear addresses based on the contents of the instruction operands as well as the contents of pt_regs structure. These utilities come with a separate header. This is to avoid taking insn.c out of sync from the instructions decoders under tools/obj and tools/perf. This also avoids adding cumbersome #ifdef's for the #include'd files required to decode instructions in a kernel context. Functions are simply relocated. There are not functional or indentation changes. 
Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-10-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/insn-eval.h | 16 +++ arch/x86/lib/Makefile | 2 +- arch/x86/lib/insn-eval.c | 163 +++++++++++++++++++++++++++++++ arch/x86/mm/mpx.c | 156 +---------------------------- 4 files changed, 182 insertions(+), 155 deletions(-) create mode 100644 arch/x86/include/asm/insn-eval.h create mode 100644 arch/x86/lib/insn-eval.c diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h new file mode 100644 index 000000000000..5cab1b1da84d --- /dev/null +++ b/arch/x86/include/asm/insn-eval.h @@ -0,0 +1,16 @@ +#ifndef _ASM_X86_INSN_EVAL_H +#define _ASM_X86_INSN_EVAL_H +/* + * A collection of utility functions for x86 instruction analysis to be + * used in a kernel context. Useful when, for instance, making sense + * of the registers indicated by operands. + */ + +#include +#include +#include +#include + +void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); + +#endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 34a74131a12c..675d7b075fed 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -23,7 +23,7 @@ lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o -lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o +lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c new file mode 100644 index 000000000000..df9418c43356 --- /dev/null +++ b/arch/x86/lib/insn-eval.c @@ -0,0 +1,163 @@ +/* + * Utility functions for x86 operand and address decoding + * + * Copyright (C) Intel Corporation 2017 + */ +#include +#include +#include +#include +#include + +enum reg_type { + REG_TYPE_RM = 0, + REG_TYPE_INDEX, + REG_TYPE_BASE, +}; + +static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) +{ + int regno = 0; + + static const int regoff[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif + }; + int nr_registers = ARRAY_SIZE(regoff); + /* + * Don't possibly decode a 32-bit instructions as + * reading a 64-bit-only register. 
+ */ + if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) + nr_registers -= 8; + + switch (type) { + case REG_TYPE_RM: + regno = X86_MODRM_RM(insn->modrm.value); + if (X86_REX_B(insn->rex_prefix.value)) + regno += 8; + break; + + case REG_TYPE_INDEX: + regno = X86_SIB_INDEX(insn->sib.value); + if (X86_REX_X(insn->rex_prefix.value)) + regno += 8; + + /* + * If ModRM.mod != 3 and SIB.index = 4 the scale*index + * portion of the address computation is null. This is + * true only if REX.X is 0. In such a case, the SIB index + * is used in the address computation. + */ + if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4) + return -EDOM; + break; + + case REG_TYPE_BASE: + regno = X86_SIB_BASE(insn->sib.value); + /* + * If ModRM.mod is 0 and SIB.base == 5, the base of the + * register-indirect addressing is 0. In this case, a + * 32-bit displacement follows the SIB byte. + */ + if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5) + return -EDOM; + + if (X86_REX_B(insn->rex_prefix.value)) + regno += 8; + break; + + default: + pr_err("invalid register type"); + BUG(); + break; + } + + if (regno >= nr_registers) { + WARN_ONCE(1, "decoded an instruction with an invalid register"); + return -EINVAL; + } + return regoff[regno]; +} + +/* + * return the address being referenced be instruction + * for rm=3 returning the content of the rm reg + * for rm!=3 calculates the address using SIB and Disp + */ +void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) +{ + int addr_offset, base_offset, indx_offset; + unsigned long linear_addr = -1L; + long eff_addr, base, indx; + insn_byte_t sib; + + insn_get_modrm(insn); + insn_get_sib(insn); + sib = insn->sib.value; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out; + + eff_addr = regs_get_register(regs, addr_offset); + } else { + if (insn->sib.nbytes) { + /* + * Negative values in the base and index offset means + * an error when decoding the SIB byte. Except -EDOM, + * which means that the registers should not be used + * in the address computation. 
+ */ + base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); + if (base_offset == -EDOM) + base = 0; + else if (base_offset < 0) + goto out; + else + base = regs_get_register(regs, base_offset); + + indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); + + if (indx_offset == -EDOM) + indx = 0; + else if (indx_offset < 0) + goto out; + else + indx = regs_get_register(regs, indx_offset); + + eff_addr = base + indx * (1 << X86_SIB_SCALE(sib)); + } else { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out; + + eff_addr = regs_get_register(regs, addr_offset); + } + + eff_addr += insn->displacement.value; + } + + linear_addr = (unsigned long)eff_addr; + +out: + return (void __user *)linear_addr; +} diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 581a960a4f09..28782059ad2d 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -60,159 +61,6 @@ static unsigned long mpx_mmap(unsigned long len) return addr; } -enum reg_type { - REG_TYPE_RM = 0, - REG_TYPE_INDEX, - REG_TYPE_BASE, -}; - -static int get_reg_offset(struct insn *insn, struct pt_regs *regs, - enum reg_type type) -{ - int regno = 0; - - static const int regoff[] = { - offsetof(struct pt_regs, ax), - offsetof(struct pt_regs, cx), - offsetof(struct pt_regs, dx), - offsetof(struct pt_regs, bx), - offsetof(struct pt_regs, sp), - offsetof(struct pt_regs, bp), - offsetof(struct pt_regs, si), - offsetof(struct pt_regs, di), -#ifdef CONFIG_X86_64 - offsetof(struct pt_regs, r8), - offsetof(struct pt_regs, r9), - offsetof(struct pt_regs, r10), - offsetof(struct pt_regs, r11), - offsetof(struct pt_regs, r12), - offsetof(struct pt_regs, r13), - offsetof(struct pt_regs, r14), - offsetof(struct pt_regs, r15), -#endif - }; - int nr_registers = ARRAY_SIZE(regoff); - /* - * Don't possibly decode a 32-bit instructions as - * reading a 64-bit-only register. - */ - if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) - nr_registers -= 8; - - switch (type) { - case REG_TYPE_RM: - regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value)) - regno += 8; - break; - - case REG_TYPE_INDEX: - regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value)) - regno += 8; - - /* - * If ModRM.mod != 3 and SIB.index = 4 the scale*index - * portion of the address computation is null. This is - * true only if REX.X is 0. In such a case, the SIB index - * is used in the address computation. - */ - if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4) - return -EDOM; - break; - - case REG_TYPE_BASE: - regno = X86_SIB_BASE(insn->sib.value); - /* - * If ModRM.mod is 0 and SIB.base == 5, the base of the - * register-indirect addressing is 0. In this case, a - * 32-bit displacement follows the SIB byte. 
- */ - if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5) - return -EDOM; - - if (X86_REX_B(insn->rex_prefix.value)) - regno += 8; - break; - - default: - pr_err("invalid register type"); - BUG(); - break; - } - - if (regno >= nr_registers) { - WARN_ONCE(1, "decoded an instruction with an invalid register"); - return -EINVAL; - } - return regoff[regno]; -} - -/* - * return the address being referenced be instruction - * for rm=3 returning the content of the rm reg - * for rm!=3 calculates the address using SIB and Disp - */ -static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) -{ - int addr_offset, base_offset, indx_offset; - unsigned long linear_addr = -1L; - long eff_addr, base, indx; - insn_byte_t sib; - - insn_get_modrm(insn); - insn_get_sib(insn); - sib = insn->sib.value; - - if (X86_MODRM_MOD(insn->modrm.value) == 3) { - addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - if (addr_offset < 0) - goto out; - - eff_addr = regs_get_register(regs, addr_offset); - } else { - if (insn->sib.nbytes) { - /* - * Negative values in the base and index offset means - * an error when decoding the SIB byte. Except -EDOM, - * which means that the registers should not be used - * in the address computation. - */ - base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); - if (base_offset == -EDOM) - base = 0; - else if (base_offset < 0) - goto out; - else - base = regs_get_register(regs, base_offset); - - indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); - - if (indx_offset == -EDOM) - indx = 0; - else if (indx_offset < 0) - goto out; - else - indx = regs_get_register(regs, indx_offset); - - eff_addr = base + indx * (1 << X86_SIB_SCALE(sib)); - } else { - addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - if (addr_offset < 0) - goto out; - - eff_addr = regs_get_register(regs, addr_offset); - } - - eff_addr += insn->displacement.value; - } - - linear_addr = (unsigned long)eff_addr; - -out: - return (void __user *)linear_addr; -} - static int mpx_insn_decode(struct insn *insn, struct pt_regs *regs) { @@ -325,7 +173,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) info->si_signo = SIGSEGV; info->si_errno = 0; info->si_code = SEGV_BNDERR; - info->si_addr = mpx_get_addr_ref(&insn, regs); + info->si_addr = insn_get_addr_ref(&insn, regs); /* * We were not able to extract an address from the instruction, * probably because there was something invalid in it. From ed594e4ba5bfe268d63d7cee3c1a827e3dd5056f Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:37 -0700 Subject: [PATCH 0802/1085] x86/insn-eval: Do not BUG on invalid register type We are not in a critical failure path. The invalid register type is caused when trying to decode invalid instruction bytes from a user-space program. Thus, simply print an error message. To prevent this warning from being abused by user space programs, use the rate-limited variant of pr_err(), along with a descriptive prefix. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V.
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-11-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/lib/insn-eval.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index df9418c43356..4931d9252dde 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -5,10 +5,14 @@ */ #include #include +#include #include #include #include +#undef pr_fmt +#define pr_fmt(fmt) "insn: " fmt + enum reg_type { REG_TYPE_RM = 0, REG_TYPE_INDEX, @@ -85,9 +89,8 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, break; default: - pr_err("invalid register type"); - BUG(); - break; + pr_err_ratelimited("invalid register type: %d\n", type); + return -EINVAL; } if (regno >= nr_registers) { From e5e45f11110191740ecb365fa8c7a25814ce8ac8 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:38 -0700 Subject: [PATCH 0803/1085] x86/insn-eval: Add a utility function to get register offsets The function get_reg_offset() returns the offset to the register the argument specifies as indicated in an enumeration of type offset. Callers of this function would need the definition of such enumeration. This is not needed. Instead, add helper functions for this purpose. These functions are useful in cases when, for instance, the caller needs to decide whether the operand is a register or a memory location by looking at the rm part of the ModRM byte. As of now, this is the only helper function that is needed. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. 
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-12-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/insn-eval.h | 1 + arch/x86/lib/insn-eval.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 5cab1b1da84d..7e8c9633a377 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -12,5 +12,6 @@ #include void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); +int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4931d9252dde..405ffeb1c382 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -100,6 +100,23 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, return regoff[regno]; } +/** + * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte + * @insn: Instruction containing the ModRM byte + * @regs: Register values as seen when entering kernel mode + * + * Returns: + * + * The register indicated by the r/m part of the ModRM byte. The + * register is obtained as an offset from the base of pt_regs. In specific + * cases, the returned value can be -EDOM to indicate that the particular value + * of ModRM does not refer to a register and shall be ignored. + */ +int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) +{ + return get_reg_offset(insn, regs, REG_TYPE_RM); +} + /* * return the address being referenced be instruction * for rm=3 returning the content of the rm reg From 536b815388f7f4d2a7cd1418939902fb037ea370 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:39 -0700 Subject: [PATCH 0804/1085] x86/insn-eval: Add utility function to identify string instructions String instructions are special because, in protected mode, the linear address is always obtained via the ES segment register in operands that use the (E)DI register; the DS segment register in operands that use the (E)SI register. Furthermore, segment override prefixes are ignored when calculating a linear address involving the (E)DI register; segment override prefixes can be used when calculating linear addresses involving the (E)SI register. It follows that linear addresses are calculated differently for the case of string instructions. The purpose of this utility function is to identify such instructions for callers to determine a linear address correctly. Note that this function only identifies string instructions; it does not determine what segment register to use in the address computation. That is left to callers. A subsequent commmit introduces a function to determine the segment register to use given the instruction, operands and segment override prefixes. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. 
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-13-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/lib/insn-eval.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 405ffeb1c382..ac7b87c228b9 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -19,6 +19,34 @@ enum reg_type { REG_TYPE_BASE, }; +/** + * is_string_insn() - Determine if instruction is a string instruction + * @insn: Instruction containing the opcode to inspect + * + * Returns: + * + * true if the instruction, determined by the opcode, is any of the + * string instructions as defined in the Intel Software Development manual. + * False otherwise. + */ +static bool is_string_insn(struct insn *insn) +{ + insn_get_opcode(insn); + + /* All string instructions have a 1-byte opcode. */ + if (insn->opcode.nbytes != 1) + return false; + + switch (insn->opcode.bytes[0]) { + case 0x6c ... 0x6f: /* INS, OUTS */ + case 0xa4 ... 0xa7: /* MOVS, CMPS */ + case 0xaa ... 0xaf: /* STOS, LODS, SCAS */ + return true; + default: + return false; + } +} + static int get_reg_offset(struct insn *insn, struct pt_regs *regs, enum reg_type type) { From 32d0b95300db03c2b23b2ea2c94769a4a138e79d Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:40 -0700 Subject: [PATCH 0805/1085] x86/insn-eval: Add utility functions to get segment selector When computing a linear address and segmentation is used, we need to know the base address of the segment involved in the computation. In most of the cases, the segment base address will be zero as in USER_DS/USER32_DS. However, it may be possible that a user space program defines its own segments via a local descriptor table. In such a case, the segment base address may not be zero. Thus, the segment base address is needed to calculate correctly the linear address. If running in protected mode, the segment selector to be used when computing a linear address is determined by either any of segment override prefixes in the instruction or inferred from the registers involved in the computation of the effective address; in that order. Also, there are cases when the segment override prefixes shall be ignored (i.e., code segments are always selected by the CS segment register; string instructions always use the ES segment register when using rDI register as operand). In long mode, segment registers are ignored, except for FS and GS. In these two cases, base addresses are obtained from the respective MSRs. For clarity, this process can be split into four steps (and an equal number of functions): determine if segment prefixes overrides can be used; parse the segment override prefixes, and use them if found; if not found or cannot be used, use the default segment registers associated with the operand registers. Once the segment register to use has been identified, read its value to obtain the segment selector. The method to obtain the segment selector depends on several factors. In 32-bit builds, segment selectors are saved into a pt_regs structure when switching to kernel mode. The same is also true for virtual-8086 mode. In 64-bit builds, segmentation is mostly ignored, except when running a program in 32-bit legacy mode. 
In this case, CS and SS can be obtained from pt_regs. DS, ES, FS and GS can be read directly from the respective segment registers. In order to identify the segment registers, a new set of #defines is introduced. It also includes two special identifiers. One of them indicates when the default segment register associated with instruction operands shall be used. Another one indicates that the contents of the segment register shall be ignored; this identifier is used when in long mode. Improvements-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-14-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/inat.h | 10 ++ arch/x86/lib/insn-eval.c | 340 ++++++++++++++++++++++++++++++++++++ 2 files changed, 350 insertions(+) diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 02aff0867211..1c78580e58be 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -97,6 +97,16 @@ #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) +/* Identifiers for segment registers */ +#define INAT_SEG_REG_IGNORE 0 +#define INAT_SEG_REG_DEFAULT 1 +#define INAT_SEG_REG_CS 2 +#define INAT_SEG_REG_SS 3 +#define INAT_SEG_REG_DS 4 +#define INAT_SEG_REG_ES 5 +#define INAT_SEG_REG_FS 6 +#define INAT_SEG_REG_GS 7 + /* Attribute search APIs */ extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); extern int inat_get_last_prefix_id(insn_byte_t last_pfx); diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index ac7b87c228b9..6a902b155f5d 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -9,6 +9,7 @@ #include #include #include +#include #undef pr_fmt #define pr_fmt(fmt) "insn: " fmt @@ -47,6 +48,345 @@ static bool is_string_insn(struct insn *insn) } } +/** + * get_seg_reg_override_idx() - obtain segment register override index + * @insn: Valid instruction with segment override prefixes + * + * Inspect the instruction prefixes in @insn and find segment overrides, if any. + * + * Returns: + * + * A constant identifying the segment register to use, among CS, SS, DS, + * ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override + * prefixes were found. + * + * -EINVAL in case of error. + */ +static int get_seg_reg_override_idx(struct insn *insn) +{ + int idx = INAT_SEG_REG_DEFAULT; + int num_overrides = 0, i; + + insn_get_prefixes(insn); + + /* Look for any segment override prefixes. 
*/ + for (i = 0; i < insn->prefixes.nbytes; i++) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + switch (attr) { + case INAT_MAKE_PREFIX(INAT_PFX_CS): + idx = INAT_SEG_REG_CS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_SS): + idx = INAT_SEG_REG_SS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_DS): + idx = INAT_SEG_REG_DS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_ES): + idx = INAT_SEG_REG_ES; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_FS): + idx = INAT_SEG_REG_FS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_GS): + idx = INAT_SEG_REG_GS; + num_overrides++; + break; + /* No default action needed. */ + } + } + + /* More than one segment override prefix leads to undefined behavior. */ + if (num_overrides > 1) + return -EINVAL; + + return idx; +} + +/** + * check_seg_overrides() - check if segment override prefixes are allowed + * @insn: Valid instruction with segment override prefixes + * @regoff: Operand offset, in pt_regs, for which the check is performed + * + * For a particular register used in register-indirect addressing, determine if + * segment override prefixes can be used. Specifically, no overrides are allowed + * for rDI if used with a string instruction. + * + * Returns: + * + * True if segment override prefixes can be used with the register indicated + * in @regoff. False if otherwise. + */ +static bool check_seg_overrides(struct insn *insn, int regoff) +{ + if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn)) + return false; + + return true; +} + +/** + * resolve_default_seg() - resolve default segment register index for an operand + * @insn: Instruction with opcode and address size. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @off: Operand offset, in pt_regs, for which resolution is needed + * + * Resolve the default segment register index associated with the instruction + * operand register indicated by @off. Such index is resolved based on defaults + * described in the Intel Software Development Manual. + * + * Returns: + * + * If in protected mode, a constant identifying the segment register to use, + * among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE. + * + * -EINVAL in case of error. + */ +static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) +{ + if (user_64bit_mode(regs)) + return INAT_SEG_REG_IGNORE; + /* + * Resolve the default segment register as described in Section 3.7.4 + * of the Intel Software Development Manual Vol. 1: + * + * + DS for all references involving r[ABCD]X, and rSI. + * + If used in a string instruction, ES for rDI. Otherwise, DS. + * + AX, CX and DX are not valid register operands in 16-bit address + * encodings but are valid for 32-bit and 64-bit encodings. + * + -EDOM is reserved to identify for cases in which no register + * is used (i.e., displacement-only addressing). Use DS. + * + SS for rSP or rBP. + * + CS for rIP. + */ + + switch (off) { + case offsetof(struct pt_regs, ax): + case offsetof(struct pt_regs, cx): + case offsetof(struct pt_regs, dx): + /* Need insn to verify address size. 
*/ + if (insn->addr_bytes == 2) + return -EINVAL; + + case -EDOM: + case offsetof(struct pt_regs, bx): + case offsetof(struct pt_regs, si): + return INAT_SEG_REG_DS; + + case offsetof(struct pt_regs, di): + if (is_string_insn(insn)) + return INAT_SEG_REG_ES; + return INAT_SEG_REG_DS; + + case offsetof(struct pt_regs, bp): + case offsetof(struct pt_regs, sp): + return INAT_SEG_REG_SS; + + case offsetof(struct pt_regs, ip): + return INAT_SEG_REG_CS; + + default: + return -EINVAL; + } +} + +/** + * resolve_seg_reg() - obtain segment register index + * @insn: Instruction with operands + * @regs: Register values as seen when entering kernel mode + * @regoff: Operand offset, in pt_regs, used to deterimine segment register + * + * Determine the segment register associated with the operands and, if + * applicable, prefixes and the instruction pointed by @insn. + * + * The segment register associated to an operand used in register-indirect + * addressing depends on: + * + * a) Whether running in long mode (in such a case segments are ignored, except + * if FS or GS are used). + * + * b) Whether segment override prefixes can be used. Certain instructions and + * registers do not allow override prefixes. + * + * c) Whether segment overrides prefixes are found in the instruction prefixes. + * + * d) If there are not segment override prefixes or they cannot be used, the + * default segment register associated with the operand register is used. + * + * The function checks first if segment override prefixes can be used with the + * operand indicated by @regoff. If allowed, obtain such overridden segment + * register index. Lastly, if not prefixes were found or cannot be used, resolve + * the segment register index to use based on the defaults described in the + * Intel documentation. In long mode, all segment register indexes will be + * ignored, except if overrides were found for FS or GS. All these operations + * are done using helper functions. + * + * The operand register, @regoff, is represented as the offset from the base of + * pt_regs. + * + * As stated, the main use of this function is to determine the segment register + * index based on the instruction, its operands and prefixes. Hence, @insn + * must be valid. However, if @regoff indicates rIP, we don't need to inspect + * @insn at all as in this case CS is used in all cases. This case is checked + * before proceeding further. + * + * Please note that this function does not return the value in the segment + * register (i.e., the segment selector) but our defined index. The segment + * selector needs to be obtained using get_segment_selector() and passing the + * segment register index resolved by this function. + * + * Returns: + * + * An index identifying the segment register to use, among CS, SS, DS, + * ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode. + * + * -EINVAL in case of error. + */ +static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff) +{ + int idx; + + /* + * In the unlikely event of having to resolve the segment register + * index for rIP, do it first. Segment override prefixes should not + * be used. Hence, it is not necessary to inspect the instruction, + * which may be invalid at this point. 
+ */ + if (regoff == offsetof(struct pt_regs, ip)) { + if (user_64bit_mode(regs)) + return INAT_SEG_REG_IGNORE; + else + return INAT_SEG_REG_CS; + } + + if (!insn) + return -EINVAL; + + if (!check_seg_overrides(insn, regoff)) + return resolve_default_seg(insn, regs, regoff); + + idx = get_seg_reg_override_idx(insn); + if (idx < 0) + return idx; + + if (idx == INAT_SEG_REG_DEFAULT) + return resolve_default_seg(insn, regs, regoff); + + /* + * In long mode, segment override prefixes are ignored, except for + * overrides for FS and GS. + */ + if (user_64bit_mode(regs)) { + if (idx != INAT_SEG_REG_FS && + idx != INAT_SEG_REG_GS) + idx = INAT_SEG_REG_IGNORE; + } + + return idx; +} + +/** + * get_segment_selector() - obtain segment selector + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Segment register index to use + * + * Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment + * registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or + * kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained + * from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU + * registers. This done for only for completeness as in CONFIG_X86_64 segment + * registers are ignored. + * + * Returns: + * + * Value of the segment selector, including null when running in + * long mode. + * + * -EINVAL on error. + */ +static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + switch (seg_reg_idx) { + case INAT_SEG_REG_IGNORE: + return 0; + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + savesegment(ds, sel); + return sel; + case INAT_SEG_REG_ES: + savesegment(es, sel); + return sel; + case INAT_SEG_REG_FS: + savesegment(fs, sel); + return sel; + case INAT_SEG_REG_GS: + savesegment(gs, sel); + return sel; + default: + return -EINVAL; + } +#else /* CONFIG_X86_32 */ + struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs; + + if (v8086_mode(regs)) { + switch (seg_reg_idx) { + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + return vm86regs->ds; + case INAT_SEG_REG_ES: + return vm86regs->es; + case INAT_SEG_REG_FS: + return vm86regs->fs; + case INAT_SEG_REG_GS: + return vm86regs->gs; + case INAT_SEG_REG_IGNORE: + /* fall through */ + default: + return -EINVAL; + } + } + + switch (seg_reg_idx) { + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + return (unsigned short)(regs->ds & 0xffff); + case INAT_SEG_REG_ES: + return (unsigned short)(regs->es & 0xffff); + case INAT_SEG_REG_FS: + return (unsigned short)(regs->fs & 0xffff); + case INAT_SEG_REG_GS: + /* + * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS. + * The macro below takes care of both cases. 
+ */ + return get_user_gs(regs); + case INAT_SEG_REG_IGNORE: + /* fall through */ + default: + return -EINVAL; + } +#endif /* CONFIG_X86_64 */ +} + static int get_reg_offset(struct insn *insn, struct pt_regs *regs, enum reg_type type) { From 670f928ba09b06712da34a3c44be6c8fa561fb19 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:41 -0700 Subject: [PATCH 0806/1085] x86/insn-eval: Add utility function to get segment descriptor The segment descriptor contains information that is relevant to how linear addresses need to be computed. It contains the default size of addresses as well as the base address of the segment. Thus, given a segment selector, we ought to look at segment descriptor to correctly calculate the linear address. In protected mode, the segment selector might indicate a segment descriptor from either the global descriptor table or a local descriptor table. Both cases are considered in this function. This function is a prerequisite for functions in subsequent commits that will obtain the aforementioned attributes of the segment descriptor. Improvements-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-15-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/lib/insn-eval.c | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 6a902b155f5d..d85e84091ddc 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -6,9 +6,13 @@ #include #include #include +#include +#include +#include #include #include #include +#include #include #undef pr_fmt @@ -468,6 +472,59 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, return regoff[regno]; } +/** + * get_desc() - Obtain pointer to a segment descriptor + * @sel: Segment selector + * + * Given a segment selector, obtain a pointer to the segment descriptor. + * Both global and local descriptor tables are supported. + * + * Returns: + * + * Pointer to segment descriptor on success. + * + * NULL on error. + */ +static struct desc_struct *get_desc(unsigned short sel) +{ + struct desc_ptr gdt_desc = {0, 0}; + unsigned long desc_base; + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) { + struct desc_struct *desc = NULL; + struct ldt_struct *ldt; + + /* Bits [15:3] contain the index of the desired entry. */ + sel >>= 3; + + mutex_lock(¤t->active_mm->context.lock); + ldt = current->active_mm->context.ldt; + if (ldt && sel < ldt->nr_entries) + desc = &ldt->entries[sel]; + + mutex_unlock(¤t->active_mm->context.lock); + + return desc; + } +#endif + native_store_gdt(&gdt_desc); + + /* + * Segment descriptors have a size of 8 bytes. Thus, the index is + * multiplied by 8 to obtain the memory offset of the desired descriptor + * from the base of the GDT. 
As bits [15:3] of the segment selector + * contain the index, it can be regarded as multiplied by 8 already. + * All that remains is to clear bits [2:0]. + */ + desc_base = sel & ~(SEGMENT_RPL_MASK | SEGMENT_TI_MASK); + + if (desc_base > gdt_desc.size) + return NULL; + + return (struct desc_struct *)(gdt_desc.address + desc_base); +} + /** * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte * @insn: Instruction containing the ModRM byte From bd5a410a5de3a6893eaacc749e706b85506dc908 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:42 -0700 Subject: [PATCH 0807/1085] x86/insn-eval: Add utility functions to get segment descriptor base address and limit With segmentation, the base address of the segment is needed to compute a linear address. This base address is obtained from the applicable segment descriptor. Such a segment descriptor is referenced by a segment selector. These new functions obtain the segment base and limit of the segment selector indicated by the segment register index given as an argument. This index is any of the INAT_SEG_REG_* family of #define's. The logic to obtain the segment selector is wrapped in the function get_segment_selector() with the inputs described above. Once the selector is known, the base address is determined. In protected mode, the selector is used to obtain the segment descriptor and then its base address. In long mode, the segment base address is zero except when FS or GS are used. In virtual-8086 mode, the base address is computed as the value of the segment selector shifted 4 positions to the left. In protected mode, segment limits are enforced. Thus, a function to determine the limit of the segment is added. Segment limits are not enforced in long or virtual-8086 mode. For the latter, addresses are limited to 20 bits; address size will be handled when computing the linear address. Improvements-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. 
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-16-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/insn-eval.h | 1 + arch/x86/lib/insn-eval.c | 114 +++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 7e8c9633a377..25d6e44dcbfc 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -13,5 +13,6 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); +unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index d85e84091ddc..89d5c8902f42 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -525,6 +525,120 @@ static struct desc_struct *get_desc(unsigned short sel) return (struct desc_struct *)(gdt_desc.address + desc_base); } +/** + * insn_get_seg_base() - Obtain base address of segment descriptor. + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Index of the segment register pointing to seg descriptor + * + * Obtain the base address of the segment as indicated by the segment descriptor + * pointed by the segment selector. The segment selector is obtained from the + * input segment register index @seg_reg_idx. + * + * Returns: + * + * In protected mode, base address of the segment. Zero in long mode, + * except when FS or GS are used. In virtual-8086 mode, the segment + * selector shifted 4 bits to the right. + * + * -1L in case of error. + */ +unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + struct desc_struct *desc; + short sel; + + sel = get_segment_selector(regs, seg_reg_idx); + if (sel < 0) + return -1L; + + if (v8086_mode(regs)) + /* + * Base is simply the segment selector shifted 4 + * bits to the right. + */ + return (unsigned long)(sel << 4); + + if (user_64bit_mode(regs)) { + /* + * Only FS or GS will have a base address, the rest of + * the segments' bases are forced to 0. + */ + unsigned long base; + + if (seg_reg_idx == INAT_SEG_REG_FS) + rdmsrl(MSR_FS_BASE, base); + else if (seg_reg_idx == INAT_SEG_REG_GS) + /* + * swapgs was called at the kernel entry point. Thus, + * MSR_KERNEL_GS_BASE will have the user-space GS base. + */ + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = 0; + return base; + } + + /* In protected mode the segment selector cannot be null. */ + if (!sel) + return -1L; + + desc = get_desc(sel); + if (!desc) + return -1L; + + return get_desc_base(desc); +} + +/** + * get_seg_limit() - Obtain the limit of a segment descriptor + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Index of the segment register pointing to seg descriptor + * + * Obtain the limit of the segment as indicated by the segment descriptor + * pointed by the segment selector. The segment selector is obtained from the + * input segment register index @seg_reg_idx. + * + * Returns: + * + * In protected mode, the limit of the segment descriptor in bytes. + * In long mode and virtual-8086 mode, segment limits are not enforced. 
Thus, + * limit is returned as -1L to imply a limit-less segment. + * + * Zero is returned on error. + */ +static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx) +{ + struct desc_struct *desc; + unsigned long limit; + short sel; + + sel = get_segment_selector(regs, seg_reg_idx); + if (sel < 0) + return 0; + + if (user_64bit_mode(regs) || v8086_mode(regs)) + return -1L; + + if (!sel) + return 0; + + desc = get_desc(sel); + if (!desc) + return 0; + + /* + * If the granularity bit is set, the limit is given in multiples + * of 4096. This also means that the 12 least significant bits are + * not tested when checking the segment limits. In practice, + * this means that the segment ends in (limit << 12) + 0xfff. + */ + limit = get_desc_limit(desc); + if (desc->g) + limit = (limit << 12) + 0xfff; + + return limit; +} + /** * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte * @insn: Instruction containing the ModRM byte From 4efea85fb56fa1691b79af1eea4c1425660cf4e3 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:43 -0700 Subject: [PATCH 0808/1085] x86/insn-eval: Add function to get default params of code segment Obtain the default values of the address and operand sizes as specified in the D and L bits of the the segment descriptor selected by the register CS. The function can be used for both protected and long modes. For virtual-8086 mode, the default address and operand sizes are always 2 bytes. The returned parameters are encoded in a signed 8-bit data type. Auxiliar macros are provided to encode and decode such values. Improvements-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. 
Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-17-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/insn-eval.h | 5 +++ arch/x86/lib/insn-eval.c | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 25d6e44dcbfc..e1d3b4ce8a92 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -11,8 +11,13 @@ #include #include +#define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf) +#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf) +#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4)) + void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); +char insn_get_code_seg_params(struct pt_regs *regs); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 89d5c8902f42..01e36bd86a5d 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -639,6 +639,70 @@ static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx) return limit; } +/** + * insn_get_code_seg_params() - Obtain code segment parameters + * @regs: Structure with register values as seen when entering kernel mode + * + * Obtain address and operand sizes of the code segment. It is obtained from the + * selector contained in the CS register in regs. In protected mode, the default + * address is determined by inspecting the L and D bits of the segment + * descriptor. In virtual-8086 mode, the default is always two bytes for both + * address and operand sizes. + * + * Returns: + * + * A signed 8-bit value containing the default parameters on success. + * + * -EINVAL on error. + */ +char insn_get_code_seg_params(struct pt_regs *regs) +{ + struct desc_struct *desc; + short sel; + + if (v8086_mode(regs)) + /* Address and operand size are both 16-bit. */ + return INSN_CODE_SEG_PARAMS(2, 2); + + sel = get_segment_selector(regs, INAT_SEG_REG_CS); + if (sel < 0) + return sel; + + desc = get_desc(sel); + if (!desc) + return -EINVAL; + + /* + * The most significant byte of the Type field of the segment descriptor + * determines whether a segment contains data or code. If this is a data + * segment, return error. + */ + if (!(desc->type & BIT(3))) + return -EINVAL; + + switch ((desc->l << 1) | desc->d) { + case 0: /* + * Legacy mode. CS.L=0, CS.D=0. Address and operand size are + * both 16-bit. + */ + return INSN_CODE_SEG_PARAMS(2, 2); + case 1: /* + * Legacy mode. CS.L=0, CS.D=1. Address and operand size are + * both 32-bit. + */ + return INSN_CODE_SEG_PARAMS(4, 4); + case 2: /* + * IA-32e 64-bit mode. CS.L=1, CS.D=0. Address size is 64-bit; + * operand size is 32-bit. + */ + return INSN_CODE_SEG_PARAMS(4, 8); + case 3: /* Invalid setting. 
CS.L=1, CS.D=1 */ + /* fall through */ + default: + return -EINVAL; + } +} + /** * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte * @insn: Instruction containing the ModRM byte From e526a302e425ab11111efc5f59e52449bbcc768e Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:44 -0700 Subject: [PATCH 0809/1085] x86/insn-eval: Indicate a 32-bit displacement if ModRM.mod is 0 and ModRM.rm is 101b Section 2.2.1.3 of the Intel 64 and IA-32 Architectures Software Developer's Manual volume 2A states that when ModRM.mod is zero and ModRM.rm is 101b, a 32-bit displacement follows the ModRM byte. This means that none of the registers are used in the computation of the effective address. A return value of -EDOM indicates to callers that they should not use the value of registers when computing the effective address for the instruction. In long mode, the effective address is given by the 32-bit displacement plus the location of the next instruction. In protected mode, only the displacement is used. The instruction decoder takes care of obtaining the displacement. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-18-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/lib/insn-eval.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 01e36bd86a5d..6bf819f923e7 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -427,6 +427,14 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, switch (type) { case REG_TYPE_RM: regno = X86_MODRM_RM(insn->modrm.value); + + /* + * ModRM.mod == 0 and ModRM.rm == 5 means a 32-bit displacement + * follows the ModRM byte. + */ + if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5) + return -EDOM; + + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; @@ -770,10 +778,21 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) eff_addr = base + indx * (1 << X86_SIB_SCALE(sib)); } else { addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - if (addr_offset < 0) + /* + * -EDOM means that we must ignore the address_offset. + * In such a case, in 64-bit mode the effective address is + * relative to the RIP of the following instruction. 
+ */ + if (addr_offset == -EDOM) { + if (user_64bit_mode(regs)) + eff_addr = (long)regs->ip + insn->length; + else + eff_addr = 0; + } else if (addr_offset < 0) { goto out; - - eff_addr = regs_get_register(regs, addr_offset); + } else { + eff_addr = regs_get_register(regs, addr_offset); + } } eff_addr += insn->displacement.value; From 108904442850c2884679f81121df3ef42d88cb9c Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:45 -0700 Subject: [PATCH 0810/1085] x86/insn-eval: Incorporate segment base in linear address computation insn_get_addr_ref() returns the effective address as defined in Section 3.7.5.1 of Vol. 1 of the Intel 64 and IA-32 Architectures Software Developer's Manual. In order to compute the linear address, we must add to the effective address the segment base address as set in the segment descriptor. The segment descriptor to use depends on the register used as operand and segment override prefixes, if any. In most cases, the segment base address will be 0 if the USER_DS/USER32_DS segment is used or if segmentation is not used. However, the base address is not necessarily zero if a user program defines its own segments. This is possible by using a local descriptor table. Since the effective address is a signed quantity, the unsigned segment base address is saved in a separate variable and added to the final, unsigned, effective address. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-19-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/lib/insn-eval.c | 55 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 6bf819f923e7..1c23ec03c568 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -728,6 +728,43 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) return get_reg_offset(insn, regs, REG_TYPE_RM); } +/** + * get_seg_base_addr() - obtain base address of a segment + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Operand offset, in pt_regs, used to resolve segment descriptor + * @base: Obtained segment base + * + * Obtain the base address of the segment associated with the operand @regoff + * and, if any or allowed, override prefixes in @insn. This function is + * different from insn_get_seg_base() as the latter does not resolve the segment + * associated with the instruction operand. + * + * Returns: + * + * 0 on success. @base will contain the base address of the resolved segment. + * + * -EINVAL on error. 
+ */ +static int get_seg_base_addr(struct insn *insn, struct pt_regs *regs, + int regoff, unsigned long *base) +{ + int seg_reg_idx; + + if (!base) + return -EINVAL; + + seg_reg_idx = resolve_seg_reg(insn, regs, regoff); + if (seg_reg_idx < 0) + return seg_reg_idx; + + *base = insn_get_seg_base(regs, seg_reg_idx); + if (*base == -1L) + return -EINVAL; + + return 0; +} + /* * return the address being referenced be instruction * for rm=3 returning the content of the rm reg @@ -735,8 +772,8 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) */ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) { - int addr_offset, base_offset, indx_offset; - unsigned long linear_addr = -1L; + int addr_offset, base_offset, indx_offset, ret; + unsigned long linear_addr = -1L, seg_base; long eff_addr, base, indx; insn_byte_t sib; @@ -750,6 +787,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) goto out; eff_addr = regs_get_register(regs, addr_offset); + } else { if (insn->sib.nbytes) { /* @@ -776,6 +814,13 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) indx = regs_get_register(regs, indx_offset); eff_addr = base + indx * (1 << X86_SIB_SCALE(sib)); + + /* + * The base determines the segment used to compute + * the linear address. + */ + addr_offset = base_offset; + } else { addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); /* @@ -798,7 +843,11 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) eff_addr += insn->displacement.value; } - linear_addr = (unsigned long)eff_addr; + ret = get_seg_base_addr(insn, regs, addr_offset, &seg_base); + if (ret) + goto out; + + linear_addr = (unsigned long)eff_addr + seg_base; out: return (void __user *)linear_addr; From d6d80cb57be45fc1a7d08c30526ab81ae9e7bc3d Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Thu, 28 Sep 2017 14:54:50 -0700 Subject: [PATCH 0811/1085] Smack: Base support for overlayfs Supply the Smack module hooks in support of overlayfs. Ensure that the Smack label of new files gets the correct value when a directory is transmuting. Original implementation by Romanini Daniele, with a few tweaks added. Signed-off-by: Romanini Daniele Signed-off-by: Casey Schaufler Signed-off-by: James Morris --- security/smack/smack_lsm.c | 79 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 319add31b4a4..569f28034116 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4605,6 +4605,82 @@ static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) return 0; } +static int smack_inode_copy_up(struct dentry *dentry, struct cred **new) +{ + + struct task_smack *tsp; + struct smack_known *skp; + struct inode_smack *isp; + struct cred *new_creds = *new; + + if (new_creds == NULL) { + new_creds = prepare_creds(); + if (new_creds == NULL) + return -ENOMEM; + } + + tsp = new_creds->security; + + /* + * Get label from overlay inode and set it in create_sid + */ + isp = d_inode(dentry->d_parent)->i_security; + skp = isp->smk_inode; + tsp->smk_task = skp; + *new = new_creds; + return 0; +} + +static int smack_inode_copy_up_xattr(const char *name) +{ + /* + * Return 1 if this is the smack access Smack attribute. 
+ */ + if (strcmp(name, XATTR_NAME_SMACK) == 0) + return 1; + + return -EOPNOTSUPP; +} + +static int smack_dentry_create_files_as(struct dentry *dentry, int mode, + struct qstr *name, + const struct cred *old, + struct cred *new) +{ + struct task_smack *otsp = old->security; + struct task_smack *ntsp = new->security; + struct inode_smack *isp; + int may; + + /* + * Use the process credential unless all of + * the transmuting criteria are met + */ + ntsp->smk_task = otsp->smk_task; + + /* + * the attribute of the containing directory + */ + isp = d_inode(dentry->d_parent)->i_security; + + if (isp->smk_flags & SMK_INODE_TRANSMUTE) { + rcu_read_lock(); + may = smk_access_entry(otsp->smk_task->smk_known, + isp->smk_inode->smk_known, + &otsp->smk_task->smk_rules); + rcu_read_unlock(); + + /* + * If the directory is transmuting and the rule + * providing access is transmuting use the containing + * directory label instead of the process label. + */ + if (may > 0 && (may & MAY_TRANSMUTE)) + ntsp->smk_task = isp->smk_inode; + } + return 0; +} + static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme), @@ -4740,6 +4816,9 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(inode_notifysecctx, smack_inode_notifysecctx), LSM_HOOK_INIT(inode_setsecctx, smack_inode_setsecctx), LSM_HOOK_INIT(inode_getsecctx, smack_inode_getsecctx), + LSM_HOOK_INIT(inode_copy_up, smack_inode_copy_up), + LSM_HOOK_INIT(inode_copy_up_xattr, smack_inode_copy_up_xattr), + LSM_HOOK_INIT(dentry_create_files_as, smack_dentry_create_files_as), }; From 3b42c17a7e264ce4b351d2de205d4c39f725af0f Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Tue, 31 Oct 2017 18:22:08 +0100 Subject: [PATCH 0812/1085] s390: pass endianness info to sparse s390 is big-endian only, but sparse assumes the same endianness as the build machine. This is problematic for code which expects __BYTE_ORDER__ to be correctly predefined by the compiler; sparse may then pre-process such code differently from what gcc would, depending on the build machine's endianness. Fix this by letting sparse know about the architecture's endianness. Signed-off-by: Luc Van Oostenryck Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index dac821cfcd43..64c2fe9dcfbe 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -21,7 +21,7 @@ KBUILD_CFLAGS += -m64 KBUILD_AFLAGS += -m64 UTS_MACHINE := s390x STACK_SIZE := 16384 -CHECKFLAGS += -D__s390__ -D__s390x__ +CHECKFLAGS += -D__s390__ -D__s390x__ -mbig-endian export LD_BFD From 71271269ef9a997fb4416b2f8ef3558dd846c7cb Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 16:51:38 -0700 Subject: [PATCH 0813/1085] x86/insn-eval: Extend get_seg_base_addr() to also obtain segment limit In protected mode, it is common to want to obtain the limit of a segment along with its base address. This is useful, for instance, to verify that an effective address lies within a segment before computing a linear address. Up to this point, this library only computes linear addresses in long mode. Subsequent patches will include support for protected mode. Support to verify the segment limit will be needed, as sketched below. 
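To make the intended use concrete, the base/limit check can be written in plain C roughly as follows. This is a user-space illustration only: struct seg_params and compute_linear_addr() are made-up names, not kernel interfaces, and the sketch assumes n_bytes >= 1.

#include <stdbool.h>

struct seg_params {
	unsigned long base;	/* segment base address */
	unsigned long limit;	/* highest valid byte offset in the segment */
};

/*
 * Verify that an access of n_bytes at eff_addr stays within the segment
 * before forming the linear address, as protected mode requires.
 */
static bool compute_linear_addr(const struct seg_params *seg,
				unsigned long eff_addr,
				unsigned long n_bytes,
				unsigned long *linear)
{
	/* Reject accesses that start or end past the segment limit. */
	if (eff_addr > seg->limit || n_bytes - 1 > seg->limit - eff_addr)
		return false;

	/* linear address = segment base + effective address */
	*linear = seg->base + eff_addr;
	return true;
}

The second comparison is written against seg->limit - eff_addr so that the check cannot wrap around under unsigned arithmetic.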
Signed-off-by: Ricardo Neri Cc: Adam Buchbinder Cc: Adrian Hunter Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Colin Ian King Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509148310-30862-2-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/insn-eval.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 1c23ec03c568..91f08aafb65e 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -729,25 +729,29 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) } /** - * get_seg_base_addr() - obtain base address of a segment + * get_seg_base_limit() - obtain base address and limit of a segment * @insn: Instruction. Must be valid. * @regs: Register values as seen when entering kernel mode * @regoff: Operand offset, in pt_regs, used to resolve segment descriptor * @base: Obtained segment base + * @limit: Obtained segment limit * - * Obtain the base address of the segment associated with the operand @regoff - * and, if any or allowed, override prefixes in @insn. This function is + * Obtain the base address and limit of the segment associated with the operand + * @regoff and, if any or allowed, override prefixes in @insn. This function is * different from insn_get_seg_base() as the latter does not resolve the segment - * associated with the instruction operand. + * associated with the instruction operand. If a limit is not needed (e.g., + * when running in long mode), @limit can be NULL. * * Returns: * - * 0 on success. @base will contain the base address of the resolved segment. + * 0 on success. @base and @limit will contain the base address and limit of + * the resolved segment, respectively. * * -EINVAL on error. */ -static int get_seg_base_addr(struct insn *insn, struct pt_regs *regs, - int regoff, unsigned long *base) +static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs, + int regoff, unsigned long *base, + unsigned long *limit) { int seg_reg_idx; @@ -762,6 +766,13 @@ static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs, if (*base == -1L) return -EINVAL; + if (!limit) + return 0; + + *limit = get_seg_limit(regs, seg_reg_idx); + if (!(*limit)) + return -EINVAL; + return 0; } @@ -843,7 +854,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) eff_addr += insn->displacement.value; } - ret = get_seg_base_addr(insn, regs, addr_offset, &seg_base); + ret = get_seg_base_limit(insn, regs, addr_offset, &seg_base, NULL); if (ret) goto out; From 9da78ba6b47b46428cfdfc0851511ab29c869798 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:58:58 -0700 Subject: [PATCH 0814/1085] x86/entry/64: Remove the restore_c_regs_and_iret label The only user was the 64-bit opportunistic SYSRET failure path, and that path didn't really need it. This change makes the opportunistic SYSRET code a bit more straightforward and gets rid of the label. 
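For readers unfamiliar with the opportunistic SYSRET logic referenced here, the frame checks can be modeled in C roughly as below. The struct and helper names are illustrative assumptions for a user-space sketch; the authoritative version is the assembly in entry_64.S.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative subset of the saved user-mode frame. */
struct fake_pt_regs {
	uint64_t ip, cs, flags, sp, ss;
	uint64_t rcx, r11;
};

#define FLAG_RF	(1ULL << 16)	/* resume flag */
#define FLAG_TF	(1ULL << 8)	/* trap flag */

/* Sign-extend bit 47 to model a canonical 48-bit virtual address. */
static bool is_canonical(uint64_t addr)
{
	return ((int64_t)(addr << 16) >> 16) == (int64_t)addr;
}

/*
 * SYSRET restores RIP from RCX and RFLAGS from R11, forces the user
 * CS/SS selectors, and faults on a non-canonical RCX. IRET must be
 * used whenever the saved frame does not match those semantics.
 */
static bool can_use_sysret(const struct fake_pt_regs *regs,
			   uint64_t user_cs, uint64_t user_ss)
{
	return regs->rcx == regs->ip &&
	       is_canonical(regs->ip) &&
	       regs->r11 == regs->flags &&
	       !(regs->r11 & (FLAG_RF | FLAG_TF)) &&
	       regs->cs == user_cs && regs->ss == user_ss;
}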
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/be3006a7ad3326e3458cf1cc55d416252cbe1986.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 846e84a1d1f7..e8ef83df46e6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -245,7 +245,6 @@ entry_SYSCALL64_slow_path: call do_syscall_64 /* returns with IRQs disabled */ return_from_SYSCALL_64: - RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ /* @@ -314,6 +313,7 @@ return_from_SYSCALL_64: */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ + RESTORE_EXTRA_REGS RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY @@ -321,7 +321,7 @@ syscall_return_via_sysret: opportunistic_sysret_failed: SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -638,7 +638,6 @@ retint_kernel: */ GLOBAL(restore_regs_and_iret) RESTORE_EXTRA_REGS -restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN From 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:58:59 -0700 Subject: [PATCH 0815/1085] x86/entry/64: Split the IRET-to-user and IRET-to-kernel paths These code paths will diverge soon. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/dccf8c7b3750199b4b30383c812d4e2931811509.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 34 +++++++++++++++++++++++--------- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/kernel/head_64.S | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e8ef83df46e6..3eeb1694210c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -321,7 +321,7 @@ syscall_return_via_sysret: opportunistic_sysret_failed: SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -423,7 +423,7 @@ ENTRY(ret_from_fork) call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode 1: /* kernel thread */ @@ -612,7 +612,20 @@ GLOBAL(retint_user) call prepare_exit_to_usermode TRACE_IRQS_IRETQ SWAPGS - jmp restore_regs_and_iret + +GLOBAL(restore_regs_and_return_to_usermode) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testl $3, CS(%rsp) + jnz 1f + ud2 +1: +#endif + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + INTERRUPT_RETURN + /* Returning to kernel space */ retint_kernel: @@ -632,11 +645,14 @@ retint_kernel: */ TRACE_IRQS_IRETQ -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -GLOBAL(restore_regs_and_iret) +GLOBAL(restore_regs_and_return_to_kernel) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. 
*/ + testl $3, CS(%rsp) + jz 1f + ud2 +1: +#endif RESTORE_EXTRA_REGS RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 @@ -1327,7 +1343,7 @@ ENTRY(nmi) * work, because we don't want to enable interrupts. */ SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e26c25ca7756..9ca014a99968 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -337,7 +337,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode END(entry_INT80_compat) ENTRY(stub32_clone) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 189bf42dfa2b..08f067faa264 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -326,7 +326,7 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_kernel END(early_idt_handler_common) __INITDATA From 8a055d7f411d41755ce30db5bb65b154777c4b78 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:00 -0700 Subject: [PATCH 0816/1085] x86/entry/64: Move SWAPGS into the common IRET-to-usermode path All of the code paths that ended up doing IRET to usermode did SWAPGS immediately beforehand. Move the SWAPGS into the common code. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/27fd6f45b7cd640de38fb9066fd0349bcd11f8e1.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 32 ++++++++++++++------------------ arch/x86/entry/entry_64_compat.S | 3 +-- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3eeb1694210c..d6ffdc9afcbb 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -249,12 +249,14 @@ return_from_SYSCALL_64: /* * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. + * a completely clean 64-bit userspace context. If we're not, + * go to the slow exit path. */ movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 - cmpq %rcx, %r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + + cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ + jne swapgs_restore_regs_and_return_to_usermode /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -272,14 +274,14 @@ return_from_SYSCALL_64: /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode movq R11(%rsp), %r11 cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot @@ -300,12 +302,12 @@ return_from_SYSCALL_64: * would never get past 'stuck_here'. */ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + jnz swapgs_restore_regs_and_return_to_usermode /* nothing to check for RSP */ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * We win! 
This label is here just for ease of understanding @@ -318,10 +320,6 @@ syscall_return_via_sysret: movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY USERGS_SYSRET64 - -opportunistic_sysret_failed: - SWAPGS - jmp restore_regs_and_return_to_usermode END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -422,8 +420,7 @@ ENTRY(ret_from_fork) movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ - SWAPGS - jmp restore_regs_and_return_to_usermode + jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ @@ -611,9 +608,8 @@ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ - SWAPGS -GLOBAL(restore_regs_and_return_to_usermode) +GLOBAL(swapgs_restore_regs_and_return_to_usermode) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ testl $3, CS(%rsp) @@ -621,6 +617,7 @@ GLOBAL(restore_regs_and_return_to_usermode) ud2 1: #endif + SWAPGS RESTORE_EXTRA_REGS RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 @@ -1342,8 +1339,7 @@ ENTRY(nmi) * Return back to user mode. We must *not* do the normal exit * work, because we don't want to enable interrupts. */ - SWAPGS - jmp restore_regs_and_return_to_usermode + jmp swapgs_restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 9ca014a99968..932b96ce1b06 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -336,8 +336,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON - SWAPGS - jmp restore_regs_and_return_to_usermode + jmp swapgs_restore_regs_and_return_to_usermode END(entry_INT80_compat) ENTRY(stub32_clone) From e872045bfd9c465a8555bab4b8567d56a4d2d3bb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:01 -0700 Subject: [PATCH 0817/1085] x86/entry/64: Simplify reg restore code in the standard IRET paths The old code restored all the registers with movq instead of pop. In theory, this was done because some CPUs have higher movq throughput, but any gain there would be tiny and is almost certainly outweighed by the higher text size. This saves 96 bytes of text. 
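The deduplication here mirrors a familiar C pattern: several exit paths funnel into one shared epilogue via a local label instead of each carrying its own copy of the cleanup code. A minimal illustrative sketch (plain C, not kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Two failure paths share one cleanup epilogue through a local label. */
static int process(const char *path)
{
	int ret = -1;
	char *buf = NULL;
	FILE *f = fopen(path, "r");

	if (!f)
		goto out;

	buf = malloc(4096);
	if (!buf)
		goto out;

	ret = fread(buf, 1, 4096, f) > 0 ? 0 : -1;
out:
	/* One copy of the exit code instead of one per error site. */
	free(buf);
	if (f)
		fclose(f);
	return ret;
}

As in the assembly above, merging the epilogues shrinks the text and leaves only one place to keep correct.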
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ad82520a207ccd851b04ba613f4f752b33ac05f7.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 21 +++++++++++++++++++++ arch/x86/entry/entry_64.S | 12 ++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 640aafebdc00..0b9dd8123701 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -151,6 +151,27 @@ For 32-bit we have the following conventions - kernel is built with UNWIND_HINT_REGS offset=\offset extra=0 .endm + .macro POP_EXTRA_REGS + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + .endm + + .macro POP_C_REGS + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + .endm + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 movq 6*8(%rsp), %r11 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d6ffdc9afcbb..925d56246071 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -618,9 +618,9 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) 1: #endif SWAPGS - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ INTERRUPT_RETURN @@ -650,9 +650,9 @@ GLOBAL(restore_regs_and_return_to_kernel) ud2 1: #endif - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ INTERRUPT_RETURN ENTRY(native_iret) From e53178328c9b96fbdbc719e78c93b5687ee007c3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:02 -0700 Subject: [PATCH 0818/1085] x86/entry/64: Shrink paranoid_exit_restore and make labels local paranoid_exit_restore was a copy of restore_regs_and_return_to_kernel. Merge them and make the paranoid_exit internal labels local. Keeping .Lparanoid_exit makes the code a bit shorter because it allows a 2-byte jnz instead of a 5-byte jnz. Saves 96 bytes of text. ( This is still a bit suboptimal in a non-CONFIG_TRACE_IRQFLAGS kernel, but fixing that would make the code rather messy. ) Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/510d66a1895cda9473c84b1086f0bb974f22de6a.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 925d56246071..155396443aaa 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1123,17 +1123,14 @@ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? 
*/ - jnz paranoid_exit_no_swapgs + jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore -paranoid_exit_no_swapgs: + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* From 4fbb39108f972437c44e5ffa781b56635d496826 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:03 -0700 Subject: [PATCH 0819/1085] x86/entry/64: Use pop instead of movq in syscall_return_via_sysret Saves 64 bytes. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6609b7f74ab31c36604ad746e019ea8495aec76c.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 155396443aaa..4f9b4463b3fc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -315,10 +315,18 @@ return_from_SYSCALL_64: */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - RESTORE_EXTRA_REGS - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY + POP_EXTRA_REGS + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rsi /* skip rcx */ + popq %rdx + popq %rsi + popq %rdi + movq RSP-ORIG_RAX(%rsp), %rsp USERGS_SYSRET64 END(entry_SYSCALL_64) From a512210643da8082cb44181dba8b18e752bd68f0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:04 -0700 Subject: [PATCH 0820/1085] x86/entry/64: Merge the fast and slow SYSRET paths They did almost the same thing. Remove a bunch of pointless instructions (mostly hidden in macros) and reduce cognitive load by merging them. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1204e20233fcab9130a1ba80b3b1879b5db3fc1f.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4f9b4463b3fc..b5a0ea63d391 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -220,10 +220,9 @@ entry_SYSCALL_64_fastpath: TRACE_IRQS_ON /* user mode is traced as IRQs on */ movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp + addq $6*8, %rsp /* skip extra regs -- they were preserved */ UNWIND_HINT_EMPTY - USERGS_SYSRET64 + jmp .Lpop_c_regs_except_rcx_r11_and_sysret 1: /* @@ -317,6 +316,7 @@ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ UNWIND_HINT_EMPTY POP_EXTRA_REGS +.Lpop_c_regs_except_rcx_r11_and_sysret: popq %rsi /* skip r11 */ popq %r10 popq %r9 From 471ee4832209e986029b9fabdaad57b1eecb856b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:05 -0700 Subject: [PATCH 0821/1085] x86/entry/64: Use POP instead of MOV to restore regs on NMI return This gets rid of the last user of the old RESTORE_..._REGS infrastructure. 
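The register-restore arithmetic throughout this series follows from the pt_regs stack layout: fifteen general-purpose slots, then orig_ax, then the five-word hardware frame. A small user-space sketch (the struct below is an illustrative stand-in, not the kernel's pt_regs) shows why, after the fifteen pops, one extra 8-byte adjustment lands the stack pointer on the "iret" frame:

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-in for the kernel's struct pt_regs layout. */
struct fake_pt_regs {
	unsigned long r15, r14, r13, r12, bp, bx;		/* "extra" regs */
	unsigned long r11, r10, r9, r8, ax, cx, dx, si, di;	/* "C" regs */
	unsigned long orig_ax;					/* syscall nr slot */
	unsigned long ip, cs, flags, sp, ss;			/* hardware frame */
};

int main(void)
{
	/* After 15 pops, the stack pointer sits at orig_ax (15 * 8 = 120)... */
	printf("orig_ax offset: %zu\n", offsetof(struct fake_pt_regs, orig_ax));
	/* ...so skipping one more 8-byte slot reaches the iret frame at 128. */
	printf("iret frame offset: %zu\n", offsetof(struct fake_pt_regs, ip));
	return 0;
}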
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/652a260f17a160789bc6a41d997f98249b73e2ab.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b5a0ea63d391..5b2f0bc661a0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1559,11 +1559,14 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS + POP_EXTRA_REGS + POP_C_REGS - /* Point RSP at the "iret" frame. */ - REMOVE_PT_GPREGS_FROM_STACK 6*8 + /* + * Skip orig_ax and the "outermost" frame to point RSP at the "iret" + * frame. + */ + addq $6*8, %rsp /* * Clear "NMI executing". Set DF first so that we can easily From c39858de696f0cc160a544455e8403d663d577e9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:06 -0700 Subject: [PATCH 0822/1085] x86/entry/64: Remove the RESTORE_..._REGS infrastructure All users of RESTORE_EXTRA_REGS, RESTORE_C_REGS and such, and REMOVE_PT_GPREGS_FROM_STACK are gone. Delete the macros. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c32672f6e47c561893316d48e06c7656b1039a36.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 52 ---------------------------------------- 1 file changed, 52 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 0b9dd8123701..1895a685d3dd 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -141,16 +141,6 @@ For 32-bit we have the following conventions - kernel is built with UNWIND_HINT_REGS offset=\offset .endm - .macro RESTORE_EXTRA_REGS offset=0 - movq 0*8+\offset(%rsp), %r15 - movq 1*8+\offset(%rsp), %r14 - movq 2*8+\offset(%rsp), %r13 - movq 3*8+\offset(%rsp), %r12 - movq 4*8+\offset(%rsp), %rbp - movq 5*8+\offset(%rsp), %rbx - UNWIND_HINT_REGS offset=\offset extra=0 - .endm - .macro POP_EXTRA_REGS popq %r15 popq %r14 @@ -172,48 +162,6 @@ For 32-bit we have the following conventions - kernel is built with popq %rdi .endm - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 - .if \rstor_r11 - movq 6*8(%rsp), %r11 - .endif - .if \rstor_r8910 - movq 7*8(%rsp), %r10 - movq 8*8(%rsp), %r9 - movq 9*8(%rsp), %r8 - .endif - .if \rstor_rax - movq 10*8(%rsp), %rax - .endif - .if \rstor_rcx - movq 11*8(%rsp), %rcx - .endif - .if \rstor_rdx - movq 12*8(%rsp), %rdx - .endif - movq 13*8(%rsp), %rsi - movq 14*8(%rsp), %rdi - UNWIND_HINT_IRET_REGS offset=16*8 - .endm - .macro RESTORE_C_REGS - RESTORE_C_REGS_HELPER 1,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RAX - RESTORE_C_REGS_HELPER 0,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX - RESTORE_C_REGS_HELPER 1,0,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_R11 - RESTORE_C_REGS_HELPER 1,1,0,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX_R11 - RESTORE_C_REGS_HELPER 1,0,0,1,1 - .endm - - .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - subq $-(15*8+\addskip), %rsp - .endm - .macro icebp .byte 0xf1 .endm From 43e4111086a70c78bedb6ad990bee97f17b27a6e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 00:59:07 -0700 Subject: [PATCH 0823/1085] xen, x86/entry/64: Add xen NMI trap entry 
Instead of trying to execute any NMI via the bare metal's NMI trap handler, use a Xen-specific one for PV domains, like we do for e.g. debug traps. As in a PV domain the NMI is handled via the normal kernel stack, this is the correct thing to do. This will enable us to get rid of the very fragile and questionable dependencies between the bare metal NMI handler and Xen assumptions believed to be broken anyway. Signed-off-by: Juergen Gross Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5baf5c0528d58402441550c5770b98e7961e7680.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/traps.h | 2 +- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5b2f0bc661a0..a3f76ab5d0ea 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1078,6 +1078,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN +idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 idtentry xenint3 do_int3 has_error_code=0 #endif @@ -1240,7 +1241,6 @@ ENTRY(error_exit) END(error_exit) /* Runs on exception stack */ -/* XXX: broken on Xen PV */ ENTRY(nmi) UNWIND_HINT_IRET_REGS /* diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index da3c3a3674a5..e76ce80ca18b 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -37,9 +37,9 @@ asmlinkage void simd_coprocessor_error(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) asmlinkage void xen_divide_error(void); +asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_xenint3(void); -asmlinkage void xen_nmi(void); asmlinkage void xen_overflow(void); asmlinkage void xen_bounds(void); asmlinkage void xen_invalid_op(void); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 69b9deff7e5c..8da4eff19c2a 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -600,7 +600,7 @@ static struct trap_array_entry trap_array[] = { #ifdef CONFIG_X86_MCE { machine_check, xen_machine_check, true }, #endif - { nmi, xen_nmi, true }, + { nmi, xen_xennmi, true }, { overflow, xen_overflow, false }, #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index dae2cc33afb5..286ecc198562 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -29,7 +29,7 @@ xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap int3 xen_pv_trap xenint3 -xen_pv_trap nmi +xen_pv_trap xennmi xen_pv_trap overflow xen_pv_trap bounds xen_pv_trap invalid_op From 929bacec21478a72c78e4f29f98fb799bd00105a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:08 -0700 Subject: [PATCH 0824/1085] x86/entry/64: De-Xen-ify our NMI code Xen PV is fundamentally incompatible with our fancy NMI code: it doesn't use IST at all, and Xen entries clobber two stack slots below the hardware frame. Drop Xen PV support from our NMI code entirely. 
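In outline, the nested-NMI bookkeeping that makes this code so delicate behaves like the following C sketch. This is a deliberate simplification under the assumption of one CPU-local flag pair; the real code keeps the equivalent state in dedicated stack slots, as the comments in entry_64.S describe.

#include <stdbool.h>

/* CPU-local state; the real code stores these words on the NMI stack. */
static bool nmi_executing;
static bool nmi_pending;

static void handle_nmi_event(void)
{
	/* The actual NMI work would happen here. */
}

static void nmi_entry(void)
{
	if (nmi_executing) {
		/* Nested NMI: record it and return immediately. */
		nmi_pending = true;
		return;
	}

	nmi_executing = true;
	do {
		nmi_pending = false;
		handle_nmi_event();
		/* Re-run if another NMI arrived while we were handling. */
	} while (nmi_pending);
	nmi_executing = false;
}

Any entry path that clobbers the slots holding this state, as Xen PV entries can, breaks the nesting detection, which is why the Xen path is split off.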
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bfbe711b5ae03f672f8848999a8eb2711efc7f98.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a3f76ab5d0ea..40e9933a2d33 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1240,9 +1240,13 @@ ENTRY(error_exit) jmp retint_user END(error_exit) -/* Runs on exception stack */ +/* + * Runs on exception stack. Xen PV does not go through this path at all, + * so we can use real assembly here. + */ ENTRY(nmi) UNWIND_HINT_IRET_REGS + /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1300,7 +1304,7 @@ ENTRY(nmi) * stacks lest we corrupt the "NMI executing" variable. */ - SWAPGS_UNSAFE_STACK + swapgs cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1465,7 +1469,7 @@ nested_nmi_out: popq %rdx /* We are returning to kernel mode, so this cannot result in a fault. */ - INTERRUPT_RETURN + iretq first_nmi: /* Restore rdx. */ @@ -1496,7 +1500,7 @@ first_nmi: pushfq /* RFLAGS */ pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ - INTERRUPT_RETURN /* continues at repeat_nmi below */ + iretq /* continues at repeat_nmi below */ UNWIND_HINT_IRET_REGS 1: #endif @@ -1571,20 +1575,22 @@ nmi_restore: /* * Clear "NMI executing". Set DF first so that we can easily * distinguish the remaining code between here and IRET from - * the SYSCALL entry and exit paths. On a native kernel, we - * could just inspect RIP, but, on paravirt kernels, - * INTERRUPT_RETURN can translate into a jump into a - * hypercall page. + * the SYSCALL entry and exit paths. + * + * We arguably should just inspect RIP instead, but I (Andy) wrote + * this code when I had the misapprehension that Xen PV supported + * NMIs, and Xen PV would break that approach. */ std movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* - * INTERRUPT_RETURN reads the "iret" frame and exits the NMI - * stack in a single instruction. We are returning to kernel - * mode, so this cannot result in a fault. + * iretq reads the "iret" frame and exits the NMI stack in a + * single instruction. We are returning to kernel mode, so this + * cannot result in a fault. Similarly, we don't need to worry + * about espfix64 on the way back to kernel mode. */ - INTERRUPT_RETURN + iretq END(nmi) ENTRY(ignore_sysret) From bd7dc5a6afac719d8ce4092391eef2c7e83c2a75 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:09 -0700 Subject: [PATCH 0825/1085] x86/entry/32: Pull the MSR_IA32_SYSENTER_CS update code out of native_load_sp0() This causes the MSR_IA32_SYSENTER_CS write to move out of the paravirt callback. This shouldn't affect Xen PV: Xen already ignores MSR_IA32_SYSENTER_ESP writes. In any event, Xen doesn't support vm86() in a useful way. Note to any potential backporters: This patch won't break lguest, as lguest didn't have any SYSENTER support at all. 
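The refresh_sysenter_cs() helper introduced below uses a common pattern: cache the last value written and skip the expensive privileged write when nothing changed. A user-space sketch of the idea follows; wrmsr() here is a mock, not the kernel helper, and the MSR number is illustrative.

#include <stdint.h>
#include <stdio.h>

/* Mock of an expensive privileged write (illustrative only). */
static void wrmsr(uint32_t msr, uint64_t val)
{
	printf("wrmsr(%#x, %#llx)\n", msr, (unsigned long long)val);
}

#define MOCK_MSR_SYSENTER_CS 0x174

static uint64_t cached_sysenter_cs;

/* Only touch the MSR when the desired value differs from the cache. */
static void refresh_sysenter_cs_like(uint64_t want)
{
	if (cached_sysenter_cs == want)
		return;

	cached_sysenter_cs = want;
	wrmsr(MOCK_MSR_SYSENTER_CS, want);
}

int main(void)
{
	refresh_sysenter_cs_like(0x10);	/* writes */
	refresh_sysenter_cs_like(0x10);	/* skipped: cache hit */
	refresh_sysenter_cs_like(0);	/* writes */
	return 0;
}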
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/75cf09fe03ae778532d0ca6c65aa58e66bc2f90c.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 7 ------- arch/x86/include/asm/switch_to.h | 12 ++++++++++++ arch/x86/kernel/process_32.c | 4 +++- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 6 +++++- 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b390ff76e58f..0167e3e35a57 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -520,13 +520,6 @@ static inline void native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { tss->x86_tss.sp0 = thread->sp0; -#ifdef CONFIG_X86_32 - /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -#endif } static inline void native_swapgs(void) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index fcc5cd387fd1..7ae8caffbada 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -72,4 +72,16 @@ do { \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) +#ifdef CONFIG_X86_32 +static inline void refresh_sysenter_cs(struct thread_struct *thread) +{ + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + return; + + this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); +} +#endif + #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 11966251cd42..0936ed3da6b6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0 and cpu_current_top_of_stack. This changes - * current_thread_info(). + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. */ load_sp0(tss, next); + refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 302e7b2572d1..a6ff6d1a0110 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -464,7 +464,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ this_cpu_write(current_task, next_p); - /* Reload esp0 and ss1. This changes current_thread_info(). */ + /* Reload sp0. 
*/ load_sp0(tss, next); /* diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 7924a5356c8a..5bc1c3ab6287 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -54,6 +54,7 @@ #include #include #include +#include /* * Known problems: @@ -149,6 +150,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, &tsk->thread); + refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; put_cpu(); @@ -368,8 +370,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; + refresh_sysenter_cs(&tsk->thread); + } load_sp0(tss, &tsk->thread); put_cpu(); From da51da189a24bb9b7e2d5a123be096e51a4695a5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:10 -0700 Subject: [PATCH 0826/1085] x86/entry/64: Pass SP0 directly to load_sp0() load_sp0() had an odd signature: void load_sp0(struct tss_struct *tss, struct thread_struct *thread); Simplify it to: void load_sp0(unsigned long sp0); Also simplify a few get_cpu()/put_cpu() sequences to preempt_disable()/preempt_enable(). Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2655d8b42ed940aa384fe18ee1129bbbcf730a08.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 5 ++--- arch/x86/include/asm/paravirt_types.h | 2 +- arch/x86/include/asm/processor.h | 9 ++++----- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 14 ++++++-------- arch/x86/xen/enlighten_pv.c | 7 +++---- 8 files changed, 20 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 12deec722cf0..43d4f90edebc 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,10 +15,9 @@ #include #include -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); + PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); } /* The paravirtualized CPUID instruction. 
*/ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 280d94c36dad..a916788ac478 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -133,7 +133,7 @@ struct pv_cpu_ops { void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + void (*load_sp0)(unsigned long sp0); void (*set_iopl_mask)(unsigned mask); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0167e3e35a57..064b84722166 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -517,9 +517,9 @@ static inline void native_set_iopl_mask(unsigned mask) } static inline void -native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +native_load_sp0(unsigned long sp0) { - tss->x86_tss.sp0 = thread->sp0; + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } static inline void native_swapgs(void) @@ -544,10 +544,9 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - native_load_sp0(tss, thread); + native_load_sp0(sp0); } #define set_iopl_mask native_set_iopl_mask diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 03bb004bb15e..4e7fb9c3bfa5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1570,7 +1570,7 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(t, ¤t->thread); + load_sp0(current->thread.sp0); set_tss_desc(cpu, t); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1625,7 +1625,7 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(t, thread); + load_sp0(thread->sp0); set_tss_desc(cpu, t); load_TR_desc(); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0936ed3da6b6..40b85870e429 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - load_sp0(tss, next); + load_sp0(next->sp0); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a6ff6d1a0110..2124304fb77a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); /* Reload sp0. 
*/ - load_sp0(tss, next); + load_sp0(next->sp0); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5bc1c3ab6287..0f1d92cd20ad 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -94,7 +94,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; @@ -146,13 +145,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) do_exit(SIGSEGV); } - tss = &per_cpu(cpu_tss, get_cpu()); + preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &tsk->thread); + load_sp0(tsk->thread.sp0); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; - put_cpu(); + preempt_enable(); memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); @@ -238,7 +237,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86 *vm86 = tsk->thread.vm86; struct kernel_vm86_regs vm86regs; @@ -366,8 +364,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86->saved_sp0 = tsk->thread.sp0; lazy_save_gs(vm86->regs32.gs); - tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ + preempt_disable(); tsk->thread.sp0 += 16; if (static_cpu_has(X86_FEATURE_SEP)) { @@ -375,8 +373,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - load_sp0(tss, &tsk->thread); - put_cpu(); + load_sp0(tsk->thread.sp0); + preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 8da4eff19c2a..e7b213047724 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -810,15 +810,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, } } -static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static void xen_load_sp0(unsigned long sp0) { struct multicall_space mcs; mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - tss->x86_tss.sp0 = thread->sp0; + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) From 3500130b84a3cdc5b6796eba1daf178944935efe Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:11 -0700 Subject: [PATCH 0827/1085] x86/entry: Add task_top_of_stack() to find the top of a task's stack This will let us get rid of a few places that hardcode accesses to thread.sp0. 
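For illustration, the new helper is nothing more than pointer arithmetic on the pt_regs block that lives at the top of every kernel stack; a minimal sketch of what it computes (the macro itself is in the hunk below):

	/*
	 * task_pt_regs(task) points at the pt_regs area at the top of the
	 * task's kernel stack, so stepping one struct past it yields the
	 * top-of-stack address: the stack page plus THREAD_SIZE, minus any
	 * top-of-stack padding the architecture reserves.
	 */
	unsigned long top = (unsigned long)(task_pt_regs(task) + 1);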
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b49b3f95a8ff858c40c9b0f5b32be0355324327d.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 064b84722166..ad59cec14239 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -795,6 +795,8 @@ static inline void spin_lock_prefetch(const void *x) #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) +#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). From f16b3da1dc936c0f8121741d0a1731bf242f2f56 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:12 -0700 Subject: [PATCH 0828/1085] x86/xen/64, x86/entry/64: Clean up SP code in cpu_initialize_context() I'm removing thread_struct::sp0, and Xen's usage of it is slightly dubious and unnecessary. Use appropriate helpers instead. While we're at it, reorder the code slightly to make it more obvious what's going on. Signed-off-by: Andy Lutomirski Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d5b9a3da2b47c68325bd2bbe8f82d9554dee0d0f.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/xen/smp_pv.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 51471408fdd1..8c0e047d0b80 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -13,6 +13,7 @@ * single-threaded. */ #include +#include #include #include #include @@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) #endif memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + /* + * Bring up the CPU in cpu_bringup_and_idle() with the stack + * pointing just below where pt_regs would be if it were a normal + * kernel entry. + */ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle); xen_copy_trap_info(ctxt->trap_ctxt); @@ -313,8 +321,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; + /* + * Set SS:SP that Xen will use when entering guest kernel mode + * from guest user mode. Subsequent calls to load_sp0() can + * change this value.
+ */ ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_sp = task_top_of_stack(idle); #ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; @@ -326,10 +339,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) BUG(); From 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:13 -0700 Subject: [PATCH 0829/1085] x86/entry/64: Stop initializing TSS.sp0 at boot In my quest to get rid of thread_struct::sp0, I want to clean up or remove all of its readers. Two of them are in cpu_init() (32-bit and 64-bit), and they aren't needed. This is because we never enter userspace at all on the threads that CPUs are initialized in. Poison the initial TSS.sp0 and stop initializing it on CPU init. The comment text mostly comes from Dave Hansen. Thanks! Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ee4a00540ad28c6cff475fbcc7769a4460acc861.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 13 ++++++++++--- arch/x86/kernel/process.c | 8 +++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4e7fb9c3bfa5..cdf79ab628c2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1570,9 +1570,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(current->thread.sp0); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); clear_all_debug_regs(); @@ -1594,7 +1598,6 @@ void cpu_init(void) int cpu = smp_processor_id(); struct task_struct *curr = current; struct tss_struct *t = &per_cpu(cpu_tss, cpu); - struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); @@ -1625,9 +1628,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(thread->sp0); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bd6b85fac666..ff8a9acbcf8b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -48,7 +48,13 @@ */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .x86_tss = { - .sp0 = TOP_OF_INIT_STACK, + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. 
+ */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, From 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:14 -0700 Subject: [PATCH 0830/1085] x86/entry/64: Remove all remaining direct thread_struct::sp0 reads The only remaining readers are in context switch code or vm86(), and they all just want to update TSS.sp0 to match the current task. Replace them all with a new helper update_sp0(). Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2d231687f4ff288c9d9e98d7861b7df374246ac3.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/switch_to.h | 6 ++++++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 4 ++-- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 7ae8caffbada..54e64d909725 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -84,4 +84,10 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) } #endif +/* This is used when switching tasks or entering/exiting vm86 mode. */ +static inline void update_sp0(struct task_struct *task) +{ + load_sp0(task->thread.sp0); +} + #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 40b85870e429..45bf0c5f93e1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - load_sp0(next->sp0); + update_sp0(next_p); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 2124304fb77a..45e380958392 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); /* Reload sp0. */ - load_sp0(next->sp0); + update_sp0(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 0f1d92cd20ad..a7b44c75c642 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -148,7 +148,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tsk->thread.sp0); + update_sp0(tsk); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; preempt_enable(); @@ -373,7 +373,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - load_sp0(tsk->thread.sp0); + update_sp0(tsk); preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) From cd493a6deb8b78eca280d05f7fa73fd69403ae29 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:15 -0700 Subject: [PATCH 0831/1085] x86/entry/32: Fix cpu_current_top_of_stack initialization at boot cpu_current_top_of_stack's initialization forgot about TOP_OF_KERNEL_STACK_PADDING. This bug didn't matter because the idle threads never enter user mode.
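To make the off-by-padding bug concrete, here is the arithmetic (a sketch; the exact padding constant depends on the configuration, so treat the numbers as an assumption):

	/* Old initialization, which overshoots the real top of stack: */
	(unsigned long)task_stack_page(idle) + THREAD_SIZE;

	/* task_top_of_stack(idle), which accounts for the padding: */
	(unsigned long)task_stack_page(idle) + THREAD_SIZE
		- TOP_OF_KERNEL_STACK_PADDING;

On 32-bit kernels the padding is nonzero (8 or 16 bytes depending on vm86 support, if memory serves), which is why only the CONFIG_X86_32 branch below needs fixing.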
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5e370a7e6e4fddd1c4e4cf619765d96bb874b21.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ad59edd84de7..06c18fe1c09e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -961,8 +961,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - per_cpu(cpu_current_top_of_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif From d375cf1530595e33961a8844192cddab913650e3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:16 -0700 Subject: [PATCH 0832/1085] x86/entry/64: Remove thread_struct::sp0 On x86_64, we can easily calculate sp0 when needed instead of storing it in thread_struct. On x86_32, a similar cleanup would be possible, but it would require cleaning up the vm86 code first, and that can wait for a later cleanup series. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/719cd9c66c548c4350d98a90f050aee8b17f8919.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/compat.h | 1 + arch/x86/include/asm/processor.h | 28 +++++++++------------------- arch/x86/include/asm/switch_to.h | 6 ++++++ arch/x86/kernel/process_64.c | 1 - 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 5343c19814b3..948b6d8ec46f 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ad59cec14239..ae2ae6d80674 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -430,7 +430,9 @@ typedef struct { struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; +#ifdef CONFIG_X86_32 unsigned long sp0; +#endif unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; @@ -797,6 +799,13 @@ static inline void spin_lock_prefetch(const void *x) #define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ +}) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -816,23 +825,6 @@ static inline void spin_lock_prefetch(const void *x) .addr_limit = KERNEL_DS, \ } -/* - * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. - * This is necessary to guarantee that the entire "struct pt_regs" - * is accessible even if the CPU haven't stored the SS/ESP registers - * on the stack (interrupt gate does not save these registers - * when switching to the same priv ring). 
- * Therefore beware: accessing the ss/esp fields of the - * "struct pt_regs" is possible, but they may contain the - * completely wrong values. - */ -#define task_pt_regs(task) \ -({ \ - unsigned long __ptr = (unsigned long)task_stack_page(task); \ - __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ - ((struct pt_regs *)__ptr) - 1; \ -}) - #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else @@ -866,11 +858,9 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK, \ .addr_limit = KERNEL_DS, \ } -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 54e64d909725..010cd6e4eafc 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include + struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to_asm(struct task_struct *prev, @@ -87,7 +89,11 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) /* This is used when switching tasks or entering/exiting vm86 mode. */ static inline void update_sp0(struct task_struct *task) { +#ifdef CONFIG_X86_32 load_sp0(task->thread.sp0); +#else + load_sp0(task_top_of_stack(task)); +#endif } #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 45e380958392..eeeb34f85c25 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct inactive_task_frame *frame; struct task_struct *me = current; - p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; From 3383642c2f9d4f5b4fa37436db4a109a1a10018c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:17 -0700 Subject: [PATCH 0833/1085] x86/traps: Use a new on_thread_stack() helper to clean up an assertion Let's keep the stack-related logic together rather than open-coding a comparison in an assertion in the traps code. 
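The helper added below folds the two bounds checks into a single unsigned comparison; a minimal sketch of why that is enough:

	/*
	 * top - sp is evaluated in unsigned arithmetic:
	 *  - sp within the stack: 0 <= top - sp < THREAD_SIZE      -> true
	 *  - sp above the top:    the subtraction wraps to a huge
	 *                         unsigned value                   -> false
	 *  - sp far below:        top - sp >= THREAD_SIZE          -> false
	 */
	return (unsigned long)(current_top_of_stack() -
			       current_stack_pointer) < THREAD_SIZE;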
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/856b15bee1f55017b8f79d3758b0d51c48a08cf8.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 6 ++++++ arch/x86/kernel/traps.c | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ae2ae6d80674..f10dae14f951 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -541,6 +541,12 @@ static inline unsigned long current_top_of_stack(void) #endif } +static inline bool on_thread_stack(void) +{ + return (unsigned long)(current_top_of_stack() - + current_stack_pointer) < THREAD_SIZE; +} + #ifdef CONFIG_PARAVIRT #include #else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 67db4f43309e..42a9c4458f5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer) >= THREAD_SIZE); + BUG_ON(!on_thread_stack()); preempt_enable_no_resched(); } From ca5cd8c9400c7eeeda84e34fa773c6c245e65c82 Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Thu, 2 Nov 2017 14:54:00 +0530 Subject: [PATCH 0834/1085] regulator: qcom_spmi: Add support for pmi8994 Document the regulators available on pmi8994 and add support for this PMIC to the SPMI PMIC regulator driver. Signed-off-by: Rajendra Nayak Signed-off-by: Mark Brown --- .../bindings/regulator/qcom,spmi-regulator.txt | 13 +++++++++++++ drivers/regulator/qcom_spmi-regulator.c | 9 +++++++++ 2 files changed, 22 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt index 0fa3b0fac129..57d2c65899df 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt +++ b/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt @@ -8,6 +8,7 @@ Qualcomm SPMI Regulators "qcom,pm8916-regulators" "qcom,pm8941-regulators" "qcom,pm8994-regulators" + "qcom,pmi8994-regulators" - interrupts: Usage: optional @@ -100,6 +101,15 @@ Qualcomm SPMI Regulators Definition: Reference to regulator supplying the input pin, as described in the data sheet. +- vdd_s1-supply: +- vdd_s2-supply: +- vdd_s3-supply: +- vdd_l1-supply: + Usage: optional (pmi8994 only) + Value type: + Definition: Reference to regulator supplying the input pin, as + described in the data sheet. + The regulator node houses sub-nodes for each regulator within the device. 
Each sub-node is identified using the node's name, with valid values listed for each @@ -122,6 +132,9 @@ pm8994: l6, l7, l8, l9, l10, l11, l12, l13, l14, l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, l25, l26, l27, l28, l29, l30, l31, l32, lvs1, lvs2 +pmi8994: + s1, s2, s3, l1 + The content of each sub-node is defined by the standard binding for regulators - see regulator.txt - with additional custom properties described below: diff --git a/drivers/regulator/qcom_spmi-regulator.c b/drivers/regulator/qcom_spmi-regulator.c index 16c5f84e06a7..9f8eb5dd2a4e 100644 --- a/drivers/regulator/qcom_spmi-regulator.c +++ b/drivers/regulator/qcom_spmi-regulator.c @@ -1619,11 +1619,20 @@ static const struct spmi_regulator_data pm8994_regulators[] = { { } }; +static const struct spmi_regulator_data pmi8994_regulators[] = { + { "s1", 0x1400, "vdd_s1", }, + { "s2", 0x1700, "vdd_s2", }, + { "s3", 0x1a00, "vdd_s3", }, + { "l1", 0x4000, "vdd_l1", }, + { } +}; + static const struct of_device_id qcom_spmi_regulator_match[] = { { .compatible = "qcom,pm8841-regulators", .data = &pm8841_regulators }, { .compatible = "qcom,pm8916-regulators", .data = &pm8916_regulators }, { .compatible = "qcom,pm8941-regulators", .data = &pm8941_regulators }, { .compatible = "qcom,pm8994-regulators", .data = &pm8994_regulators }, + { .compatible = "qcom,pmi8994-regulators", .data = &pmi8994_regulators }, { } }; MODULE_DEVICE_TABLE(of, qcom_spmi_regulator_match); From 36735783fdb599c94b9c86824583df367c65900b Mon Sep 17 00:00:00 2001 From: Hiromitsu Yamasaki Date: Thu, 2 Nov 2017 10:32:36 +0100 Subject: [PATCH 0835/1085] spi: sh-msiof: Fix DMA transfer size check DMA supports 32-bit words only, even if BITLEN1 of SITMDR2 register is 16bit. Fixes: b0d0ce8b6b91 ("spi: sh-msiof: Add DMA support") Signed-off-by: Hiromitsu Yamasaki Signed-off-by: Simon Horman Acked-by: Geert Uytterhoeven Acked-by: Dirk Behme Signed-off-by: Mark Brown --- drivers/spi/spi-sh-msiof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index 0eb1e9583485..837bb95eea62 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -900,7 +900,7 @@ static int sh_msiof_transfer_one(struct spi_master *master, break; copy32 = copy_bswap32; } else if (bits <= 16) { - if (l & 1) + if (l & 3) break; copy32 = copy_wswap32; } else { From 2a2d7befd40c95fd0d27d14edd0ec3b479fcf21f Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 27 Oct 2017 12:44:48 +0200 Subject: [PATCH 0836/1085] s390/nmi: avoid using long-displacement facility __LC_MCESAD is currently 4528 /* offsetof(struct lowcore, mcesad) */ that would require long-displacement facility for lg, which we don't have on z900. 
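As a worked example of the displacement arithmetic: this code path has already loaded %r14 with 4095 (see the la %r14,4095 in the context below). Without the long-displacement facility a displacement must fit in 12 bits, i.e. 0..4095, and 4528 does not, but 4528 - 4095 = 433 does, so __LC_MCESAD-4095(%r14) reaches the same lowcore field on every machine.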
Fixes: 3037a52f9846 ("s390/nmi: do register validation as early as possible") Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 9887d3ed6eb1..8a0b54e7e946 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -964,7 +964,7 @@ ENTRY(mcck_int_handler) la %r14,4095 lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs ptlb - lg %r11,__LC_MCESAD # extended machine check save area + lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area nill %r11,0xfc00 # MCESA_ORIGIN_MASK TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE jno 0f From 6082a6e44434a17f194048b7d48df56f148ec6d4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 1 Nov 2017 11:04:51 -0700 Subject: [PATCH 0837/1085] kernel/time/Kconfig: Fix typo in comment Fix typo in Kconfig comment text. Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Cc: Jiri Kosina Link: https://lkml.kernel.org/r/0e586dd4-2b27-864e-c252-bc72df52fd01@infradead.org --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index ac09bc29eb08..d689a9557e17 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -56,7 +56,7 @@ menu "Timers subsystem" # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices -# are supported independ of this. +# are supported independent of this. config TICK_ONESHOT bool From 1e4c4f610f774df6088d7c065b2dd4d22adba698 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 2 Nov 2017 13:09:26 +0100 Subject: [PATCH 0838/1085] x86/entry/64: Shorten TEST instructions Convert TESTL to TESTB and save 3 bytes per callsite. No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171102120926.4srwerqrr7g72e2k@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 40e9933a2d33..84263c79a119 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -620,7 +620,7 @@ GLOBAL(retint_user) GLOBAL(swapgs_restore_regs_and_return_to_usermode) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ - testl $3, CS(%rsp) + testb $3, CS(%rsp) jnz 1f ud2 1: @@ -653,7 +653,7 @@ retint_kernel: GLOBAL(restore_regs_and_return_to_kernel) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates kernel mode. */ - testl $3, CS(%rsp) + testb $3, CS(%rsp) jz 1f ud2 1: From f747c3104efd1e21528d368910afd978fbcd6a78 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Nov 2017 12:16:25 +0000 Subject: [PATCH 0839/1085] spi: orion: remove redundant assignment of status to zero The assignment of status to zero is never read; status is either updated in the next iteration of the loop or several lines after the end of the loop.
Remove it; this cleans up the clang warning: drivers/spi/spi-orion.c:674:4: warning: Value stored to 'status' is never read Signed-off-by: Colin Ian King Signed-off-by: Mark Brown --- drivers/spi/spi-orion.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-orion.c b/drivers/spi/spi-orion.c index 4b6dd73b80da..8974bb340b3a 100644 --- a/drivers/spi/spi-orion.c +++ b/drivers/spi/spi-orion.c @@ -671,7 +671,6 @@ static int orion_spi_probe(struct platform_device *pdev) dev_err(&pdev->dev, "%pOF has no valid 'reg' property (%d)\n", np, status); - status = 0; continue; } From 06dd688ddda5819025e014b79aea9af6ab475fa2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Nov 2017 13:22:35 +0100 Subject: [PATCH 0840/1085] x86/cpuid: Replace set/clear_bit32() Peter pointed out that the set/clear_bit32() variants are broken in various aspects. Replace them with open coded set/clear_bit() and type cast cpu_info::x86_capability as it's done in all other places throughout x86. Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") Reported-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Cc: Andi Kleen --- arch/x86/kernel/cpu/cpuid-deps.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index c21f22d836ad..904b0a3c4e53 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -62,23 +62,19 @@ const static struct cpuid_dep cpuid_deps[] = { {} }; -static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) -{ - clear_bit32(bit, c->x86_capability); -} - -static inline void __setup_clear_cpu_cap(unsigned int bit) -{ - clear_cpu_cap(&boot_cpu_data, bit); - set_bit32(bit, cpu_caps_cleared); -} - static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) { + /* + * Note: This could use the non atomic __*_bit() variants, but the + * rest of the cpufeature code uses atomics as well, so keep it for + * consistency. Cleanup all of it separately. + */ + if (!c) { + clear_cpu_cap(&boot_cpu_data, feature); + set_bit(feature, (unsigned long *)cpu_caps_cleared); + } else { + clear_bit(feature, (unsigned long *)c->x86_capability); + } } /* Take the capabilities and the BUG bits into account */ From 1943dc07b45e347c52c1bfdd4a37e04a86e399aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Nov 2017 13:30:03 +0100 Subject: [PATCH 0841/1085] bitops: Revert cbe96375025e ("bitops: Add clear/set_bit32() to linux/bitops.h") These ops are not endian safe and may break on architectures which have alignment requirements. Reverts: cbe96375025e ("bitops: Add clear/set_bit32() to linux/bitops.h") Reported-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Cc: Andi Kleen --- include/linux/bitops.h | 26 -------------------- 1 file changed, 26 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 36794f058ba6..8fbe259b197c 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -227,32 +227,6 @@ static inline unsigned long __ffs64(u64 word) return __ffs((unsigned long)word); } -/* - * clear_bit32 - Clear a bit in memory for u32 array - * @nr: Bit to clear - * @addr: u32 * address of bitmap - * - * Same as clear_bit, but avoids needing casts for u32 arrays.
- */ - -static __always_inline void clear_bit32(long nr, volatile u32 *addr) -{ - clear_bit(nr, (volatile unsigned long *)addr); -} - -/* - * set_bit32 - Set a bit in memory for u32 array - * @nr: Bit to clear - * @addr: u32 * address of bitmap - * - * Same as set_bit, but avoids needing casts for u32 arrays. - */ - -static __always_inline void set_bit32(long nr, volatile u32 *addr) -{ - set_bit(nr, (volatile unsigned long *)addr); -} - #ifdef __KERNEL__ #ifndef set_mask_bits From 689362b3c9ebfb8c6f8f903d107fb784d78e79b1 Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:30 +0800 Subject: [PATCH 0842/1085] arm64: dts: mt8173: remove "mediatek,mt8135-mmc" from mmc nodes The devicetree bindings have been updated to support multiple platforms, so that each platform has its own compatible name. This compatible name may then be used in the driver to distinguish one platform from another.
Signed-off-by: Chaotian Jing Tested-by: Sean Wang Signed-off-by: Ulf Hansson Acked-by: Matthias Brugger --- arch/arm64/boot/dts/mediatek/mt8173.dtsi | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi index b99a27372965..26396ef53bde 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi @@ -682,8 +682,7 @@ }; mmc0: mmc@11230000 { - compatible = "mediatek,mt8173-mmc", - "mediatek,mt8135-mmc"; + compatible = "mediatek,mt8173-mmc"; reg = <0 0x11230000 0 0x1000>; interrupts = ; clocks = <&pericfg CLK_PERI_MSDC30_0>, @@ -693,8 +692,7 @@ }; mmc1: mmc@11240000 { - compatible = "mediatek,mt8173-mmc", - "mediatek,mt8135-mmc"; + compatible = "mediatek,mt8173-mmc"; reg = <0 0x11240000 0 0x1000>; interrupts = ; clocks = <&pericfg CLK_PERI_MSDC30_1>, @@ -704,8 +702,7 @@ }; mmc2: mmc@11250000 { - compatible = "mediatek,mt8173-mmc", - "mediatek,mt8135-mmc"; + compatible = "mediatek,mt8173-mmc"; reg = <0 0x11250000 0 0x1000>; interrupts = ; clocks = <&pericfg CLK_PERI_MSDC30_2>, @@ -715,8 +712,7 @@ }; mmc3: mmc@11260000 { - compatible = "mediatek,mt8173-mmc", - "mediatek,mt8135-mmc"; + compatible = "mediatek,mt8173-mmc"; reg = <0 0x11260000 0 0x1000>; interrupts = ; clocks = <&pericfg CLK_PERI_MSDC30_3>, From 7f3d58523d12e233de9896fd26781862ca4985d6 Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:31 +0800 Subject: [PATCH 0843/1085] mmc: mediatek: make hs400_tune_response only for mt8173 The original design of hs400_tune_response is only for mt8173, because mt8173 has a special design. To handle this, add a new member, "hs400_tune", to the per-SoC compatible data; for now it is only set for mt8173.
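For context, the per-SoC data is selected from the DT compatible string at probe time; a rough sketch of the usual OF match pattern (the real probe code is not part of this patch, so treat the helper below as an assumption):

	/* Sketch: bind the per-compatible capability table to the host. */
	static int msdc_pick_compat(struct msdc_host *host,
				    struct platform_device *pdev)
	{
		const struct of_device_id *of_id;

		of_id = of_match_node(msdc_of_ids, pdev->dev.of_node);
		if (!of_id)
			return -EINVAL;

		host->dev_comp = of_id->data;	/* e.g. &mt8173_compat */
		return 0;
	}

With that in place, host->dev_comp->hs400_tune cleanly gates the mt8173-only tuning path, as the hunks below show.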
-663,10 +670,10 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) */ if (host->sclk <= 52000000) { writel(host->def_tune_para.iocon, host->base + MSDC_IOCON); - writel(host->def_tune_para.pad_tune, host->base + MSDC_PAD_TUNE); + writel(host->def_tune_para.pad_tune, host->base + tune_reg); } else { writel(host->saved_tune_para.iocon, host->base + MSDC_IOCON); - writel(host->saved_tune_para.pad_tune, host->base + MSDC_PAD_TUNE); + writel(host->saved_tune_para.pad_tune, host->base + tune_reg); writel(host->saved_tune_para.pad_cmd_tune, host->base + PAD_CMD_TUNE); } @@ -1224,6 +1231,7 @@ static irqreturn_t msdc_irq(int irq, void *dev_id) static void msdc_init_hw(struct msdc_host *host) { u32 val; + u32 tune_reg = host->dev_comp->pad_tune_reg; /* Configure to MMC/SD mode, clock free running */ sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_MODE | MSDC_CFG_CKPDN); @@ -1239,7 +1247,7 @@ static void msdc_init_hw(struct msdc_host *host) val = readl(host->base + MSDC_INT); writel(val, host->base + MSDC_INT); - writel(0, host->base + MSDC_PAD_TUNE); + writel(0, host->base + tune_reg); writel(0, host->base + MSDC_IOCON); sdr_set_field(host->base + MSDC_IOCON, MSDC_IOCON_DDLSEL, 0); writel(0x403c0046, host->base + MSDC_PATCH_BIT); @@ -1259,7 +1267,7 @@ static void msdc_init_hw(struct msdc_host *host) sdr_set_field(host->base + SDC_CFG, SDC_CFG_DTOC, 3); host->def_tune_para.iocon = readl(host->base + MSDC_IOCON); - host->def_tune_para.pad_tune = readl(host->base + MSDC_PAD_TUNE); + host->def_tune_para.pad_tune = readl(host->base + tune_reg); dev_dbg(host->dev, "init hardware done!"); } @@ -1402,18 +1410,19 @@ static int msdc_tune_response(struct mmc_host *mmc, u32 opcode) struct msdc_delay_phase internal_delay_phase; u8 final_delay, final_maxlen; u32 internal_delay = 0; + u32 tune_reg = host->dev_comp->pad_tune_reg; int cmd_err; int i, j; if (mmc->ios.timing == MMC_TIMING_MMC_HS200 || mmc->ios.timing == MMC_TIMING_UHS_SDR104) - sdr_set_field(host->base + MSDC_PAD_TUNE, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRRDLY, host->hs200_cmd_int_delay); sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL); for (i = 0 ; i < PAD_DELAY_MAX; i++) { - sdr_set_field(host->base + MSDC_PAD_TUNE, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY, i); /* * Using the same parameters, it may sometimes pass the test, @@ -1437,7 +1446,7 @@ static int msdc_tune_response(struct mmc_host *mmc, u32 opcode) sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL); for (i = 0; i < PAD_DELAY_MAX; i++) { - sdr_set_field(host->base + MSDC_PAD_TUNE, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY, i); /* * Using the same parameters, it may sometimes pass the test, @@ -1462,12 +1471,12 @@ skip_fall: final_maxlen = final_fall_delay.maxlen; if (final_maxlen == final_rise_delay.maxlen) { sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL); - sdr_set_field(host->base + MSDC_PAD_TUNE, MSDC_PAD_TUNE_CMDRDLY, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY, final_rise_delay.final_phase); final_delay = final_rise_delay.final_phase; } else { sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL); - sdr_set_field(host->base + MSDC_PAD_TUNE, MSDC_PAD_TUNE_CMDRDLY, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY, final_fall_delay.final_phase); final_delay = final_fall_delay.final_phase; } @@ -1475,7 +1484,7 @@ skip_fall: goto skip_internal; for (i = 0; i < PAD_DELAY_MAX; i++) { - sdr_set_field(host->base + MSDC_PAD_TUNE, + sdr_set_field(host->base + 
tune_reg, MSDC_PAD_TUNE_CMDRRDLY, i); mmc_send_tuning(mmc, opcode, &cmd_err); if (!cmd_err) @@ -1483,7 +1492,7 @@ skip_fall: } dev_dbg(host->dev, "Final internal delay: 0x%x\n", internal_delay); internal_delay_phase = get_best_delay(host, internal_delay); - sdr_set_field(host->base + MSDC_PAD_TUNE, MSDC_PAD_TUNE_CMDRRDLY, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRRDLY, internal_delay_phase.final_phase); skip_internal: dev_dbg(host->dev, "Final cmd pad delay: %x\n", final_delay); @@ -1545,12 +1554,13 @@ static int msdc_tune_data(struct mmc_host *mmc, u32 opcode) u32 rise_delay = 0, fall_delay = 0; struct msdc_delay_phase final_rise_delay, final_fall_delay = { 0,}; u8 final_delay, final_maxlen; + u32 tune_reg = host->dev_comp->pad_tune_reg; int i, ret; sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL); sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL); for (i = 0 ; i < PAD_DELAY_MAX; i++) { - sdr_set_field(host->base + MSDC_PAD_TUNE, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY, i); ret = mmc_send_tuning(mmc, opcode, NULL); if (!ret) @@ -1565,7 +1575,7 @@ static int msdc_tune_data(struct mmc_host *mmc, u32 opcode) sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL); sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL); for (i = 0; i < PAD_DELAY_MAX; i++) { - sdr_set_field(host->base + MSDC_PAD_TUNE, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY, i); ret = mmc_send_tuning(mmc, opcode, NULL); if (!ret) @@ -1578,14 +1588,14 @@ skip_fall: if (final_maxlen == final_rise_delay.maxlen) { sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL); sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL); - sdr_set_field(host->base + MSDC_PAD_TUNE, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY, final_rise_delay.final_phase); final_delay = final_rise_delay.final_phase; } else { sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL); sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL); - sdr_set_field(host->base + MSDC_PAD_TUNE, + sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY, final_fall_delay.final_phase); final_delay = final_fall_delay.final_phase; @@ -1599,6 +1609,7 @@ static int msdc_execute_tuning(struct mmc_host *mmc, u32 opcode) { struct msdc_host *host = mmc_priv(mmc); int ret; + u32 tune_reg = host->dev_comp->pad_tune_reg; if (host->hs400_mode && host->dev_comp->hs400_tune) @@ -1616,7 +1627,7 @@ static int msdc_execute_tuning(struct mmc_host *mmc, u32 opcode) } host->saved_tune_para.iocon = readl(host->base + MSDC_IOCON); - host->saved_tune_para.pad_tune = readl(host->base + MSDC_PAD_TUNE); + host->saved_tune_para.pad_tune = readl(host->base + tune_reg); host->saved_tune_para.pad_cmd_tune = readl(host->base + PAD_CMD_TUNE); return ret; } @@ -1857,10 +1868,12 @@ static int msdc_drv_remove(struct platform_device *pdev) #ifdef CONFIG_PM static void msdc_save_reg(struct msdc_host *host) { + u32 tune_reg = host->dev_comp->pad_tune_reg; + host->save_para.msdc_cfg = readl(host->base + MSDC_CFG); host->save_para.iocon = readl(host->base + MSDC_IOCON); host->save_para.sdc_cfg = readl(host->base + SDC_CFG); - host->save_para.pad_tune = readl(host->base + MSDC_PAD_TUNE); + host->save_para.pad_tune = readl(host->base + tune_reg); host->save_para.patch_bit0 = readl(host->base + MSDC_PATCH_BIT); host->save_para.patch_bit1 = readl(host->base + MSDC_PATCH_BIT1); host->save_para.pad_ds_tune = readl(host->base + PAD_DS_TUNE); @@ -1870,10 +1883,12 @@ static void msdc_save_reg(struct msdc_host *host) static void 
msdc_restore_reg(struct msdc_host *host) { + u32 tune_reg = host->dev_comp->pad_tune_reg; + writel(host->save_para.msdc_cfg, host->base + MSDC_CFG); writel(host->save_para.iocon, host->base + MSDC_IOCON); writel(host->save_para.sdc_cfg, host->base + SDC_CFG); - writel(host->save_para.pad_tune, host->base + MSDC_PAD_TUNE); + writel(host->save_para.pad_tune, host->base + tune_reg); writel(host->save_para.patch_bit0, host->base + MSDC_PATCH_BIT); writel(host->save_para.patch_bit1, host->base + MSDC_PATCH_BIT1); writel(host->save_para.pad_ds_tune, host->base + PAD_DS_TUNE); From 2fea581926703a7cbcf92fa7e3d74d83b5e40d3c Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:33 +0800 Subject: [PATCH 0845/1085] mmc: mediatek: add async fifo and data tune support mt2701/mt2712 support async fifo & data tune, which can improve host stability.
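The hunks below drive the new capability bits almost entirely through the driver's sdr_set_bits()/sdr_clr_bits()/sdr_set_field() accessors; a rough sketch of the read-modify-write pattern they implement (reconstructed, so treat the exact definition as an assumption):

	/* Sketch: update only the bits covered by 'field'. */
	static void sdr_set_field(void __iomem *reg, u32 field, u32 val)
	{
		u32 tv = readl(reg);

		tv &= ~field;			/* clear the field */
		tv |= val << (ffs(field) - 1);	/* place val at the field's LSB */
		writel(tv, reg);
	}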
sdr_set_field(host->base + MSDC_IOCON, MSDC_IOCON_DDLSEL, 0); writel(0x403c0046, host->base + MSDC_PATCH_BIT); sdr_set_field(host->base + MSDC_PATCH_BIT, MSDC_CKGEN_MSDC_DLY_SEL, 1); - writel(0xffff0089, host->base + MSDC_PATCH_BIT1); + writel(0xffff4089, host->base + MSDC_PATCH_BIT1); sdr_set_bits(host->base + EMMC50_CFG0, EMMC50_CFG_CFCSTS_SEL); + if (host->dev_comp->async_fifo) { + sdr_set_field(host->base + MSDC_PATCH_BIT2, + MSDC_PB2_RESPWAIT, 3); + sdr_set_field(host->base + MSDC_PATCH_BIT2, + MSDC_PB2_RESPSTSENSEL, 2); + sdr_set_field(host->base + MSDC_PATCH_BIT2, + MSDC_PB2_CRCSTSENSEL, 2); + /* use async fifo, then no need tune internal delay */ + sdr_clr_bits(host->base + MSDC_PATCH_BIT2, + MSDC_PATCH_BIT2_CFGRESP); + sdr_set_bits(host->base + MSDC_PATCH_BIT2, + MSDC_PATCH_BIT2_CFGCRCSTS); + } + + if (host->dev_comp->data_tune) { + sdr_set_bits(host->base + tune_reg, + MSDC_PAD_TUNE_RD_SEL | MSDC_PAD_TUNE_CMD_SEL); + } else { + /* choose clock tune */ + sdr_set_bits(host->base + tune_reg, MSDC_PAD_TUNE_RXDLYSEL); + } /* Configure to enable SDIO mode. * it's must otherwise sdio cmd5 failed @@ -1268,6 +1310,8 @@ static void msdc_init_hw(struct msdc_host *host) host->def_tune_para.iocon = readl(host->base + MSDC_IOCON); host->def_tune_para.pad_tune = readl(host->base + tune_reg); + host->saved_tune_para.iocon = readl(host->base + MSDC_IOCON); + host->saved_tune_para.pad_tune = readl(host->base + tune_reg); dev_dbg(host->dev, "init hardware done!"); } @@ -1480,7 +1524,7 @@ skip_fall: final_fall_delay.final_phase); final_delay = final_fall_delay.final_phase; } - if (host->hs200_cmd_int_delay) + if (host->dev_comp->async_fifo || host->hs200_cmd_int_delay) goto skip_internal; for (i = 0; i < PAD_DELAY_MAX; i++) { @@ -1638,6 +1682,8 @@ static int msdc_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_ios *ios) host->hs400_mode = true; writel(host->hs400_ds_delay, host->base + PAD_DS_TUNE); + /* hs400 mode must set it to 0 */ + sdr_clr_bits(host->base + MSDC_PATCH_BIT2, MSDC_PATCH_BIT2_CFGCRCSTS); return 0; } @@ -1876,6 +1922,7 @@ static void msdc_save_reg(struct msdc_host *host) host->save_para.pad_tune = readl(host->base + tune_reg); host->save_para.patch_bit0 = readl(host->base + MSDC_PATCH_BIT); host->save_para.patch_bit1 = readl(host->base + MSDC_PATCH_BIT1); + host->save_para.patch_bit2 = readl(host->base + MSDC_PATCH_BIT2); host->save_para.pad_ds_tune = readl(host->base + PAD_DS_TUNE); host->save_para.pad_cmd_tune = readl(host->base + PAD_CMD_TUNE); host->save_para.emmc50_cfg0 = readl(host->base + EMMC50_CFG0); @@ -1891,6 +1938,7 @@ static void msdc_restore_reg(struct msdc_host *host) writel(host->save_para.pad_tune, host->base + tune_reg); writel(host->save_para.patch_bit0, host->base + MSDC_PATCH_BIT); writel(host->save_para.patch_bit1, host->base + MSDC_PATCH_BIT1); + writel(host->save_para.patch_bit2, host->base + MSDC_PATCH_BIT2); writel(host->save_para.pad_ds_tune, host->base + PAD_DS_TUNE); writel(host->save_para.pad_cmd_tune, host->base + PAD_CMD_TUNE); writel(host->save_para.emmc50_cfg0, host->base + EMMC50_CFG0); From acde28c434623bde0812cb3051eeea63428871e8 Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:34 +0800 Subject: [PATCH 0846/1085] mmc: mediatek: add busy_check support bit7 of PATCH_BIT1 has different meaning in new design, to compatible with previous platform, clear this bit in new platform. 
Signed-off-by: Chaotian Jing Tested-by: Sean Wang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index c17168ad2be0..650b94498f7c 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -316,6 +316,7 @@ struct mtk_mmc_compatible { u32 pad_tune_reg; bool async_fifo; bool data_tune; + bool busy_check; }; struct msdc_tune_para { @@ -380,6 +381,7 @@ static const struct mtk_mmc_compatible mt8135_compat = { .pad_tune_reg = MSDC_PAD_TUNE, .async_fifo = false, .data_tune = false, + .busy_check = false, }; static const struct mtk_mmc_compatible mt8173_compat = { @@ -388,6 +390,7 @@ static const struct mtk_mmc_compatible mt8173_compat = { .pad_tune_reg = MSDC_PAD_TUNE, .async_fifo = false, .data_tune = false, + .busy_check = false, }; static const struct mtk_mmc_compatible mt2701_compat = { @@ -396,6 +399,7 @@ static const struct mtk_mmc_compatible mt2701_compat = { .pad_tune_reg = MSDC_PAD_TUNE0, .async_fifo = true, .data_tune = true, + .busy_check = false, }; static const struct mtk_mmc_compatible mt2712_compat = { @@ -404,6 +408,7 @@ static const struct mtk_mmc_compatible mt2712_compat = { .pad_tune_reg = MSDC_PAD_TUNE0, .async_fifo = true, .data_tune = true, + .busy_check = true, }; static const struct of_device_id msdc_of_ids[] = { @@ -1275,6 +1280,8 @@ static void msdc_init_hw(struct msdc_host *host) sdr_set_field(host->base + MSDC_PATCH_BIT, MSDC_CKGEN_MSDC_DLY_SEL, 1); writel(0xffff4089, host->base + MSDC_PATCH_BIT1); sdr_set_bits(host->base + EMMC50_CFG0, EMMC50_CFG_CFCSTS_SEL); + if (host->dev_comp->busy_check) + sdr_clr_bits(host->base + MSDC_PATCH_BIT1, (1 << 7)); if (host->dev_comp->async_fifo) { sdr_set_field(host->base + MSDC_PATCH_BIT2, MSDC_PB2_RESPWAIT, 3); From d9dcbfc880126b193eab418d7b64b3774cb57152 Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:35 +0800 Subject: [PATCH 0847/1085] mmc: mediatek: add stop_clk fix and enhance_rx support mt2712 supports stop_clk fix and enhance_rx, which can improve host stability. 
Signed-off-by: Chaotian Jing Tested-by: Sean Wang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 47 +++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 650b94498f7c..d519fa5dbf52 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -67,6 +67,7 @@ #define SDC_RESP2 0x48 #define SDC_RESP3 0x4c #define SDC_BLK_NUM 0x50 +#define SDC_ADV_CFG0 0x64 #define EMMC_IOCON 0x7c #define SDC_ACMD_RESP 0x80 #define MSDC_DMA_SA 0x90 @@ -80,6 +81,7 @@ #define PAD_DS_TUNE 0x188 #define PAD_CMD_TUNE 0x18c #define EMMC50_CFG0 0x208 +#define SDC_FIFO_CFG 0x228 /*--------------------------------------------------------------------------*/ /* Register Mask */ @@ -188,6 +190,9 @@ #define SDC_STS_CMDBUSY (0x1 << 1) /* RW */ #define SDC_STS_SWR_COMPL (0x1 << 31) /* RW */ +/* SDC_ADV_CFG0 mask */ +#define SDC_RX_ENHANCE_EN (0x1 << 20) /* RW */ + /* MSDC_DMA_CTRL mask */ #define MSDC_DMA_CTRL_START (0x1 << 0) /* W */ #define MSDC_DMA_CTRL_STOP (0x1 << 1) /* W */ @@ -217,6 +222,8 @@ #define MSDC_PATCH_BIT_SPCPUSH (0x1 << 29) /* RW */ #define MSDC_PATCH_BIT_DECRCTMO (0x1 << 30) /* RW */ +#define MSDC_PATCH_BIT1_STOP_DLY (0xf << 8) /* RW */ + #define MSDC_PATCH_BIT2_CFGRESP (0x1 << 15) /* RW */ #define MSDC_PATCH_BIT2_CFGCRCSTS (0x1 << 28) /* RW */ #define MSDC_PB2_RESPWAIT (0x3 << 2) /* RW */ @@ -242,6 +249,9 @@ #define EMMC50_CFG_CRCSTS_EDGE (0x1 << 3) /* RW */ #define EMMC50_CFG_CFCSTS_SEL (0x1 << 4) /* RW */ +#define SDC_FIFO_CFG_WRVALIDSEL (0x1 << 24) /* RW */ +#define SDC_FIFO_CFG_RDVALIDSEL (0x1 << 25) /* RW */ + #define REQ_CMD_EIO (0x1 << 0) #define REQ_CMD_TMO (0x1 << 1) #define REQ_DAT_ERR (0x1 << 2) @@ -308,6 +318,7 @@ struct msdc_save_para { u32 pad_ds_tune; u32 pad_cmd_tune; u32 emmc50_cfg0; + u32 sdc_fifo_cfg; }; struct mtk_mmc_compatible { @@ -317,6 +328,8 @@ struct mtk_mmc_compatible { bool async_fifo; bool data_tune; bool busy_check; + bool stop_clk_fix; + bool enhance_rx; }; struct msdc_tune_para { @@ -382,6 +395,8 @@ static const struct mtk_mmc_compatible mt8135_compat = { .async_fifo = false, .data_tune = false, .busy_check = false, + .stop_clk_fix = false, + .enhance_rx = false, }; static const struct mtk_mmc_compatible mt8173_compat = { @@ -391,6 +406,8 @@ static const struct mtk_mmc_compatible mt8173_compat = { .async_fifo = false, .data_tune = false, .busy_check = false, + .stop_clk_fix = false, + .enhance_rx = false, }; static const struct mtk_mmc_compatible mt2701_compat = { @@ -400,6 +417,8 @@ static const struct mtk_mmc_compatible mt2701_compat = { .async_fifo = true, .data_tune = true, .busy_check = false, + .stop_clk_fix = false, + .enhance_rx = false, }; static const struct mtk_mmc_compatible mt2712_compat = { @@ -409,6 +428,8 @@ static const struct mtk_mmc_compatible mt2712_compat = { .async_fifo = true, .data_tune = true, .busy_check = true, + .stop_clk_fix = true, + .enhance_rx = true, }; static const struct of_device_id msdc_of_ids[] = { @@ -1280,15 +1301,31 @@ static void msdc_init_hw(struct msdc_host *host) sdr_set_field(host->base + MSDC_PATCH_BIT, MSDC_CKGEN_MSDC_DLY_SEL, 1); writel(0xffff4089, host->base + MSDC_PATCH_BIT1); sdr_set_bits(host->base + EMMC50_CFG0, EMMC50_CFG_CFCSTS_SEL); + + if (host->dev_comp->stop_clk_fix) { + sdr_set_field(host->base + MSDC_PATCH_BIT1, + MSDC_PATCH_BIT1_STOP_DLY, 3); + sdr_clr_bits(host->base + SDC_FIFO_CFG, + SDC_FIFO_CFG_WRVALIDSEL); + sdr_clr_bits(host->base + SDC_FIFO_CFG, + SDC_FIFO_CFG_RDVALIDSEL); 
+ } + if (host->dev_comp->busy_check) sdr_clr_bits(host->base + MSDC_PATCH_BIT1, (1 << 7)); + if (host->dev_comp->async_fifo) { sdr_set_field(host->base + MSDC_PATCH_BIT2, MSDC_PB2_RESPWAIT, 3); - sdr_set_field(host->base + MSDC_PATCH_BIT2, - MSDC_PB2_RESPSTSENSEL, 2); - sdr_set_field(host->base + MSDC_PATCH_BIT2, - MSDC_PB2_CRCSTSENSEL, 2); + if (host->dev_comp->enhance_rx) { + sdr_set_bits(host->base + SDC_ADV_CFG0, + SDC_RX_ENHANCE_EN); + } else { + sdr_set_field(host->base + MSDC_PATCH_BIT2, + MSDC_PB2_RESPSTSENSEL, 2); + sdr_set_field(host->base + MSDC_PATCH_BIT2, + MSDC_PB2_CRCSTSENSEL, 2); + } /* use async fifo, then no need tune internal delay */ sdr_clr_bits(host->base + MSDC_PATCH_BIT2, MSDC_PATCH_BIT2_CFGRESP); @@ -1933,6 +1970,7 @@ static void msdc_save_reg(struct msdc_host *host) host->save_para.pad_ds_tune = readl(host->base + PAD_DS_TUNE); host->save_para.pad_cmd_tune = readl(host->base + PAD_CMD_TUNE); host->save_para.emmc50_cfg0 = readl(host->base + EMMC50_CFG0); + host->save_para.sdc_fifo_cfg = readl(host->base + SDC_FIFO_CFG); } static void msdc_restore_reg(struct msdc_host *host) @@ -1949,6 +1987,7 @@ static void msdc_restore_reg(struct msdc_host *host) writel(host->save_para.pad_ds_tune, host->base + PAD_DS_TUNE); writel(host->save_para.pad_cmd_tune, host->base + PAD_CMD_TUNE); writel(host->save_para.emmc50_cfg0, host->base + EMMC50_CFG0); + writel(host->save_para.sdc_fifo_cfg, host->base + SDC_FIFO_CFG); } static int msdc_runtime_suspend(struct device *dev) From 3c1a8844369880d077f858f448afc4ef553d962a Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:36 +0800 Subject: [PATCH 0848/1085] mmc: mediatek: add support of source_cg clock source clock need an independent cg to control, when doing clk mode switch, need gate source clock to avoid hw issue(multi-bit sync hw hang) Signed-off-by: Chaotian Jing Tested-by: Sean Wang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index d519fa5dbf52..df57481b0dda 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -372,6 +372,7 @@ struct msdc_host { struct clk *src_clk; /* msdc source clock */ struct clk *h_clk; /* msdc h_clk */ + struct clk *src_clk_cg; /* msdc source clock control gate */ u32 mclk; /* mmc subsystem clock frequency */ u32 src_clk_freq; /* source clock frequency */ u32 sclk; /* SD/MS bus clock frequency */ @@ -616,6 +617,7 @@ static void msdc_set_timeout(struct msdc_host *host, u32 ns, u32 clks) static void msdc_gate_clock(struct msdc_host *host) { + clk_disable_unprepare(host->src_clk_cg); clk_disable_unprepare(host->src_clk); clk_disable_unprepare(host->h_clk); } @@ -624,6 +626,7 @@ static void msdc_ungate_clock(struct msdc_host *host) { clk_prepare_enable(host->h_clk); clk_prepare_enable(host->src_clk); + clk_prepare_enable(host->src_clk_cg); while (!(readl(host->base + MSDC_CFG) & MSDC_CFG_CKSTB)) cpu_relax(); } @@ -692,6 +695,15 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) sclk = (host->src_clk_freq >> 2) / div; } } + sdr_clr_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN); + /* + * As src_clk/HCLK use the same bit to gate/ungate, + * So if want to only gate src_clk, need gate its parent(mux). 
+ */ + if (host->src_clk_cg) + clk_disable_unprepare(host->src_clk_cg); + else + clk_disable_unprepare(clk_get_parent(host->src_clk)); if (host->dev_comp->clk_div_bits == 8) sdr_set_field(host->base + MSDC_CFG, MSDC_CFG_CKMOD | MSDC_CFG_CKDIV, @@ -700,10 +712,14 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) sdr_set_field(host->base + MSDC_CFG, MSDC_CFG_CKMOD_EXTRA | MSDC_CFG_CKDIV_EXTRA, (mode << 12) | div); + if (host->src_clk_cg) + clk_prepare_enable(host->src_clk_cg); + else + clk_prepare_enable(clk_get_parent(host->src_clk)); - sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN); while (!(readl(host->base + MSDC_CFG) & MSDC_CFG_CKSTB)) cpu_relax(); + sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN); host->sclk = sclk; host->mclk = hz; host->timing = timing; @@ -1822,6 +1838,11 @@ static int msdc_drv_probe(struct platform_device *pdev) goto host_free; } + /*source clock control gate is optional clock*/ + host->src_clk_cg = devm_clk_get(&pdev->dev, "source_cg"); + if (IS_ERR(host->src_clk_cg)) + host->src_clk_cg = NULL; + host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { ret = -EINVAL; From d17bb71c2c2cc2be23b1e20778fc87c80f8a7c9d Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:37 +0800 Subject: [PATCH 0849/1085] mmc: mediatek: add latch-ck support some platform(eg.mt2701) does not support "stop clk fix", in this case, need set correct latch-ck to avoid crc error caused by stop clock block-internally. Signed-off-by: Chaotian Jing Tested-by: Sean Wang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index df57481b0dda..d8bd416fd471 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -378,6 +378,7 @@ struct msdc_host { u32 sclk; /* SD/MS bus clock frequency */ unsigned char timing; bool vqmmc_enabled; + u32 latch_ck; u32 hs400_ds_delay; u32 hs200_cmd_int_delay; /* cmd internal delay for HS200/SDR104 */ u32 hs400_cmd_int_delay; /* cmd internal delay for HS400 */ @@ -1661,6 +1662,8 @@ static int msdc_tune_data(struct mmc_host *mmc, u32 opcode) u32 tune_reg = host->dev_comp->pad_tune_reg; int i, ret; + sdr_set_field(host->base + MSDC_PATCH_BIT, MSDC_INT_DAT_LATCH_CK_SEL, + host->latch_ck); sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL); sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL); for (i = 0 ; i < PAD_DELAY_MAX; i++) { @@ -1773,6 +1776,9 @@ static const struct mmc_host_ops mt_msdc_ops = { static void msdc_of_property_parse(struct platform_device *pdev, struct msdc_host *host) { + of_property_read_u32(pdev->dev.of_node, "mediatek,latch-ck", + &host->latch_ck); + of_property_read_u32(pdev->dev.of_node, "hs400-ds-delay", &host->hs400_ds_delay); From c8609b22529755be6f75d9b0aeb2577f4bfa83de Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:38 +0800 Subject: [PATCH 0850/1085] mmc: mediatek: improve eMMC hs400 mode read performance enlarge outstanding value to improve read performance Signed-off-by: Chaotian Jing Tested-by: Sean Wang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index d8bd416fd471..3247a5aed7d8 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -81,6 +81,7 @@ #define PAD_DS_TUNE 0x188 #define PAD_CMD_TUNE 0x18c #define EMMC50_CFG0 0x208 +#define EMMC50_CFG3 0x220 #define 
SDC_FIFO_CFG 0x228 /*--------------------------------------------------------------------------*/ @@ -249,6 +250,8 @@ #define EMMC50_CFG_CRCSTS_EDGE (0x1 << 3) /* RW */ #define EMMC50_CFG_CFCSTS_SEL (0x1 << 4) /* RW */ +#define EMMC50_CFG3_OUTS_WR (0x1f << 0) /* RW */ + #define SDC_FIFO_CFG_WRVALIDSEL (0x1 << 24) /* RW */ #define SDC_FIFO_CFG_RDVALIDSEL (0x1 << 25) /* RW */ @@ -318,6 +321,7 @@ struct msdc_save_para { u32 pad_ds_tune; u32 pad_cmd_tune; u32 emmc50_cfg0; + u32 emmc50_cfg3; u32 sdc_fifo_cfg; }; @@ -1747,6 +1751,9 @@ static int msdc_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_ios *ios) writel(host->hs400_ds_delay, host->base + PAD_DS_TUNE); /* hs400 mode must set it to 0 */ sdr_clr_bits(host->base + MSDC_PATCH_BIT2, MSDC_PATCH_BIT2_CFGCRCSTS); + /* to improve read performance, set outstanding to 2 */ + sdr_set_field(host->base + EMMC50_CFG3, EMMC50_CFG3_OUTS_WR, 2); + return 0; } @@ -1997,6 +2004,7 @@ static void msdc_save_reg(struct msdc_host *host) host->save_para.pad_ds_tune = readl(host->base + PAD_DS_TUNE); host->save_para.pad_cmd_tune = readl(host->base + PAD_CMD_TUNE); host->save_para.emmc50_cfg0 = readl(host->base + EMMC50_CFG0); + host->save_para.emmc50_cfg3 = readl(host->base + EMMC50_CFG3); host->save_para.sdc_fifo_cfg = readl(host->base + SDC_FIFO_CFG); } @@ -2014,6 +2022,7 @@ static void msdc_restore_reg(struct msdc_host *host) writel(host->save_para.pad_ds_tune, host->base + PAD_DS_TUNE); writel(host->save_para.pad_cmd_tune, host->base + PAD_CMD_TUNE); writel(host->save_para.emmc50_cfg0, host->base + EMMC50_CFG0); + writel(host->save_para.emmc50_cfg3, host->base + EMMC50_CFG3); writel(host->save_para.sdc_fifo_cfg, host->base + SDC_FIFO_CFG); } From 6b10c9abfbceaf94ef227d98317020c557996186 Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Mon, 16 Oct 2017 09:46:39 +0800 Subject: [PATCH 0851/1085] mmc: mediatek: perfer to use rise edge latching for cmd line data lines have applied to perfer to use rise edge, also need apply it to cmd line. Signed-off-by: Chaotian Jing Tested-by: Sean Wang Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 3247a5aed7d8..6457a7d8880f 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -1550,7 +1550,8 @@ static int msdc_tune_response(struct mmc_host *mmc, u32 opcode) } final_rise_delay = get_best_delay(host, rise_delay); /* if rising edge has enough margin, then do not scan falling edge */ - if (final_rise_delay.maxlen >= 12 && final_rise_delay.start < 4) + if (final_rise_delay.maxlen >= 12 || + (final_rise_delay.start == 0 && final_rise_delay.maxlen >= 4)) goto skip_fall; sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL); From f07b7952df3b3ff014e83952b402425cf50c51ce Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 19 Oct 2017 13:41:45 +0300 Subject: [PATCH 0852/1085] mmc: sdhci-acpi: Let devices define their own private data Let devices define their own private data to facilitate device-specific operations. The size of the private structure is specified in the sdhci_acpi_slot structure, then sdhci_acpi_probe() will allocate extra space for it, and sdhci_acpi_priv() can be used to get a reference to it. 
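For a driver that opts in, the flow might look like the following minimal sketch (the "my_vendor" names are hypothetical; only priv_size, probe_slot and sdhci_acpi_priv() come from the diff below):

    struct my_vendor_host {                 /* hypothetical private data */
            u32 quirk_mask;
    };

    static int my_vendor_probe_slot(struct platform_device *pdev,
                                    const char *hid, const char *uid)
    {
            struct sdhci_acpi_host *c = platform_get_drvdata(pdev);
            struct my_vendor_host *priv = sdhci_acpi_priv(c);

            /* Space was already allocated by sdhci_acpi_probe() */
            priv->quirk_mask = 0;
            return 0;
    }

    static const struct sdhci_acpi_slot my_vendor_slot = {
            .probe_slot = my_vendor_probe_slot,
            .priv_size  = sizeof(struct my_vendor_host),
    };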
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-acpi.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 640be5b618fc..5bb5880403b2 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -73,6 +73,7 @@ struct sdhci_acpi_slot { unsigned int caps2; mmc_pm_flag_t pm_caps; unsigned int flags; + size_t priv_size; int (*probe_slot)(struct platform_device *, const char *, const char *); int (*remove_slot)(struct platform_device *); }; @@ -82,8 +83,14 @@ struct sdhci_acpi_host { const struct sdhci_acpi_slot *slot; struct platform_device *pdev; bool use_runtime_pm; + unsigned long private[0] ____cacheline_aligned; }; +static inline void *sdhci_acpi_priv(struct sdhci_acpi_host *c) +{ + return (void *)c->private; +} + static inline bool sdhci_acpi_flag(struct sdhci_acpi_host *c, unsigned int flag) { return c->slot && (c->slot->flags & flag); @@ -393,11 +400,13 @@ static const struct sdhci_acpi_slot *sdhci_acpi_get_slot(const char *hid, static int sdhci_acpi_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; + const struct sdhci_acpi_slot *slot; struct acpi_device *device, *child; struct sdhci_acpi_host *c; struct sdhci_host *host; struct resource *iomem; resource_size_t len; + size_t priv_size; const char *hid; const char *uid; int err; @@ -409,6 +418,8 @@ static int sdhci_acpi_probe(struct platform_device *pdev) hid = acpi_device_hid(device); uid = acpi_device_uid(device); + slot = sdhci_acpi_get_slot(hid, uid); + /* Power on the SDHCI controller and its children */ acpi_device_fix_up_power(device); if (!sdhci_acpi_no_fixup_child_power(hid, uid)) { @@ -431,13 +442,14 @@ static int sdhci_acpi_probe(struct platform_device *pdev) if (!devm_request_mem_region(dev, iomem->start, len, dev_name(dev))) return -ENOMEM; - host = sdhci_alloc_host(dev, sizeof(struct sdhci_acpi_host)); + priv_size = slot ? slot->priv_size : 0; + host = sdhci_alloc_host(dev, sizeof(struct sdhci_acpi_host) + priv_size); if (IS_ERR(host)) return PTR_ERR(host); c = sdhci_priv(host); c->host = host; - c->slot = sdhci_acpi_get_slot(hid, uid); + c->slot = slot; c->pdev = pdev; c->use_runtime_pm = sdhci_acpi_flag(c, SDHCI_ACPI_RUNTIME_PM); From 1c451c139ee4cca63b293eb30eb4e38ab7ca05af Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 19 Oct 2017 13:41:46 +0300 Subject: [PATCH 0853/1085] mmc: sdhci-acpi: Fix voltage switch for some Intel host controllers Some Intel host controllers use an ACPI device-specific method to ensure correct voltage switching. Fix voltage switch for those, by adding a call to the DSM. 
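As a worked example of the availability check in the code below: function 0 of the DSM returns a bitmask of implemented functions, so deciding whether a voltage-switch function may be called reduces to a bit test (the mask value here is illustrative only):

    u32 dsm_fns = 0x19;     /* illustrative: fns 0, 3 and 4 implemented */

    /* intel_dsm() refuses fn > 31 or any fn whose bit is clear */
    bool can_v18 = dsm_fns & (1 << INTEL_DSM_V18_SWITCH);   /* bit 3: yes */
    bool can_v33 = dsm_fns & (1 << INTEL_DSM_V33_SWITCH);   /* bit 4: yes */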
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-acpi.c | 108 ++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 5bb5880403b2..b988997a1e80 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -96,6 +96,105 @@ static inline bool sdhci_acpi_flag(struct sdhci_acpi_host *c, unsigned int flag) return c->slot && (c->slot->flags & flag); } +enum { + INTEL_DSM_FNS = 0, + INTEL_DSM_V18_SWITCH = 3, + INTEL_DSM_V33_SWITCH = 4, +}; + +struct intel_host { + u32 dsm_fns; +}; + +static const guid_t intel_dsm_guid = + GUID_INIT(0xF6C13EA5, 0x65CD, 0x461F, + 0xAB, 0x7A, 0x29, 0xF7, 0xE8, 0xD5, 0xBD, 0x61); + +static int __intel_dsm(struct intel_host *intel_host, struct device *dev, + unsigned int fn, u32 *result) +{ + union acpi_object *obj; + int err = 0; + + obj = acpi_evaluate_dsm(ACPI_HANDLE(dev), &intel_dsm_guid, 0, fn, NULL); + if (!obj) + return -EOPNOTSUPP; + + if (obj->type == ACPI_TYPE_INTEGER) { + *result = obj->integer.value; + } else if (obj->type == ACPI_TYPE_BUFFER && obj->buffer.length > 0) { + size_t len = min_t(size_t, obj->buffer.length, 4); + + *result = 0; + memcpy(result, obj->buffer.pointer, len); + } else { + dev_err(dev, "%s DSM fn %u obj->type %d obj->buffer.length %d\n", + __func__, fn, obj->type, obj->buffer.length); + err = -EINVAL; + } + + ACPI_FREE(obj); + + return err; +} + +static int intel_dsm(struct intel_host *intel_host, struct device *dev, + unsigned int fn, u32 *result) +{ + if (fn > 31 || !(intel_host->dsm_fns & (1 << fn))) + return -EOPNOTSUPP; + + return __intel_dsm(intel_host, dev, fn, result); +} + +static void intel_dsm_init(struct intel_host *intel_host, struct device *dev, + struct mmc_host *mmc) +{ + int err; + + err = __intel_dsm(intel_host, dev, INTEL_DSM_FNS, &intel_host->dsm_fns); + if (err) { + pr_debug("%s: DSM not supported, error %d\n", + mmc_hostname(mmc), err); + return; + } + + pr_debug("%s: DSM function mask %#x\n", + mmc_hostname(mmc), intel_host->dsm_fns); +} + +static int intel_start_signal_voltage_switch(struct mmc_host *mmc, + struct mmc_ios *ios) +{ + struct device *dev = mmc_dev(mmc); + struct sdhci_acpi_host *c = dev_get_drvdata(dev); + struct intel_host *intel_host = sdhci_acpi_priv(c); + unsigned int fn; + u32 result = 0; + int err; + + err = sdhci_start_signal_voltage_switch(mmc, ios); + if (err) + return err; + + switch (ios->signal_voltage) { + case MMC_SIGNAL_VOLTAGE_330: + fn = INTEL_DSM_V33_SWITCH; + break; + case MMC_SIGNAL_VOLTAGE_180: + fn = INTEL_DSM_V18_SWITCH; + break; + default: + return 0; + } + + err = intel_dsm(intel_host, dev, fn, &result); + pr_debug("%s: %s DSM fn %u error %d result %u\n", + mmc_hostname(mmc), __func__, fn, err, result); + + return 0; +} + static void sdhci_acpi_int_hw_reset(struct sdhci_host *host) { u8 reg; @@ -280,6 +379,7 @@ static int intel_probe_slot(struct platform_device *pdev, const char *hid, const char *uid) { struct sdhci_acpi_host *c = platform_get_drvdata(pdev); + struct intel_host *intel_host = sdhci_acpi_priv(c); struct sdhci_host *host = c->host; if (hid && uid && !strcmp(hid, "80860F14") && !strcmp(uid, "1") && @@ -290,6 +390,11 @@ static int intel_probe_slot(struct platform_device *pdev, const char *hid, if (hid && !strcmp(hid, "80865ACA")) host->mmc_host_ops.get_cd = bxt_get_cd; + intel_dsm_init(intel_host, &pdev->dev, host->mmc); + + host->mmc_host_ops.start_signal_voltage_switch = + intel_start_signal_voltage_switch; + 
return 0; } @@ -304,6 +409,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_emmc = { SDHCI_QUIRK2_STOP_WITH_TC | SDHCI_QUIRK2_CAPS_BIT63_FOR_HS400, .probe_slot = intel_probe_slot, + .priv_size = sizeof(struct intel_host), }; static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = { @@ -315,6 +421,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = { .flags = SDHCI_ACPI_RUNTIME_PM, .pm_caps = MMC_PM_KEEP_POWER, .probe_slot = intel_probe_slot, + .priv_size = sizeof(struct intel_host), }; static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sd = { @@ -325,6 +432,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sd = { SDHCI_QUIRK2_STOP_WITH_TC, .caps = MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_AGGRESSIVE_PM, .probe_slot = intel_probe_slot, + .priv_size = sizeof(struct intel_host), }; static const struct sdhci_acpi_slot sdhci_acpi_slot_qcom_sd_3v = { From 2ee4f6200597bda9713e28a9c1e65a392615b4b5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 08:03:45 -0700 Subject: [PATCH 0854/1085] mmc: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Ludovic Desroches Cc: Ulf Hansson Cc: Jaehoon Chung Cc: Carlo Caione Cc: Kevin Hilman Cc: Nicolas Pitre Cc: Jarkko Lavinen Cc: Adrian Hunter Cc: Alex Dubov Cc: Bruce Chang Cc: Harald Welte Cc: Tony Olech Cc: Pierre Ossman Cc: Linus Walleij Cc: Paul Cercueil Cc: Heiner Kallweit Cc: Shawn Lin Cc: Arvind Yadav Cc: Allen Cc: linux-mmc@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-amlogic@lists.infradead.org Cc: linux-omap@vger.kernel.org Cc: linux-usb@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Ulf Hansson --- drivers/mmc/host/atmel-mci.c | 13 ++++++------- drivers/mmc/host/jz4740_mmc.c | 7 +++---- drivers/mmc/host/meson-mx-sdio.c | 7 +++---- drivers/mmc/host/mvsdio.c | 6 +++--- drivers/mmc/host/mxcmmc.c | 7 +++---- drivers/mmc/host/omap.c | 20 +++++++++----------- drivers/mmc/host/sdhci.c | 13 ++++++------- drivers/mmc/host/tifm_sd.c | 6 +++--- drivers/mmc/host/via-sdmmc.c | 6 +++--- drivers/mmc/host/vub300.c | 17 +++++++++-------- drivers/mmc/host/wbsd.c | 7 +++---- 11 files changed, 51 insertions(+), 58 deletions(-) diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 0a0ebf3a096d..e55f3932d580 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -732,11 +732,11 @@ static inline unsigned int atmci_convert_chksize(struct atmel_mci *host, return 0; } -static void atmci_timeout_timer(unsigned long data) +static void atmci_timeout_timer(struct timer_list *t) { struct atmel_mci *host; - host = (struct atmel_mci *)data; + host = from_timer(host, t, timer); dev_dbg(&host->pdev->dev, "software timeout\n"); @@ -1661,9 +1661,9 @@ static void atmci_command_complete(struct atmel_mci *host, cmd->error = 0; } -static void atmci_detect_change(unsigned long data) +static void atmci_detect_change(struct timer_list *t) { - struct atmel_mci_slot *slot = (struct atmel_mci_slot *)data; + struct atmel_mci_slot *slot = from_timer(slot, t, detect_timer); bool present; bool present_old; @@ -2349,8 +2349,7 @@ static int atmci_init_slot(struct atmel_mci *host, if (gpio_is_valid(slot->detect_pin)) { int ret; - setup_timer(&slot->detect_timer, atmci_detect_change, - (unsigned long)slot); + timer_setup(&slot->detect_timer, atmci_detect_change, 0); ret = 
request_irq(gpio_to_irq(slot->detect_pin), atmci_detect_interrupt, @@ -2563,7 +2562,7 @@ static int atmci_probe(struct platform_device *pdev) platform_set_drvdata(pdev, host); - setup_timer(&host->timer, atmci_timeout_timer, (unsigned long)host); + timer_setup(&host->timer, atmci_timeout_timer, 0); pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index 7db8c7a8d38d..712e08d9a45e 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -586,9 +586,9 @@ poll_timeout: return true; } -static void jz4740_mmc_timeout(unsigned long data) +static void jz4740_mmc_timeout(struct timer_list *t) { - struct jz4740_mmc_host *host = (struct jz4740_mmc_host *)data; + struct jz4740_mmc_host *host = from_timer(host, t, timeout_timer); if (!test_and_clear_bit(0, &host->waiting)) return; @@ -1036,8 +1036,7 @@ static int jz4740_mmc_probe(struct platform_device* pdev) jz4740_mmc_reset(host); jz4740_mmc_clock_disable(host); - setup_timer(&host->timeout_timer, jz4740_mmc_timeout, - (unsigned long)host); + timer_setup(&host->timeout_timer, jz4740_mmc_timeout, 0); host->use_dma = true; if (host->use_dma && jz4740_mmc_acquire_dma_channels(host) != 0) diff --git a/drivers/mmc/host/meson-mx-sdio.c b/drivers/mmc/host/meson-mx-sdio.c index ce1d9216b5a7..09cb89645d06 100644 --- a/drivers/mmc/host/meson-mx-sdio.c +++ b/drivers/mmc/host/meson-mx-sdio.c @@ -474,9 +474,9 @@ static irqreturn_t meson_mx_mmc_irq_thread(int irq, void *irq_data) return IRQ_HANDLED; } -static void meson_mx_mmc_timeout(unsigned long arg) +static void meson_mx_mmc_timeout(struct timer_list *t) { - struct meson_mx_mmc_host *host = (void *) arg; + struct meson_mx_mmc_host *host = from_timer(host, t, cmd_timeout); unsigned long irqflags; u32 irqc; @@ -652,8 +652,7 @@ static int meson_mx_mmc_probe(struct platform_device *pdev) host->controller_dev = &pdev->dev; spin_lock_init(&host->irq_lock); - setup_timer(&host->cmd_timeout, meson_mx_mmc_timeout, - (unsigned long)host); + timer_setup(&host->cmd_timeout, meson_mx_mmc_timeout, 0); platform_set_drvdata(pdev, host); diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 58d74b8d6c79..210247b3d11a 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -508,9 +508,9 @@ static irqreturn_t mvsd_irq(int irq, void *dev) return IRQ_NONE; } -static void mvsd_timeout_timer(unsigned long data) +static void mvsd_timeout_timer(struct timer_list *t) { - struct mvsd_host *host = (struct mvsd_host *)data; + struct mvsd_host *host = from_timer(host, t, timer); void __iomem *iobase = host->base; struct mmc_request *mrq; unsigned long flags; @@ -776,7 +776,7 @@ static int mvsd_probe(struct platform_device *pdev) goto out; } - setup_timer(&host->timer, mvsd_timeout_timer, (unsigned long)host); + timer_setup(&host->timer, mvsd_timeout_timer, 0); platform_set_drvdata(pdev, mmc); ret = mmc_add_host(mmc); if (ret) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index 8bd9f060be2a..5ff8ef7223cc 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -963,10 +963,9 @@ static bool filter(struct dma_chan *chan, void *param) return true; } -static void mxcmci_watchdog(unsigned long data) +static void mxcmci_watchdog(struct timer_list *t) { - struct mmc_host *mmc = (struct mmc_host *)data; - struct mxcmci_host *host = mmc_priv(mmc); + struct mxcmci_host *host = from_timer(host, t, watchdog); struct mmc_request *req = host->req; unsigned int 
stat = mxcmci_readl(host, MMC_REG_STATUS); @@ -1165,7 +1164,7 @@ static int mxcmci_probe(struct platform_device *pdev) goto out_free_dma; } - setup_timer(&host->watchdog, &mxcmci_watchdog, (unsigned long)mmc); + timer_setup(&host->watchdog, mxcmci_watchdog, 0); mmc_add_host(mmc); diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index bd49f34d7654..adf32682f27a 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -625,9 +625,9 @@ static void mmc_omap_abort_command(struct work_struct *work) } static void -mmc_omap_cmd_timer(unsigned long data) +mmc_omap_cmd_timer(struct timer_list *t) { - struct mmc_omap_host *host = (struct mmc_omap_host *) data; + struct mmc_omap_host *host = from_timer(host, t, cmd_abort_timer); unsigned long flags; spin_lock_irqsave(&host->slot_lock, flags); @@ -654,9 +654,9 @@ mmc_omap_sg_to_buf(struct mmc_omap_host *host) } static void -mmc_omap_clk_timer(unsigned long data) +mmc_omap_clk_timer(struct timer_list *t) { - struct mmc_omap_host *host = (struct mmc_omap_host *) data; + struct mmc_omap_host *host = from_timer(host, t, clk_timer); mmc_omap_fclk_enable(host, 0); } @@ -874,9 +874,9 @@ void omap_mmc_notify_cover_event(struct device *dev, int num, int is_closed) tasklet_hi_schedule(&slot->cover_tasklet); } -static void mmc_omap_cover_timer(unsigned long arg) +static void mmc_omap_cover_timer(struct timer_list *t) { - struct mmc_omap_slot *slot = (struct mmc_omap_slot *) arg; + struct mmc_omap_slot *slot = from_timer(slot, t, cover_timer); tasklet_schedule(&slot->cover_tasklet); } @@ -1264,8 +1264,7 @@ static int mmc_omap_new_slot(struct mmc_omap_host *host, int id) mmc->max_seg_size = mmc->max_req_size; if (slot->pdata->get_cover_state != NULL) { - setup_timer(&slot->cover_timer, mmc_omap_cover_timer, - (unsigned long)slot); + timer_setup(&slot->cover_timer, mmc_omap_cover_timer, 0); tasklet_init(&slot->cover_tasklet, mmc_omap_cover_handler, (unsigned long)slot); } @@ -1352,11 +1351,10 @@ static int mmc_omap_probe(struct platform_device *pdev) INIT_WORK(&host->send_stop_work, mmc_omap_send_stop_work); INIT_WORK(&host->cmd_abort_work, mmc_omap_abort_command); - setup_timer(&host->cmd_abort_timer, mmc_omap_cmd_timer, - (unsigned long) host); + timer_setup(&host->cmd_abort_timer, mmc_omap_cmd_timer, 0); spin_lock_init(&host->clk_lock); - setup_timer(&host->clk_timer, mmc_omap_clk_timer, (unsigned long) host); + timer_setup(&host->clk_timer, mmc_omap_clk_timer, 0); spin_lock_init(&host->dma_lock); spin_lock_init(&host->slot_lock); diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 1052b52045b1..2f14334e42df 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2407,12 +2407,12 @@ static void sdhci_tasklet_finish(unsigned long param) ; } -static void sdhci_timeout_timer(unsigned long data) +static void sdhci_timeout_timer(struct timer_list *t) { struct sdhci_host *host; unsigned long flags; - host = (struct sdhci_host*)data; + host = from_timer(host, t, timer); spin_lock_irqsave(&host->lock, flags); @@ -2429,12 +2429,12 @@ static void sdhci_timeout_timer(unsigned long data) spin_unlock_irqrestore(&host->lock, flags); } -static void sdhci_timeout_data_timer(unsigned long data) +static void sdhci_timeout_data_timer(struct timer_list *t) { struct sdhci_host *host; unsigned long flags; - host = (struct sdhci_host *)data; + host = from_timer(host, t, data_timer); spin_lock_irqsave(&host->lock, flags); @@ -3749,9 +3749,8 @@ int __sdhci_add_host(struct sdhci_host *host) tasklet_init(&host->finish_tasklet, 
sdhci_tasklet_finish, (unsigned long)host); - setup_timer(&host->timer, sdhci_timeout_timer, (unsigned long)host); - setup_timer(&host->data_timer, sdhci_timeout_data_timer, - (unsigned long)host); + timer_setup(&host->timer, sdhci_timeout_timer, 0); + timer_setup(&host->data_timer, sdhci_timeout_data_timer, 0); init_waitqueue_head(&host->buf_ready_int); diff --git a/drivers/mmc/host/tifm_sd.c b/drivers/mmc/host/tifm_sd.c index 93c4b40df90a..a3d8380ab480 100644 --- a/drivers/mmc/host/tifm_sd.c +++ b/drivers/mmc/host/tifm_sd.c @@ -783,9 +783,9 @@ static void tifm_sd_end_cmd(unsigned long data) mmc_request_done(mmc, mrq); } -static void tifm_sd_abort(unsigned long data) +static void tifm_sd_abort(struct timer_list *t) { - struct tifm_sd *host = (struct tifm_sd*)data; + struct tifm_sd *host = from_timer(host, t, timer); pr_err("%s : card failed to respond for a long period of time " "(%x, %x)\n", @@ -968,7 +968,7 @@ static int tifm_sd_probe(struct tifm_dev *sock) tasklet_init(&host->finish_tasklet, tifm_sd_end_cmd, (unsigned long)host); - setup_timer(&host->timer, tifm_sd_abort, (unsigned long)host); + timer_setup(&host->timer, tifm_sd_abort, 0); mmc->ops = &tifm_sd_ops; mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; diff --git a/drivers/mmc/host/via-sdmmc.c b/drivers/mmc/host/via-sdmmc.c index a8c97b6e59dc..32c4211506fc 100644 --- a/drivers/mmc/host/via-sdmmc.c +++ b/drivers/mmc/host/via-sdmmc.c @@ -932,12 +932,12 @@ out: return result; } -static void via_sdc_timeout(unsigned long ulongdata) +static void via_sdc_timeout(struct timer_list *t) { struct via_crdr_mmc_host *sdhost; unsigned long flags; - sdhost = (struct via_crdr_mmc_host *)ulongdata; + sdhost = from_timer(sdhost, t, timer); spin_lock_irqsave(&sdhost->lock, flags); @@ -1036,7 +1036,7 @@ static void via_init_mmc_host(struct via_crdr_mmc_host *host) u32 lenreg; u32 status; - setup_timer(&host->timer, via_sdc_timeout, (unsigned long)host); + timer_setup(&host->timer, via_sdc_timeout, 0); spin_lock_init(&host->lock); diff --git a/drivers/mmc/host/vub300.c b/drivers/mmc/host/vub300.c index c1a169843f99..e6091528aca3 100644 --- a/drivers/mmc/host/vub300.c +++ b/drivers/mmc/host/vub300.c @@ -741,9 +741,10 @@ static void vub300_deadwork_thread(struct work_struct *work) kref_put(&vub300->kref, vub300_delete); } -static void vub300_inactivity_timer_expired(unsigned long data) +static void vub300_inactivity_timer_expired(struct timer_list *t) { /* softirq */ - struct vub300_mmc_host *vub300 = (struct vub300_mmc_host *)data; + struct vub300_mmc_host *vub300 = from_timer(vub300, t, + inactivity_timer); if (!vub300->interface) { kref_put(&vub300->kref, vub300_delete); } else if (vub300->cmd) { @@ -1180,9 +1181,10 @@ static void send_command(struct vub300_mmc_host *vub300) * timer callback runs in atomic mode * so it cannot call usb_kill_urb() */ -static void vub300_sg_timed_out(unsigned long data) +static void vub300_sg_timed_out(struct timer_list *t) { - struct vub300_mmc_host *vub300 = (struct vub300_mmc_host *)data; + struct vub300_mmc_host *vub300 = from_timer(vub300, t, + sg_transfer_timer); vub300->usb_timed_out = 1; usb_sg_cancel(&vub300->sg_request); usb_unlink_urb(vub300->command_out_urb); @@ -2323,11 +2325,10 @@ static int vub300_probe(struct usb_interface *interface, INIT_WORK(&vub300->cmndwork, vub300_cmndwork_thread); INIT_WORK(&vub300->deadwork, vub300_deadwork_thread); kref_init(&vub300->kref); - setup_timer(&vub300->sg_transfer_timer, vub300_sg_timed_out, - (unsigned long)vub300); + timer_setup(&vub300->sg_transfer_timer, 
vub300_sg_timed_out, 0); kref_get(&vub300->kref); - setup_timer(&vub300->inactivity_timer, - vub300_inactivity_timer_expired, (unsigned long)vub300); + timer_setup(&vub300->inactivity_timer, + vub300_inactivity_timer_expired, 0); vub300->inactivity_timer.expires = jiffies + HZ; add_timer(&vub300->inactivity_timer); if (vub300->card_present) diff --git a/drivers/mmc/host/wbsd.c b/drivers/mmc/host/wbsd.c index 499852d8f706..f4233576153b 100644 --- a/drivers/mmc/host/wbsd.c +++ b/drivers/mmc/host/wbsd.c @@ -956,9 +956,9 @@ static const struct mmc_host_ops wbsd_ops = { * Helper function to reset detection ignore */ -static void wbsd_reset_ignore(unsigned long data) +static void wbsd_reset_ignore(struct timer_list *t) { - struct wbsd_host *host = (struct wbsd_host *)data; + struct wbsd_host *host = from_timer(host, t, ignore_timer); BUG_ON(host == NULL); @@ -1224,8 +1224,7 @@ static int wbsd_alloc_mmc(struct device *dev) /* * Set up timers */ - setup_timer(&host->ignore_timer, wbsd_reset_ignore, - (unsigned long)host); + timer_setup(&host->ignore_timer, wbsd_reset_ignore, 0); /* * Maximum number of segments. Worst case is one sector per segment From 7f8e446b032bd6bbcec7c2f068d0a4f2d5929249 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 27 Oct 2017 19:09:17 +0200 Subject: [PATCH 0855/1085] mmc: tmio: Use common error handling code in tmio_mmc_host_probe() * Add a jump target so that a bit of exception handling can be better reused at the end of this function. * Adjust condition checks. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 3a6d49f07e22..4c8198f8b04a 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -1302,23 +1302,24 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host, pm_runtime_enable(&pdev->dev); ret = mmc_add_host(mmc); - if (ret < 0) { - tmio_mmc_host_remove(_host); - return ret; - } + if (ret) + goto remove_host; dev_pm_qos_expose_latency_limit(&pdev->dev, 100); if (pdata->flags & TMIO_MMC_USE_GPIO_CD) { ret = mmc_gpio_request_cd(mmc, pdata->cd_gpio, 0); - if (ret < 0) { - tmio_mmc_host_remove(_host); - return ret; - } + if (ret) + goto remove_host; + mmc_gpiod_request_cd_irq(mmc); } return 0; + +remove_host: + tmio_mmc_host_remove(_host); + return ret; } EXPORT_SYMBOL_GPL(tmio_mmc_host_probe); From 2d1d31dda766b146bbd77db5e4e1534f64c92c2b Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 27 Oct 2017 21:21:40 +0200 Subject: [PATCH 0856/1085] mmc: vub300: Use common code in __download_offload_pseudocode() Add a jump target so that a specific string copy operation is stored only once at the end of this function implementation. Replace two calls of the function "strncpy" by goto statements. This issue was detected by using the Coccinelle software. 
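The shape of the transformation, reduced to a sketch (names here are illustrative; the real function below has two such call sites plus an early-return success path):

    static void download(struct ctx *c)
    {
            if (send_chunk_one(c) < 0)
                    goto copy_error_message;
            if (send_chunk_two(c) < 0)
                    goto copy_error_message;
            return;

    copy_error_message:
            strncpy(c->name, "download failed", sizeof(c->name));
    }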
Signed-off-by: Markus Elfring Signed-off-by: Ulf Hansson --- drivers/mmc/host/vub300.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/mmc/host/vub300.c b/drivers/mmc/host/vub300.c index e6091528aca3..1fe68137a30f 100644 --- a/drivers/mmc/host/vub300.c +++ b/drivers/mmc/host/vub300.c @@ -1246,12 +1246,8 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300, USB_RECIP_DEVICE, 0x0000, 0x0000, xfer_buffer, xfer_length, HZ); kfree(xfer_buffer); - if (retval < 0) { - strncpy(vub300->vub_name, - "SDIO pseudocode download failed", - sizeof(vub300->vub_name)); - return; - } + if (retval < 0) + goto copy_error_message; } else { dev_err(&vub300->udev->dev, "not enough memory for xfer buffer to send" @@ -1293,12 +1289,8 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300, USB_RECIP_DEVICE, 0x0000, 0x0000, xfer_buffer, xfer_length, HZ); kfree(xfer_buffer); - if (retval < 0) { - strncpy(vub300->vub_name, - "SDIO pseudocode download failed", - sizeof(vub300->vub_name)); - return; - } + if (retval < 0) + goto copy_error_message; } else { dev_err(&vub300->udev->dev, "not enough memory for xfer buffer to send" @@ -1351,6 +1343,12 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300, sizeof(vub300->vub_name)); return; } + + return; + +copy_error_message: + strncpy(vub300->vub_name, "SDIO pseudocode download failed", + sizeof(vub300->vub_name)); } /* From 93c23ae385299f889606b42507b12b40e50d6088 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 12 Oct 2017 13:11:18 -0700 Subject: [PATCH 0857/1085] mmc: dw_mmc: Cleanup the DTO timer like the CTO one The recent CTO timer introduced in commit 03de19212ea3 ("mmc: dw_mmc: introduce timer for broken command transfer over scheme") was causing observable problems due to race conditions. Previous patches have fixed those race conditions. It can be observed that these same race conditions ought to be theoretically possible with the DTO timer too though they are massively less likely to happen because the data timeout is always set to 0xffffff right now. That means even at a 200 MHz card clock we were arming the DTO timer for 94 ms: >>> (0xffffff * 1000. / 200000000) + 10 93.886075 We always also were setting the DTO timer _after_ starting the transfer, unlike how the old code was seting the CTO timer. In any case, even though the DTO timer is much less likely to have races, it still makes sense to add code to handle it _just in case_. 
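To make the 94 ms figure above concrete in kernel terms, a sketch of the computation (64-bit math is used here to avoid the u32 overflow these constants would otherwise cause):

    u64 drto_clks = 0xffffff;       /* maximum data timeout, in card clocks */
    u32 bus_hz    = 200000000;      /* 200 MHz card clock */
    u32 drto_ms   = DIV_ROUND_UP_ULL(drto_clks * MSEC_PER_SEC, bus_hz);

    drto_ms += 10;                  /* spare time: ~84 + 10 = ~94 ms */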
Signed-off-by: Douglas Anderson Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 55 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index c365742d6b7f..37b55b095daf 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1938,6 +1938,7 @@ static void dw_mci_set_drto(struct dw_mci *host) unsigned int drto_clks; unsigned int drto_div; unsigned int drto_ms; + unsigned long irqflags; drto_clks = mci_readl(host, TMOUT) >> 8; drto_div = (mci_readl(host, CLKDIV) & 0xff) * 2; @@ -1949,7 +1950,11 @@ static void dw_mci_set_drto(struct dw_mci *host) /* add a bit spare time */ drto_ms += 10; - mod_timer(&host->dto_timer, jiffies + msecs_to_jiffies(drto_ms)); + spin_lock_irqsave(&host->irq_lock, irqflags); + if (!test_bit(EVENT_DATA_COMPLETE, &host->pending_events)) + mod_timer(&host->dto_timer, + jiffies + msecs_to_jiffies(drto_ms)); + spin_unlock_irqrestore(&host->irq_lock, irqflags); } static bool dw_mci_clear_pending_cmd_complete(struct dw_mci *host) @@ -1970,6 +1975,18 @@ static bool dw_mci_clear_pending_cmd_complete(struct dw_mci *host) return true; } +static bool dw_mci_clear_pending_data_complete(struct dw_mci *host) +{ + if (!test_bit(EVENT_DATA_COMPLETE, &host->pending_events)) + return false; + + /* Extra paranoia just like dw_mci_clear_pending_cmd_complete() */ + WARN_ON(del_timer_sync(&host->dto_timer)); + clear_bit(EVENT_DATA_COMPLETE, &host->pending_events); + + return true; +} + static void dw_mci_tasklet_func(unsigned long priv) { struct dw_mci *host = (struct dw_mci *)priv; @@ -2111,8 +2128,7 @@ static void dw_mci_tasklet_func(unsigned long priv) /* fall through */ case STATE_DATA_BUSY: - if (!test_and_clear_bit(EVENT_DATA_COMPLETE, - &host->pending_events)) { + if (!dw_mci_clear_pending_data_complete(host)) { /* * If data error interrupt comes but data over * interrupt doesn't come within the given time. @@ -2682,6 +2698,8 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) } if (pending & SDMMC_INT_DATA_OVER) { + spin_lock_irqsave(&host->irq_lock, irqflags); + del_timer(&host->dto_timer); mci_writel(host, RINTSTS, SDMMC_INT_DATA_OVER); @@ -2694,6 +2712,8 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) } set_bit(EVENT_DATA_COMPLETE, &host->pending_events); tasklet_schedule(&host->tasklet); + + spin_unlock_irqrestore(&host->irq_lock, irqflags); } if (pending & SDMMC_INT_RXDR) { @@ -3043,7 +3063,31 @@ exit: static void dw_mci_dto_timer(unsigned long arg) { struct dw_mci *host = (struct dw_mci *)arg; + unsigned long irqflags; + u32 pending; + spin_lock_irqsave(&host->irq_lock, irqflags); + + /* + * The DTO timer is much longer than the CTO timer, so it's even less + * likely that we'll these cases, but it pays to be paranoid. + */ + pending = mci_readl(host, MINTSTS); /* read-only mask reg */ + if (pending & SDMMC_INT_DATA_OVER) { + /* The interrupt should fire; no need to act but we can warn */ + dev_warn(host->dev, "Unexpected data interrupt latency\n"); + goto exit; + } + if (test_bit(EVENT_DATA_COMPLETE, &host->pending_events)) { + /* Presumably interrupt handler couldn't delete the timer */ + dev_warn(host->dev, "DTO timeout when already completed\n"); + goto exit; + } + + /* + * Continued paranoia to make sure we're in the state we expect. + * This paranoia isn't really justified but it seems good to be safe. 
+ */ switch (host->state) { case STATE_SENDING_DATA: case STATE_DATA_BUSY: @@ -3058,8 +3102,13 @@ static void dw_mci_dto_timer(unsigned long arg) tasklet_schedule(&host->tasklet); break; default: + dev_warn(host->dev, "Unexpected data timeout, state %d\n", + host->state); break; } + +exit: + spin_unlock_irqrestore(&host->irq_lock, irqflags); } #ifdef CONFIG_OF From 379777297cb130b8b23f82b34ea889c31e828a8c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 30 Oct 2017 14:45:00 -0700 Subject: [PATCH 0858/1085] mmc: dw_mmc: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Jaehoon Chung Cc: Ulf Hansson Cc: linux-mmc@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 37b55b095daf..0aa39975f33b 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -2991,9 +2991,9 @@ no_dma: host->use_dma = TRANS_MODE_PIO; } -static void dw_mci_cmd11_timer(unsigned long arg) +static void dw_mci_cmd11_timer(struct timer_list *t) { - struct dw_mci *host = (struct dw_mci *)arg; + struct dw_mci *host = from_timer(host, t, cmd11_timer); if (host->state != STATE_SENDING_CMD11) { dev_warn(host->dev, "Unexpected CMD11 timeout\n"); @@ -3005,9 +3005,9 @@ static void dw_mci_cmd11_timer(unsigned long arg) tasklet_schedule(&host->tasklet); } -static void dw_mci_cto_timer(unsigned long arg) +static void dw_mci_cto_timer(struct timer_list *t) { - struct dw_mci *host = (struct dw_mci *)arg; + struct dw_mci *host = from_timer(host, t, cto_timer); unsigned long irqflags; u32 pending; @@ -3060,9 +3060,9 @@ exit: spin_unlock_irqrestore(&host->irq_lock, irqflags); } -static void dw_mci_dto_timer(unsigned long arg) +static void dw_mci_dto_timer(struct timer_list *t) { - struct dw_mci *host = (struct dw_mci *)arg; + struct dw_mci *host = from_timer(host, t, dto_timer); unsigned long irqflags; u32 pending; @@ -3257,14 +3257,9 @@ int dw_mci_probe(struct dw_mci *host) } } - setup_timer(&host->cmd11_timer, - dw_mci_cmd11_timer, (unsigned long)host); - - setup_timer(&host->cto_timer, - dw_mci_cto_timer, (unsigned long)host); - - setup_timer(&host->dto_timer, - dw_mci_dto_timer, (unsigned long)host); + timer_setup(&host->cmd11_timer, dw_mci_cmd11_timer, 0); + timer_setup(&host->cto_timer, dw_mci_cto_timer, 0); + timer_setup(&host->dto_timer, dw_mci_dto_timer, 0); spin_lock_init(&host->lock); spin_lock_init(&host->irq_lock); From d4d7b4ad2f05c03fb25252aea66f9f3cd7cfbe06 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 26 Oct 2017 10:44:07 +0100 Subject: [PATCH 0859/1085] irqchip/gic-v3-its: Setup VLPI properties at map time So far, we require the hypervisor to update the VLPI properties once the the VLPI mapping has been established. While this makes it easy for the ITS driver, it creates a window where an incoming interrupt can be delivered with an unknown set of properties. Not very nice. Instead, let's add a "properties" field to the mapping structure, and use that to configure the VLPI before it actually gets mapped. 
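From the hypervisor side, the ordering this enables might look like the following sketch (values illustrative; its_map_vlpi() is the existing GICv4 helper that reaches its_vlpi_map() via irq_set_vcpu_affinity()):

    struct its_vlpi_map map = {
            .vm         = its_vm,
            .vpe        = target_vpe,
            .vintid     = vintid,
            /* priority 0xa0, enable bit set; GROUP1 is OR'd in by the
             * ITS driver when it writes the prop table at map time */
            .properties = 0xa1,
            .db_enabled = true,
    };

    /* No property-update window: the VLPI is configured before VMAPTI */
    ret = its_map_vlpi(host_irq, &map);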
Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 18 ++++++++++++++++-- include/linux/irqchip/arm-gic-v4.h | 2 ++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 6a74f0497f82..29b2ff5c6841 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1008,9 +1008,15 @@ static void lpi_write_config(struct irq_data *d, u8 clr, u8 set) if (irqd_is_forwarded_to_vcpu(d)) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); + struct its_vlpi_map *map; prop_page = its_dev->event_map.vm->vprop_page; - hwirq = its_dev->event_map.vlpi_maps[event].vintid; + map = &its_dev->event_map.vlpi_maps[event]; + hwirq = map->vintid; + + /* Remember the updated property */ + map->properties &= ~clr; + map->properties |= set | LPI_PROP_GROUP1; } else { prop_page = gic_rdists->prop_page; hwirq = d->hwirq; @@ -1249,12 +1255,20 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) /* Ensure all the VPEs are mapped on this ITS */ its_map_vm(its_dev->its, info->map->vm); + /* + * Flag the interrupt as forwarded so that we can + * start poking the virtual property table. + */ + irqd_set_forwarded_to_vcpu(d); + + /* Write out the property to the prop table */ + lpi_write_config(d, 0xff, info->map->properties); + /* Drop the physical mapping */ its_send_discard(its_dev, event); /* and install the virtual one */ its_send_vmapti(its_dev, event); - irqd_set_forwarded_to_vcpu(d); /* Increment the number of VLPIs */ its_dev->event_map.nr_vlpis++; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 43cde15f221b..447da8ca2156 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -71,12 +71,14 @@ struct its_vpe { * @vm: Pointer to the GICv4 notion of a VM * @vpe: Pointer to the GICv4 notion of a virtual CPU (VPE) * @vintid: Virtual LPI number + * @properties: Priority and enable bits (as written in the prop table) * @db_enabled: Is the VPE doorbell to be generated? */ struct its_vlpi_map { struct its_vm *vm; struct its_vpe *vpe; u32 vintid; + u8 properties; bool db_enabled; }; From 0962289b1cd91534f7111e763d3e6a17dcd47ecb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 27 Oct 2017 10:34:22 +0200 Subject: [PATCH 0860/1085] irqchip/gic: Deal with broken firmware exposing only 4kB of GICv2 CPU interface There is a lot of broken firmware out there that don't really expose the information the kernel requires when it comes with dealing with GICv2: (1) Firmware that only describes the first 4kB of GICv2 (2) Firmware that describe 128kB of CPU interface, while the usable portion of the address space is between 60 and 68kB So far, we only deal with (2). But we have platforms exhibiting behaviour (1), resulting in two sub-cases: (a) The GIC is occupying 8kB, as required by the GICv2 architecture (b) It is actually spread 128kB, and this is likely to be a version of (2) This patch tries to work around both (a) and (b) by poking at the outside of the described memory region, and try to work out what is actually there. This is of course unsafe, and should only be enabled if there is no way to otherwise fix the DT provided by the firmware (we provide a "irqchip.gicv2_force_probe" option to that effect). 
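The probing relies on the GICC_IIDR identification register; a worked decode of the check used in the diff below (field layout per the GICv2 architecture specification):

    /*
     * Example GICC_IIDR value for an ARM GIC400: 0x0202043B
     *   bits [11:0]  = 0x43B   implementer: ARM
     *   bits [15:12] = 0x0     revision (deliberately ignored)
     *   bits [19:16] = 0x2     architecture version: GICv2
     *   bits [31:20] = 0x020   product ID (only the low nibble is kept)
     *
     * (0x0202043B & 0xff0fff) == 0x02043B, so gic_check_gicv2() accepts
     * any revision of an ARM-implemented GICv2 CPU interface.
     */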
Note that for the time being, we restrict ourselves to GICv2 implementations provided by ARM, since there I have no knowledge of an alternative implementations. This could be relaxed if such an implementation comes to light on a broken platform. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- .../admin-guide/kernel-parameters.txt | 7 ++ drivers/irqchip/irq-gic.c | 71 ++++++++++++++++--- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05496622b4ef..3daa0a590236 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1713,6 +1713,13 @@ irqaffinity= [SMP] Set the default irq affinity mask The argument is a cpu list, as described above. + irqchip.gicv2_force_probe= + [ARM, ARM64] + Format: + Force the kernel to look for the second 4kB page + of a GICv2 controller even if the memory range + exposed by the device tree is too small. + irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 651d726e8b12..f641e8e2c78d 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1256,6 +1256,19 @@ static void gic_teardown(struct gic_chip_data *gic) #ifdef CONFIG_OF static int gic_cnt __initdata; +static bool gicv2_force_probe; + +static int __init gicv2_force_probe_cfg(char *buf) +{ + return strtobool(buf, &gicv2_force_probe); +} +early_param("irqchip.gicv2_force_probe", gicv2_force_probe_cfg); + +static bool gic_check_gicv2(void __iomem *base) +{ + u32 val = readl_relaxed(base + GIC_CPU_IDENT); + return (val & 0xff0fff) == 0x02043B; +} static bool gic_check_eoimode(struct device_node *node, void __iomem **base) { @@ -1265,20 +1278,60 @@ static bool gic_check_eoimode(struct device_node *node, void __iomem **base) if (!is_hyp_mode_available()) return false; - if (resource_size(&cpuif_res) < SZ_8K) - return false; - if (resource_size(&cpuif_res) == SZ_128K) { - u32 val_low, val_high; + if (resource_size(&cpuif_res) < SZ_8K) { + void __iomem *alt; + /* + * Check for a stupid firmware that only exposes the + * first page of a GICv2. + */ + if (!gic_check_gicv2(*base)) + return false; + + if (!gicv2_force_probe) { + pr_warn("GIC: GICv2 detected, but range too small and irqchip.gicv2_force_probe not set\n"); + return false; + } + + alt = ioremap(cpuif_res.start, SZ_8K); + if (!alt) + return false; + if (!gic_check_gicv2(alt + SZ_4K)) { + /* + * The first page was that of a GICv2, and + * the second was *something*. Let's trust it + * to be a GICv2, and update the mapping. + */ + pr_warn("GIC: GICv2 at %pa, but range is too small (broken DT?), assuming 8kB\n", + &cpuif_res.start); + iounmap(*base); + *base = alt; + return true; + } /* - * Verify that we have the first 4kB of a GIC400 + * We detected *two* initial GICv2 pages in a + * row. Could be a GICv2 aliased over two 64kB + * pages. Update the resource, map the iospace, and + * pray. 
+ */ + iounmap(alt); + alt = ioremap(cpuif_res.start, SZ_128K); + if (!alt) + return false; + pr_warn("GIC: Aliased GICv2 at %pa, trying to find the canonical range over 128kB\n", + &cpuif_res.start); + cpuif_res.end = cpuif_res.start + SZ_128K -1; + iounmap(*base); + *base = alt; + } + if (resource_size(&cpuif_res) == SZ_128K) { + /* + * Verify that we have the first 4kB of a GICv2 * aliased over the first 64kB by checking the * GICC_IIDR register on both ends. */ - val_low = readl_relaxed(*base + GIC_CPU_IDENT); - val_high = readl_relaxed(*base + GIC_CPU_IDENT + 0xf000); - if ((val_low & 0xffff0fff) != 0x0202043B || - val_low != val_high) + if (!gic_check_gicv2(*base) || + !gic_check_gicv2(*base + 0xf000)) return false; /* From 4b82130077d93539c9fbb0f5eee21965cea9cfe9 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Mon, 30 Oct 2017 10:15:00 +0800 Subject: [PATCH 0861/1085] irqdomain: Update the comments of fwnode field of irq_domain structure Commit: f110711a6053 ("irqdomain: Convert irqdomain-%3Eof_node to fwnode") converted of_node field to fwnode, but didn't update its comments. Update it. Fixes: f110711a6053 ("irqdomain: Convert irqdomain-%3Eof_node to fwnode") Signed-off-by: Dou Liyang Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index df162f7a4aad..ce48a23d621f 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -138,8 +138,8 @@ struct irq_domain_chip_generic; * @mapcount: The number of mapped interrupts * * Optional elements - * @of_node: Pointer to device tree nodes associated with the irq_domain. Used - * when decoding device tree interrupt specifiers. + * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy + * to swap it for the of_node via the irq_domain_get_of_node accessor * @gc: Pointer to a list of generic chips. There is a helper function for * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. From 4e4cb1b183d6e9df57f4e54c8b1a5231995da820 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Mon, 30 Oct 2017 00:05:21 +0100 Subject: [PATCH 0862/1085] irqchip/meson-gpio: add support for Meson8 SoCs Meson8 uses the same GPIO interrupt controller IP block as the other Meson SoCs. A total of 134 pins can be spied on, which is the sum of: - 22 pins on bank GPIOX - 17 pins on bank GPIOY - 30 pins on bank GPIODV - 10 pins on bank GPIOH - 15 pins on bank GPIOZ - 7 pins on bank CARD - 19 pins on bank BOOT - 14 pins in the AO domain Acked-by: Kevin Hilman Acked-by: Rob Herring Signed-off-by: Martin Blumenstingl Signed-off-by: Marc Zyngier --- .../interrupt-controller/amlogic,meson-gpio-intc.txt | 1 + drivers/irqchip/irq-meson-gpio.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt index 633e21ce4b17..a83f9a5734ca 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt @@ -10,6 +10,7 @@ number of interrupt exposed depends on the SoC. 
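One point from the list above is worth spelling out: the removed range check was dead code because of how the hwirq space is laid out. Roughly (the exact macro values live in asm/mips-gic.h):

    /*
     * local hwirqs  : GIC_LOCAL_HWIRQ_BASE + 0 .. GIC_LOCAL_INT_FDC
     *                 (GIC_LOCAL_INT_FDC being the highest local interrupt)
     * shared hwirqs : GIC_SHARED_HWIRQ_BASE upwards
     *
     * gic_irq_domain_map() takes the shared path for anything at or above
     * GIC_SHARED_HWIRQ_BASE, so by the time GIC_HWIRQ_TO_LOCAL() runs the
     * result is already a valid local interrupt number and the
     * "intr > GIC_LOCAL_INT_FDC" test could never fire.
     */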
Required properties: - compatible : must have "amlogic,meson8-gpio-intc” and either + “amlogic,meson8-gpio-intc” for meson8 SoCs (S802) or “amlogic,meson8b-gpio-intc” for meson8b SoCs (S805) or “amlogic,meson-gxbb-gpio-intc” for GXBB SoCs (S905) or “amlogic,meson-gxl-gpio-intc” for GXL SoCs (S905X, S912) diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index c7cc7e37a23c..a59bdbc0b9bb 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -47,6 +47,10 @@ struct meson_gpio_irq_params { unsigned int nr_hwirq; }; +static const struct meson_gpio_irq_params meson8_params = { + .nr_hwirq = 134, +}; + static const struct meson_gpio_irq_params meson8b_params = { .nr_hwirq = 119, }; @@ -60,6 +64,7 @@ static const struct meson_gpio_irq_params gxl_params = { }; static const struct of_device_id meson_irq_gpio_matches[] = { + { .compatible = "amlogic,meson8-gpio-intc", .data = &meson8_params }, { .compatible = "amlogic,meson8b-gpio-intc", .data = &meson8b_params }, { .compatible = "amlogic,meson-gxbb-gpio-intc", .data = &gxbb_params }, { .compatible = "amlogic,meson-gxl-gpio-intc", .data = &gxl_params }, From 63b746b19fa660737df603f97fd5f435c511d1b5 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:44 -0700 Subject: [PATCH 0863/1085] irqchip: mips-gic: Inline gic_local_irq_domain_map() The gic_local_irq_domain_map() function has only one callsite in gic_irq_domain_map(), and the split between the two functions makes it unclear that they duplicate calculations & checks. Inline gic_local_irq_domain_map() into gic_irq_domain_map() in order to clean this up. Doing this makes the following small issues obvious, and the patch tidies them up: - Both functions used GIC_HWIRQ_TO_LOCAL() to convert a hwirq number to a local IRQ number. We now only do this once. Although the compiler ought to have optimised this away before anyway, the change leaves us with less duplicate code. - gic_local_irq_domain_map() had a check for invalid local interrupt numbers (intr > GIC_LOCAL_INT_FDC). This condition can never occur because any hwirq higher than those used for local interrupts is a shared interrupt, which gic_irq_domain_map() already handles separately. We therefore remove this check. - The decision of whether to map the interrupt to gic_cpu_pin or timer_cpu_pin can be handled within the existing switch statement in gic_irq_domain_map(), shortening the code a little. The change additionally prepares us nicely for the following patch of the series which would otherwise need to duplicate the check for whether a local interrupt should be percpu_devid or just percpu (ie. the switch statement from gic_irq_domain_map()) in gic_local_irq_domain_map(). 
Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 58 +++++++++++++--------------------- 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index c90976d7e53c..6fdcc1552fab 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -382,39 +382,6 @@ static void gic_irq_dispatch(struct irq_desc *desc) gic_handle_shared_int(true); } -static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw) -{ - int intr = GIC_HWIRQ_TO_LOCAL(hw); - int i; - unsigned long flags; - u32 val; - - if (!gic_local_irq_is_routable(intr)) - return -EPERM; - - if (intr > GIC_LOCAL_INT_FDC) { - pr_err("Invalid local IRQ %d\n", intr); - return -EINVAL; - } - - if (intr == GIC_LOCAL_INT_TIMER) { - /* CONFIG_MIPS_CMP workaround (see __gic_init) */ - val = GIC_MAP_PIN_MAP_TO_PIN | timer_cpu_pin; - } else { - val = GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin; - } - - spin_lock_irqsave(&gic_lock, flags); - for (i = 0; i < gic_vpes; i++) { - write_gic_vl_other(mips_cm_vp_id(i)); - write_gic_vo_map(intr, val); - } - spin_unlock_irqrestore(&gic_lock, flags); - - return 0; -} - static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw, unsigned int cpu) { @@ -457,7 +424,10 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hwirq) { - int err; + unsigned long flags; + unsigned int intr; + int err, i; + u32 map; if (hwirq >= GIC_SHARED_HWIRQ_BASE) { /* verify that shared irqs don't conflict with an IPI irq */ @@ -474,8 +444,14 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, return gic_shared_irq_domain_map(d, virq, hwirq, 0); } - switch (GIC_HWIRQ_TO_LOCAL(hwirq)) { + intr = GIC_HWIRQ_TO_LOCAL(hwirq); + map = GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin; + + switch (intr) { case GIC_LOCAL_INT_TIMER: + /* CONFIG_MIPS_CMP workaround (see __gic_init) */ + map = GIC_MAP_PIN_MAP_TO_PIN | timer_cpu_pin; + /* fall-through */ case GIC_LOCAL_INT_PERFCTR: case GIC_LOCAL_INT_FDC: /* @@ -504,7 +480,17 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, break; } - return gic_local_irq_domain_map(d, virq, hwirq); + if (!gic_local_irq_is_routable(intr)) + return -EPERM; + + spin_lock_irqsave(&gic_lock, flags); + for (i = 0; i < gic_vpes; i++) { + write_gic_vl_other(mips_cm_vp_id(i)); + write_gic_vo_map(intr, map); + } + spin_unlock_irqrestore(&gic_lock, flags); + + return 0; } static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq, From da61fcf9d62a05f3508f5646d353a9c2604bac76 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:45 -0700 Subject: [PATCH 0864/1085] irqchip: mips-gic: Use irq_cpu_online to (un)mask all-VP(E) IRQs The gic_all_vpes_local_irq_controller chip currently attempts to operate on all CPUs/VPs in the system when masking or unmasking an interrupt. This has a few drawbacks: - In multi-cluster systems we may not always have access to all CPUs in the system. When all CPUs in a cluster are powered down that cluster's GIC may also power down, in which case we cannot configure its state. - Relatedly, if we power down a cluster after having configured interrupts for CPUs within it then the cluster's GIC may lose state & we need to reconfigure it. 
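A worked scenario makes the replay mechanism in the diff below concrete (CPU numbers are illustrative):

  1. With only CPU0 online, a driver unmasks a GIC local interrupt:
     gic_unmask_local_irq_all_vpes() records cd->mask = true and pokes
     only CPU0's local registers via for_each_online_cpu().
  2. CPU1 comes online later; its GIC local state is unknown, and may
     have been lost entirely if its cluster was powered down.
  3. gic_cpu_startup() runs on CPU1 and calls irq_cpu_online(), which
     invokes gic_all_vpes_irq_cpu_online() for that interrupt: the saved
     cd->map is written to CPU1's map register and, because cd->mask is
     true, the interrupt is unmasked there as well.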
The current approach doesn't take this into account. - It's wasteful if we run Linux on fewer VPs than are present in the system. For example if we run a uniprocessor kernel on CPU0 of a system with 16 CPUs then there's no point in us configuring CPUs 1-15. - The implementation is also lacking in that it expects the range 0..gic_vpes-1 to represent valid Linux CPU numbers which may not always be the case - for example if we run on a system with more VPs than the kernel is configured to support. Fix all of these issues by only configuring the affected interrupts for CPUs which are online at the time, and recording the configuration in a new struct gic_all_vpes_chip_data for later use by CPUs being brought online. We register a new CPU hotplug state, CPUHP_AP_IRQ_MIPS_GIC_STARTING (modelled on CPUHP_AP_IRQ_GIC_STARTING which the ARM GIC driver uses), and execute irq_cpu_online() in order to configure the interrupts on the newly onlined CPU. Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 72 ++++++++++++++++++++++++++-------- include/linux/cpuhotplug.h | 1 + 2 files changed, 57 insertions(+), 16 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 6fdcc1552fab..60f644279803 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include @@ -55,6 +56,11 @@ static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller; DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS); DECLARE_BITMAP(ipi_available, GIC_MAX_INTRS); +static struct gic_all_vpes_chip_data { + u32 map; + bool mask; +} gic_all_vpes_chip_data[GIC_NUM_LOCAL_INTRS]; + static void gic_clear_pcpu_masks(unsigned int intr) { unsigned int i; @@ -338,13 +344,17 @@ static struct irq_chip gic_local_irq_controller = { static void gic_mask_local_irq_all_vpes(struct irq_data *d) { - int intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); - int i; + struct gic_all_vpes_chip_data *cd; unsigned long flags; + int intr, cpu; + + intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); + cd = irq_data_get_irq_chip_data(d); + cd->mask = false; spin_lock_irqsave(&gic_lock, flags); - for (i = 0; i < gic_vpes; i++) { - write_gic_vl_other(mips_cm_vp_id(i)); + for_each_online_cpu(cpu) { + write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_rmask(BIT(intr)); } spin_unlock_irqrestore(&gic_lock, flags); @@ -352,22 +362,40 @@ static void gic_mask_local_irq_all_vpes(struct irq_data *d) static void gic_unmask_local_irq_all_vpes(struct irq_data *d) { - int intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); - int i; + struct gic_all_vpes_chip_data *cd; unsigned long flags; + int intr, cpu; + + intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); + cd = irq_data_get_irq_chip_data(d); + cd->mask = true; spin_lock_irqsave(&gic_lock, flags); - for (i = 0; i < gic_vpes; i++) { - write_gic_vl_other(mips_cm_vp_id(i)); + for_each_online_cpu(cpu) { + write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_smask(BIT(intr)); } spin_unlock_irqrestore(&gic_lock, flags); } +static void gic_all_vpes_irq_cpu_online(struct irq_data *d) +{ + struct gic_all_vpes_chip_data *cd; + unsigned int intr; + + intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); + cd = irq_data_get_irq_chip_data(d); + + write_gic_vl_map(intr, cd->map); + if (cd->mask) + write_gic_vl_smask(BIT(intr)); +} + static struct irq_chip gic_all_vpes_local_irq_controller = { - .name = "MIPS GIC Local", - .irq_mask =
gic_mask_local_irq_all_vpes, - .irq_unmask = gic_unmask_local_irq_all_vpes, + .name = "MIPS GIC Local", + .irq_mask = gic_mask_local_irq_all_vpes, + .irq_unmask = gic_unmask_local_irq_all_vpes, + .irq_cpu_online = gic_all_vpes_irq_cpu_online, }; static void __gic_irq_dispatch(void) @@ -424,9 +452,10 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hwirq) { + struct gic_all_vpes_chip_data *cd; unsigned long flags; unsigned int intr; - int err, i; + int err, cpu; u32 map; if (hwirq >= GIC_SHARED_HWIRQ_BASE) { @@ -459,9 +488,11 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, * the rest of the MIPS kernel code does not use the * percpu IRQ API for them. */ + cd = &gic_all_vpes_chip_data[intr]; + cd->map = map; err = irq_domain_set_hwirq_and_chip(d, virq, hwirq, &gic_all_vpes_local_irq_controller, - NULL); + cd); if (err) return err; @@ -484,8 +515,8 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, return -EPERM; spin_lock_irqsave(&gic_lock, flags); - for (i = 0; i < gic_vpes; i++) { - write_gic_vl_other(mips_cm_vp_id(i)); + for_each_online_cpu(cpu) { + write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_map(intr, map); } spin_unlock_irqrestore(&gic_lock, flags); @@ -622,6 +653,13 @@ static const struct irq_domain_ops gic_ipi_domain_ops = { .match = gic_ipi_domain_match, }; +static int gic_cpu_startup(unsigned int cpu) +{ + /* Invoke irq_cpu_online callbacks to enable desired interrupts */ + irq_cpu_online(); + + return 0; +} static int __init gic_of_init(struct device_node *node, struct device_node *parent) @@ -768,6 +806,8 @@ static int __init gic_of_init(struct device_node *node, } } - return 0; + return cpuhp_setup_state(CPUHP_AP_IRQ_MIPS_GIC_STARTING, + "irqchip/mips/gic:starting", + gic_cpu_startup, NULL); } IRQCHIP_DECLARE(mips_gic, "mti,gic", gic_of_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6d508767e144..1966a45bc453 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -98,6 +98,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_HIP04_STARTING, CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, + CPUHP_AP_IRQ_MIPS_GIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, From 25ac19e1b076e52c41d713138cb21d6dd2339440 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:46 -0700 Subject: [PATCH 0865/1085] irqchip: mips-gic: Mask local interrupts when CPUs come online We currently walk through the range 0..gic_vpes-1, expecting these values all to be valid Linux CPU numbers to provide to mips_cm_vp_id(), and masking all routable local interrupts during boot. This approach has a few drawbacks: - In multi-cluster systems we won't have access to all CPU's GIC local registers when the driver is probed, since clusters (and their GICs) may be powered down at this point & only brought online later. - In multi-cluster systems we may power down clusters at runtime, for example if we offline all CPUs within it via hotplug, and the cluster's GIC may lose state. We therefore need to reinitialise it when powering back up, which this approach does not take into account. - The range 0..gic_vpes-1 may not all be valid Linux CPU numbers, for example if we run a kernel configured to support fewer CPUs than the system it is running on actually has. 
In this case we'll get garbage values from mips_cm_vp_id() as we read past the end of the cpu_data array. Fix this and simplify the code somewhat by writing an all-bits-set value to the VP-local reset mask register when a CPU is brought online, before any local interrupts are configured for it. This removes the need for us to access all CPUs during driver probe, removing all of the problems described above. In the name of simplicity we drop the checks for routability of interrupts and simply clear the mask bits for all interrupts. Bits for non-routable local interrupts will have no effect so there's no point performing extra work to avoid modifying them. Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 60f644279803..bd732b256f67 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -655,6 +655,9 @@ static const struct irq_domain_ops gic_ipi_domain_ops = { static int gic_cpu_startup(unsigned int cpu) { + /* Clear all local IRQ masks (ie. disable all local interrupts) */ + write_gic_vl_rmask(~0); + /* Invoke irq_cpu_online callbacks to enable desired interrupts */ irq_cpu_online(); @@ -664,7 +667,7 @@ static int gic_cpu_startup(unsigned int cpu) static int __init gic_of_init(struct device_node *node, struct device_node *parent) { - unsigned int cpu_vec, i, j, gicconfig, cpu, v[2]; + unsigned int cpu_vec, i, gicconfig, cpu, v[2]; unsigned long reserved; phys_addr_t gic_base; struct resource res; @@ -797,15 +800,6 @@ static int __init gic_of_init(struct device_node *node, write_gic_rmask(i); } - for (i = 0; i < gic_vpes; i++) { - write_gic_vl_other(mips_cm_vp_id(i)); - for (j = 0; j < GIC_NUM_LOCAL_INTRS; j++) { - if (!gic_local_irq_is_routable(j)) - continue; - write_gic_vo_rmask(BIT(j)); - } - } - return cpuhp_setup_state(CPUHP_AP_IRQ_MIPS_GIC_STARTING, "irqchip/mips/gic:starting", gic_cpu_startup, NULL); From 890f6b55e5a5cc4e1a2efe36026c6fe3fb253b3b Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:47 -0700 Subject: [PATCH 0866/1085] irqchip: mips-gic: Configure EIC when CPUs come online Rather than configuring EIC mode for all CPUs during boot, configure it locally on each when they come online. This will become important with multi-cluster support, since clusters may be powered on & off (for example via hotplug) and would lose the EIC configuration when powered off. Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index bd732b256f67..b1320ccb9f94 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -655,6 +655,10 @@ static const struct irq_domain_ops gic_ipi_domain_ops = { static int gic_cpu_startup(unsigned int cpu) { + /* Enable or disable EIC */ + change_gic_vl_ctl(GIC_VX_CTL_EIC, + cpu_has_veic ? GIC_VX_CTL_EIC : 0); + /* Clear all local IRQ masks (ie. 
disable all local interrupts) */ write_gic_vl_rmask(~0); @@ -667,7 +671,7 @@ static int gic_cpu_startup(unsigned int cpu) static int __init gic_of_init(struct device_node *node, struct device_node *parent) { - unsigned int cpu_vec, i, gicconfig, cpu, v[2]; + unsigned int cpu_vec, i, gicconfig, v[2]; unsigned long reserved; phys_addr_t gic_base; struct resource res; @@ -722,12 +726,6 @@ static int __init gic_of_init(struct device_node *node, gic_vpes = gic_vpes + 1; if (cpu_has_veic) { - /* Set EIC mode for all VPEs */ - for_each_present_cpu(cpu) { - write_gic_vl_other(mips_cm_vp_id(cpu)); - write_gic_vo_ctl(GIC_VX_CTL_EIC); - } - /* Always use vector 1 in EIC mode */ gic_cpu_pin = 0; timer_cpu_pin = gic_cpu_pin; From 25c51dad664d1e69f90541c2558a39fd86a506e6 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:48 -0700 Subject: [PATCH 0867/1085] irqchip: mips-gic: Use num_possible_cpus() to reserve IPIs Reserving a number of IPIs based upon the number of VPs reported by the GIC makes little sense for a few reasons: - The kernel may have been configured with NR_CPUS less than the number of VPs in the cluster, in which case using gic_vpes causes us to reserve more interrupts for IPIs than we will possibly use. - If a kernel is configured without support for multi-threading & runs on a system with multi-threading & multiple VPs per core then we'll similarly reserve more interrupts for IPIs than we will possibly use. - In systems with multiple clusters the GIC can only provide us with the number of VPs in its cluster, not across all clusters. In this case we'll reserve fewer interrupts for IPIs than we need. Fix these issues by using num_possible_cpus() instead, which in all cases is actually indicative of how many IPIs we may need. Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index b1320ccb9f94..4304283bfb1a 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -671,7 +671,7 @@ static int gic_cpu_startup(unsigned int cpu) static int __init gic_of_init(struct device_node *node, struct device_node *parent) { - unsigned int cpu_vec, i, gicconfig, v[2]; + unsigned int cpu_vec, i, gicconfig, v[2], num_ipis; unsigned long reserved; phys_addr_t gic_base; struct resource res; @@ -781,10 +781,12 @@ static int __init gic_of_init(struct device_node *node, !of_property_read_u32_array(node, "mti,reserved-ipi-vectors", v, 2)) { bitmap_set(ipi_resrv, v[0], v[1]); } else { - /* Make the last 2 * gic_vpes available for IPIs */ - bitmap_set(ipi_resrv, - gic_shared_intrs - 2 * gic_vpes, - 2 * gic_vpes); + /* + * Reserve 2 interrupts per possible CPU/VP for use as IPIs, + * meeting the requirements of arch/mips SMP. + */ + num_ipis = 2 * num_possible_cpus(); + bitmap_set(ipi_resrv, gic_shared_intrs - num_ipis, num_ipis); } bitmap_copy(ipi_available, ipi_resrv, GIC_MAX_INTRS); From 82857688ca749cc9a91ff1f4495cc20f834a9f7d Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:49 -0700 Subject: [PATCH 0868/1085] irqchip: mips-gic: Remove gic_vpes variable Following the past few patches nothing uses the gic_vpes variable any longer. Remove the dead code. 
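For context, nothing in the driver now needs a boot-time count of VPs: per-VP setup has moved into the hotplug startup callback built up by the preceding patches, and every cross-VP register access goes through for_each_online_cpu(). Condensed from the diffs above (no new APIs), the pattern is:

    static int gic_cpu_startup(unsigned int cpu)
    {
            /* Enable or disable EIC for this VP */
            change_gic_vl_ctl(GIC_VX_CTL_EIC,
                              cpu_has_veic ? GIC_VX_CTL_EIC : 0);

            /* Mask all local interrupts, then let irq_cpu_online()
             * replay the map/mask state recorded in
             * gic_all_vpes_chip_data for this CPU */
            write_gic_vl_rmask(~0);
            irq_cpu_online();

            return 0;
    }

    /* registered once from gic_of_init(): */
    return cpuhp_setup_state(CPUHP_AP_IRQ_MIPS_GIC_STARTING,
                             "irqchip/mips/gic:starting",
                             gic_cpu_startup, NULL);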
Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 4304283bfb1a..48f0f43cd05d 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -49,7 +49,6 @@ static DEFINE_SPINLOCK(gic_lock); static struct irq_domain *gic_irq_domain; static struct irq_domain *gic_ipi_domain; static int gic_shared_intrs; -static int gic_vpes; static unsigned int gic_cpu_pin; static unsigned int timer_cpu_pin; static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller; @@ -721,10 +720,6 @@ static int __init gic_of_init(struct device_node *node, gic_shared_intrs >>= __ffs(GIC_CONFIG_NUMINTERRUPTS); gic_shared_intrs = (gic_shared_intrs + 1) * 8; - gic_vpes = gicconfig & GIC_CONFIG_PVPS; - gic_vpes >>= __ffs(GIC_CONFIG_PVPS); - gic_vpes = gic_vpes + 1; - if (cpu_has_veic) { /* Always use vector 1 in EIC mode */ gic_cpu_pin = 0; timer_cpu_pin = gic_cpu_pin; From 5af3e93e16b39231f04623469eb4ac0e4406c0d1 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:50 -0700 Subject: [PATCH 0869/1085] irqchip: mips-gic: Share register writes in gic_set_type() The gic_set_type() function included writes to the MIPS GIC polarity, trigger & dual-trigger registers in each case of a switch statement determining the IRQ's type. This is all well & good when we only have a single cluster & thus a single GIC whose registers we want to update. It will lead to significant duplication once we have multi-cluster support & multiple GICs to update. Refactor this such that we determine values for the polarity, trigger & dual-trigger registers and then have a single set of register writes following the switch statement. This will allow us to write the same values to each GIC in a multi-cluster system in a later patch, rather than needing to duplicate more register writes in each case.
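To make the intended payoff concrete, a hedged sketch of how the shared write site could grow for multiple clusters; nr_clusters and gic_select_cluster() are illustrative placeholders, not APIs introduced by this patch:

    /* pol/trig/dual are computed once by the switch statement; only
     * the write site needs to grow in a multi-cluster kernel: */
    for (cl = 0; cl < nr_clusters; cl++) {          /* hypothetical */
            gic_select_cluster(cl);                 /* hypothetical */
            change_gic_pol(irq, pol);
            change_gic_trig(irq, trig);
            change_gic_dual(irq, dual);
    }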
Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 46 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 48f0f43cd05d..b2e83461e2a8 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -199,46 +199,46 @@ static void gic_ack_irq(struct irq_data *d) static int gic_set_type(struct irq_data *d, unsigned int type) { - unsigned int irq = GIC_HWIRQ_TO_SHARED(d->hwirq); + unsigned int irq, pol, trig, dual; unsigned long flags; - bool is_edge; + + irq = GIC_HWIRQ_TO_SHARED(d->hwirq); spin_lock_irqsave(&gic_lock, flags); switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_EDGE_FALLING: - change_gic_pol(irq, GIC_POL_FALLING_EDGE); - change_gic_trig(irq, GIC_TRIG_EDGE); - change_gic_dual(irq, GIC_DUAL_SINGLE); - is_edge = true; + pol = GIC_POL_FALLING_EDGE; + trig = GIC_TRIG_EDGE; + dual = GIC_DUAL_SINGLE; break; case IRQ_TYPE_EDGE_RISING: - change_gic_pol(irq, GIC_POL_RISING_EDGE); - change_gic_trig(irq, GIC_TRIG_EDGE); - change_gic_dual(irq, GIC_DUAL_SINGLE); - is_edge = true; + pol = GIC_POL_RISING_EDGE; + trig = GIC_TRIG_EDGE; + dual = GIC_DUAL_SINGLE; break; case IRQ_TYPE_EDGE_BOTH: - /* polarity is irrelevant in this case */ - change_gic_trig(irq, GIC_TRIG_EDGE); - change_gic_dual(irq, GIC_DUAL_DUAL); - is_edge = true; + pol = 0; /* Doesn't matter */ + trig = GIC_TRIG_EDGE; + dual = GIC_DUAL_DUAL; break; case IRQ_TYPE_LEVEL_LOW: - change_gic_pol(irq, GIC_POL_ACTIVE_LOW); - change_gic_trig(irq, GIC_TRIG_LEVEL); - change_gic_dual(irq, GIC_DUAL_SINGLE); - is_edge = false; + pol = GIC_POL_ACTIVE_LOW; + trig = GIC_TRIG_LEVEL; + dual = GIC_DUAL_SINGLE; break; case IRQ_TYPE_LEVEL_HIGH: default: - change_gic_pol(irq, GIC_POL_ACTIVE_HIGH); - change_gic_trig(irq, GIC_TRIG_LEVEL); - change_gic_dual(irq, GIC_DUAL_SINGLE); - is_edge = false; + pol = GIC_POL_ACTIVE_HIGH; + trig = GIC_TRIG_LEVEL; + dual = GIC_DUAL_SINGLE; break; } - if (is_edge) + change_gic_pol(irq, pol); + change_gic_trig(irq, trig); + change_gic_dual(irq, dual); + + if (trig == GIC_TRIG_EDGE) irq_set_chip_handler_name_locked(d, &gic_edge_irq_controller, handle_edge_irq, NULL); else From 61dc367e5d767e1c56147f6e497d13cc2771abb1 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:51 -0700 Subject: [PATCH 0870/1085] irqchip: mips-gic: Make IPI bitmaps static We have 2 bitmaps used to keep track of interrupts dedicated to IPIs in the MIPS GIC irqchip driver. These bitmaps are only used from the one compilation unit of that driver, and so can be made static. Do so in order to avoid polluting the symbol table & global namespace. 
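The effect of the change is easiest to see from the macro's definition; at the time of this series include/linux/types.h had:

    #define DECLARE_BITMAP(name, bits) \
            unsigned long name[BITS_TO_LONGS(bits)]

so each bitmap is a plain array of unsigned long, and without static both arrays were exported as global symbols. After this patch the declarations compile to roughly:

    static unsigned long ipi_resrv[BITS_TO_LONGS(GIC_MAX_INTRS)];
    static unsigned long ipi_available[BITS_TO_LONGS(GIC_MAX_INTRS)];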
Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index b2e83461e2a8..3ccebb020f40 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -52,8 +52,8 @@ static int gic_shared_intrs; static unsigned int gic_cpu_pin; static unsigned int timer_cpu_pin; static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller; -DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS); -DECLARE_BITMAP(ipi_available, GIC_MAX_INTRS); +static DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS); +static DECLARE_BITMAP(ipi_available, GIC_MAX_INTRS); static struct gic_all_vpes_chip_data { u32 map; From ab953b9db3a1169fbc675c8de3d2dab919ce3211 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 1 Nov 2017 22:06:19 -0700 Subject: [PATCH 0871/1085] regulator: qcom_spmi: Include offset when translating voltages This driver converts voltages from a non-linear range in hardware to a linear range in software and vice versa. During the conversion, we exclude certain voltages that are invalid to use because the software interface is more flexible than reality. For example, the FTSMPS2P5 regulators have a voltage range from 80000uV to 1355000uV that software could support, but we only want to use the range of 350000uV to 1355000uV. If we don't account for the hw selectors between 80000uV and 350000uV we'll pick a hw selector of 0 to mean 350000uV when it really means 80000uV. This can cause us to program voltages into the hardware that are significantly lower than what we're expecting. And when we read it back from the hardware we'll have the same problem, voltages that are in the invalid band will end up being calculated as some software selector that represents a larger voltage than what is programmed and the user will be confused. Fix all this by properly offsetting the software selector and hw selector when converting from one number space to another. 
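A worked example of that offsetting, using the FTSMPS2P5 range quoted above (the 80000uV and 350000uV bounds come from the commit message; the 5000uV step is an assumption for illustration):

    offset = (set_point_min_uV - min_uV) / step_uV
           = (350000 - 80000) / 5000
           = 54

so software selector 0 (350000uV) must be programmed as hardware selector 54; writing 0, as the old code did, selects 80000uV. On readback, any hardware selector below 54 is now rejected as invalid instead of being misreported as 350000uV or more.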
Fixes: 1b5b19689278 ("regulator: qcom_spmi: Only use selector based regulator ops") Signed-off-by: Stephen Boyd Signed-off-by: Mark Brown --- drivers/regulator/qcom_spmi-regulator.c | 37 +++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/regulator/qcom_spmi-regulator.c b/drivers/regulator/qcom_spmi-regulator.c index 16c5f84e06a7..c372b244f3da 100644 --- a/drivers/regulator/qcom_spmi-regulator.c +++ b/drivers/regulator/qcom_spmi-regulator.c @@ -593,13 +593,20 @@ static int spmi_sw_selector_to_hw(struct spmi_regulator *vreg, u8 *voltage_sel) { const struct spmi_voltage_range *range, *end; + unsigned offset; range = vreg->set_points->range; end = range + vreg->set_points->count; for (; range < end; range++) { if (selector < range->n_voltages) { - *voltage_sel = selector; + /* + * hardware selectors between set point min and real + * min are invalid so we ignore them + */ + offset = range->set_point_min_uV - range->min_uV; + offset /= range->step_uV; + *voltage_sel = selector + offset; *range_sel = range->range_sel; return 0; } @@ -613,15 +620,35 @@ static int spmi_sw_selector_to_hw(struct spmi_regulator *vreg, static int spmi_hw_selector_to_sw(struct spmi_regulator *vreg, u8 hw_sel, const struct spmi_voltage_range *range) { - int sw_sel = hw_sel; + unsigned sw_sel = 0; + unsigned offset, max_hw_sel; const struct spmi_voltage_range *r = vreg->set_points->range; + const struct spmi_voltage_range *end = r + vreg->set_points->count; - while (r != range) { + for (; r < end; r++) { + if (r == range && range->n_voltages) { + /* + * hardware selectors between set point min and real + * min and between set point max and real max are + * invalid so we return an error if they're + * programmed into the hardware + */ + offset = range->set_point_min_uV - range->min_uV; + offset /= range->step_uV; + if (hw_sel < offset) + return -EINVAL; + + max_hw_sel = range->set_point_max_uV - range->min_uV; + max_hw_sel /= range->step_uV; + if (hw_sel > max_hw_sel) + return -EINVAL; + + return sw_sel + hw_sel - offset; + } sw_sel += r->n_voltages; - r++; } - return sw_sel; + return -EINVAL; } static const struct spmi_voltage_range * From fd30b717b86dc30ffe25596f8de6542a02ae9401 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 17:58:54 -0700 Subject: [PATCH 0872/1085] rcu: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Kees Cook --- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/tree_plugin.h | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45f2ffbc1e78..96a3cdaeed91 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1076,7 +1076,7 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) * counter in the element should never be greater than 1, otherwise, the * RCU implementation is broken. 
*/ -static void rcu_torture_timer(unsigned long unused) +static void rcu_torture_timer(struct timer_list *unused) { int idx; unsigned long started; @@ -1163,7 +1163,7 @@ rcu_torture_reader(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); + timer_setup_on_stack(&t, rcu_torture_timer, 0); do { if (irqreader && cur_ops->irq_capable) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..e85946d9843b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2261,9 +2261,11 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ -static void do_nocb_deferred_wakeup_timer(unsigned long x) +static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { - do_nocb_deferred_wakeup_common((struct rcu_data *)x); + struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + + do_nocb_deferred_wakeup_common(rdp); } /* @@ -2327,8 +2329,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_lock_init(&rdp->nocb_lock); - setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, - (unsigned long)rdp); + timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); } /* From 9b5dfbdd1f51558d321ccf15ac1e45ef1f593e83 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 29 Aug 2017 21:00:21 -0700 Subject: [PATCH 0873/1085] fs/ncpfs: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
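Since the same mechanical conversion recurs in almost every patch that follows, it is worth spelling out once in isolation; struct foo is a hypothetical stand-in:

    /* Old API: the callback receives an opaque cookie */
    static void foo_timeout(unsigned long data)
    {
            struct foo *f = (struct foo *)data;
            ...
    }
    setup_timer(&f->timer, foo_timeout, (unsigned long)f);

    /* New API: the callback receives the timer itself and recovers
     * its container via from_timer(); timer flags (TIMER_DEFERRABLE,
     * TIMER_PINNED, ...) move into timer_setup()'s third argument */
    static void foo_timeout(struct timer_list *t)
    {
            struct foo *f = from_timer(f, t, timer);
            ...
    }
    timer_setup(&f->timer, foo_timeout, 0);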
Cc: Petr Vandrovec Cc: Jan Kara Cc: Jens Axboe Cc: Al Viro Signed-off-by: Kees Cook Reviewed-by: Jan Kara --- fs/ncpfs/inode.c | 4 +--- fs/ncpfs/ncp_fs_sb.h | 2 +- fs/ncpfs/sock.c | 6 +++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 6d0f14c86099..129f1937fa2c 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -618,7 +618,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) server->tx.creq = NULL; server->rcv.creq = NULL; - init_timer(&server->timeout_tm); + timer_setup(&server->timeout_tm, ncpdgram_timeout_call, 0); #undef NCP_PACKET_SIZE #define NCP_PACKET_SIZE 131072 error = -ENOMEM; @@ -650,8 +650,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) } else { INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc); INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc); - server->timeout_tm.data = (unsigned long)server; - server->timeout_tm.function = ncpdgram_timeout_call; } release_sock(sock->sk); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 366fd63cc506..2088d94ead93 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -149,7 +149,7 @@ extern void ncp_tcp_rcv_proc(struct work_struct *work); extern void ncp_tcp_tx_proc(struct work_struct *work); extern void ncpdgram_rcv_proc(struct work_struct *work); extern void ncpdgram_timeout_proc(struct work_struct *work); -extern void ncpdgram_timeout_call(unsigned long server); +extern void ncpdgram_timeout_call(struct timer_list *t); extern void ncp_tcp_data_ready(struct sock* sk); extern void ncp_tcp_write_space(struct sock* sk); extern void ncp_tcp_error_report(struct sock* sk); diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 98b6db0ed63e..56c45c78b0b8 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -116,10 +116,10 @@ void ncp_tcp_write_space(struct sock *sk) schedule_work(&server->tx.tq); } -void ncpdgram_timeout_call(unsigned long v) +void ncpdgram_timeout_call(struct timer_list *t) { - struct ncp_server *server = (void*)v; - + struct ncp_server *server = from_timer(server, t, timeout_tm); + schedule_work(&server->timeout_tq); } From d5272003b87d5d6144745543004512cf792e345e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Oct 2017 16:19:27 -0700 Subject: [PATCH 0874/1085] ACPI / APEI: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "Rafael J. 
Wysocki" Cc: Len Brown Cc: Tony Luck Cc: Borislav Petkov Cc: Tyler Baicar Cc: Will Deacon Cc: James Morse Cc: "Jonathan (Zhixiong) Zhang" Cc: Shiju Jose Cc: linux-acpi@vger.kernel.org Signed-off-by: Kees Cook Tested-by: Tyler Baicar --- drivers/acpi/apei/ghes.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 3c3a37b8503b..ebaa51ba8a22 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -774,9 +774,9 @@ static void ghes_add_timer(struct ghes *ghes) add_timer(&ghes->timer); } -static void ghes_poll_func(unsigned long data) +static void ghes_poll_func(struct timer_list *t) { - struct ghes *ghes = (void *)data; + struct ghes *ghes = from_timer(ghes, t, timer); ghes_proc(ghes); if (!(ghes->flags & GHES_EXITING)) @@ -1147,8 +1147,7 @@ static int ghes_probe(struct platform_device *ghes_dev) switch (generic->notify.type) { case ACPI_HEST_NOTIFY_POLLED: - setup_deferrable_timer(&ghes->timer, ghes_poll_func, - (unsigned long)ghes); + timer_setup(&ghes->timer, ghes_poll_func, TIMER_DEFERRABLE); ghes_add_timer(ghes); break; case ACPI_HEST_NOTIFY_EXTERNAL: From 43b7052426848758a7bc012b8289f358bc6995d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Oct 2017 16:21:12 -0700 Subject: [PATCH 0875/1085] drm/etnaviv: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Lucas Stach Cc: Russell King Cc: Christian Gmeiner Cc: David Airlie Cc: etnaviv@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org Signed-off-by: Kees Cook --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index fc9a6a83dfc7..4b152e0d31a6 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -975,9 +975,9 @@ static void hangcheck_timer_reset(struct etnaviv_gpu *gpu) round_jiffies_up(jiffies + DRM_ETNAVIV_HANGCHECK_JIFFIES)); } -static void hangcheck_handler(unsigned long data) +static void hangcheck_handler(struct timer_list *t) { - struct etnaviv_gpu *gpu = (struct etnaviv_gpu *)data; + struct etnaviv_gpu *gpu = from_timer(gpu, t, hangcheck_timer); u32 fence = gpu->completed_fence; bool progress = false; @@ -1648,8 +1648,7 @@ static int etnaviv_gpu_bind(struct device *dev, struct device *master, INIT_WORK(&gpu->recover_work, recover_worker); init_waitqueue_head(&gpu->fence_event); - setup_deferrable_timer(&gpu->hangcheck_timer, hangcheck_handler, - (unsigned long)gpu); + timer_setup(&gpu->hangcheck_timer, hangcheck_handler, TIMER_DEFERRABLE); priv->gpu[priv->num_gpus++] = gpu; From 8da0edf2f90b6c74b69ad420fdd230c9bd2bd1ed Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Oct 2017 16:37:39 -0700 Subject: [PATCH 0876/1085] media: pvrusb2: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
Cc: Mike Isely Cc: Mauro Carvalho Chehab Cc: linux-media@vger.kernel.org Signed-off-by: Kees Cook Acked-By: Mike Isely Acked-by: Hans Verkuil --- drivers/media/usb/pvrusb2/pvrusb2-hdw.c | 64 ++++++++++++++----------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/drivers/media/usb/pvrusb2/pvrusb2-hdw.c b/drivers/media/usb/pvrusb2/pvrusb2-hdw.c index ad5b25b89699..8289ee482f49 100644 --- a/drivers/media/usb/pvrusb2/pvrusb2-hdw.c +++ b/drivers/media/usb/pvrusb2/pvrusb2-hdw.c @@ -330,10 +330,10 @@ static void pvr2_hdw_state_log_state(struct pvr2_hdw *); static int pvr2_hdw_cmd_usbstream(struct pvr2_hdw *hdw,int runFl); static int pvr2_hdw_commit_setup(struct pvr2_hdw *hdw); static int pvr2_hdw_get_eeprom_addr(struct pvr2_hdw *hdw); -static void pvr2_hdw_quiescent_timeout(unsigned long); -static void pvr2_hdw_decoder_stabilization_timeout(unsigned long); -static void pvr2_hdw_encoder_wait_timeout(unsigned long); -static void pvr2_hdw_encoder_run_timeout(unsigned long); +static void pvr2_hdw_quiescent_timeout(struct timer_list *); +static void pvr2_hdw_decoder_stabilization_timeout(struct timer_list *); +static void pvr2_hdw_encoder_wait_timeout(struct timer_list *); +static void pvr2_hdw_encoder_run_timeout(struct timer_list *); static int pvr2_issue_simple_cmd(struct pvr2_hdw *,u32); static int pvr2_send_request_ex(struct pvr2_hdw *hdw, unsigned int timeout,int probe_fl, @@ -2373,18 +2373,15 @@ struct pvr2_hdw *pvr2_hdw_create(struct usb_interface *intf, } if (!hdw) goto fail; - setup_timer(&hdw->quiescent_timer, pvr2_hdw_quiescent_timeout, - (unsigned long)hdw); + timer_setup(&hdw->quiescent_timer, pvr2_hdw_quiescent_timeout, 0); - setup_timer(&hdw->decoder_stabilization_timer, - pvr2_hdw_decoder_stabilization_timeout, - (unsigned long)hdw); + timer_setup(&hdw->decoder_stabilization_timer, + pvr2_hdw_decoder_stabilization_timeout, 0); - setup_timer(&hdw->encoder_wait_timer, pvr2_hdw_encoder_wait_timeout, - (unsigned long)hdw); + timer_setup(&hdw->encoder_wait_timer, pvr2_hdw_encoder_wait_timeout, + 0); - setup_timer(&hdw->encoder_run_timer, pvr2_hdw_encoder_run_timeout, - (unsigned long)hdw); + timer_setup(&hdw->encoder_run_timer, pvr2_hdw_encoder_run_timeout, 0); hdw->master_state = PVR2_STATE_DEAD; @@ -3539,10 +3536,16 @@ static void pvr2_ctl_read_complete(struct urb *urb) complete(&hdw->ctl_done); } +struct hdw_timer { + struct timer_list timer; + struct pvr2_hdw *hdw; +}; -static void pvr2_ctl_timeout(unsigned long data) +static void pvr2_ctl_timeout(struct timer_list *t) { - struct pvr2_hdw *hdw = (struct pvr2_hdw *)data; + struct hdw_timer *timer = from_timer(timer, t, timer); + struct pvr2_hdw *hdw = timer->hdw; + if (hdw->ctl_write_pend_flag || hdw->ctl_read_pend_flag) { hdw->ctl_timeout_flag = !0; if (hdw->ctl_write_pend_flag) @@ -3564,7 +3567,10 @@ static int pvr2_send_request_ex(struct pvr2_hdw *hdw, { unsigned int idx; int status = 0; - struct timer_list timer; + struct hdw_timer timer = { + .hdw = hdw, + }; + if (!hdw->ctl_lock_held) { pvr2_trace(PVR2_TRACE_ERROR_LEGS, "Attempted to execute control transfer without lock!!"); @@ -3621,8 +3627,8 @@ static int pvr2_send_request_ex(struct pvr2_hdw *hdw, hdw->ctl_timeout_flag = 0; hdw->ctl_write_pend_flag = 0; hdw->ctl_read_pend_flag = 0; - setup_timer(&timer, pvr2_ctl_timeout, (unsigned long)hdw); - timer.expires = jiffies + timeout; + timer_setup_on_stack(&timer.timer, pvr2_ctl_timeout, 0); + timer.timer.expires = jiffies + timeout; if (write_len && write_data) { hdw->cmd_debug_state = 2; @@ -3677,7 +3683,7 @@ 
status); } /* Start timer */ - add_timer(&timer); + add_timer(&timer.timer); /* Now wait for all I/O to complete */ hdw->cmd_debug_state = 4; @@ -3687,7 +3693,7 @@ status); hdw->cmd_debug_state = 5; /* Stop timer */ - del_timer_sync(&timer); + del_timer_sync(&timer.timer); hdw->cmd_debug_state = 6; status = 0; @@ -3769,6 +3775,8 @@ status); if ((status < 0) && (!probe_fl)) { pvr2_hdw_render_useless(hdw); } + destroy_timer_on_stack(&timer.timer); + return status; } @@ -4366,9 +4374,9 @@ static int state_eval_encoder_run(struct pvr2_hdw *hdw) /* Timeout function for quiescent timer. */ -static void pvr2_hdw_quiescent_timeout(unsigned long data) +static void pvr2_hdw_quiescent_timeout(struct timer_list *t) { - struct pvr2_hdw *hdw = (struct pvr2_hdw *)data; + struct pvr2_hdw *hdw = from_timer(hdw, t, quiescent_timer); hdw->state_decoder_quiescent = !0; trace_stbit("state_decoder_quiescent",hdw->state_decoder_quiescent); hdw->state_stale = !0; @@ -4377,9 +4385,9 @@ static void pvr2_hdw_quiescent_timeout(unsigned long data) /* Timeout function for decoder stabilization timer. */ -static void pvr2_hdw_decoder_stabilization_timeout(unsigned long data) +static void pvr2_hdw_decoder_stabilization_timeout(struct timer_list *t) { - struct pvr2_hdw *hdw = (struct pvr2_hdw *)data; + struct pvr2_hdw *hdw = from_timer(hdw, t, decoder_stabilization_timer); hdw->state_decoder_ready = !0; trace_stbit("state_decoder_ready", hdw->state_decoder_ready); hdw->state_stale = !0; @@ -4388,9 +4396,9 @@ static void pvr2_hdw_decoder_stabilization_timeout(unsigned long data) /* Timeout function for encoder wait timer. */ -static void pvr2_hdw_encoder_wait_timeout(unsigned long data) +static void pvr2_hdw_encoder_wait_timeout(struct timer_list *t) { - struct pvr2_hdw *hdw = (struct pvr2_hdw *)data; + struct pvr2_hdw *hdw = from_timer(hdw, t, encoder_wait_timer); hdw->state_encoder_waitok = !0; trace_stbit("state_encoder_waitok",hdw->state_encoder_waitok); hdw->state_stale = !0; @@ -4399,9 +4407,9 @@ static void pvr2_hdw_encoder_wait_timeout(unsigned long data) /* Timeout function for encoder run timer. */ -static void pvr2_hdw_encoder_run_timeout(unsigned long data) +static void pvr2_hdw_encoder_run_timeout(struct timer_list *t) { - struct pvr2_hdw *hdw = (struct pvr2_hdw *)data; + struct pvr2_hdw *hdw = from_timer(hdw, t, encoder_run_timer); if (!hdw->state_encoder_runok) { hdw->state_encoder_runok = !0; trace_stbit("state_encoder_runok",hdw->state_encoder_runok); From 4fa42b4e5ddca6a58d8fcf24d51e6fe2f8c50253 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Oct 2017 22:47:37 -0700 Subject: [PATCH 0877/1085] watchdog: cpwd: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Switches to using the global that is used everywhere else. Cc: Wim Van Sebroeck Cc: linux-watchdog@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Guenter Roeck --- drivers/watchdog/cpwd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c index 3d43775548e5..aee0b25cf10d 100644 --- a/drivers/watchdog/cpwd.c +++ b/drivers/watchdog/cpwd.c @@ -230,9 +230,9 @@ static void cpwd_resetbrokentimer(struct cpwd *p, int index) * interrupts within the PLD so we must continually * reset the timers ad infinitum.
*/ -static void cpwd_brokentimer(unsigned long data) +static void cpwd_brokentimer(struct timer_list *unused) { - struct cpwd *p = (struct cpwd *) data; + struct cpwd *p = cpwd_device; int id, tripped = 0; /* kill a running timer instance, in case we @@ -275,7 +275,7 @@ static void cpwd_stoptimer(struct cpwd *p, int index) if (p->broken) { p->devs[index].runstatus |= WD_STAT_BSTOP; - cpwd_brokentimer((unsigned long) p); + cpwd_brokentimer(NULL); } } } @@ -608,7 +608,7 @@ static int cpwd_probe(struct platform_device *op) } if (p->broken) { - setup_timer(&cpwd_timer, cpwd_brokentimer, (unsigned long)p); + timer_setup(&cpwd_timer, cpwd_brokentimer, 0); cpwd_timer.expires = WD_BTIMEOUT; pr_info("PLD defect workaround enabled for model %s\n", From d1cadcb7cade0f98eb4c3b1e4c808cc8169da5e3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Sep 2017 16:14:07 -0700 Subject: [PATCH 0878/1085] watchdog: lpc18xx_wdt: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Joachim Eastwood Cc: linux-watchdog@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Kees Cook Acked-by: Guenter Roeck --- drivers/watchdog/lpc18xx_wdt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/watchdog/lpc18xx_wdt.c b/drivers/watchdog/lpc18xx_wdt.c index 3b8bb59adf02..b4221f43cd94 100644 --- a/drivers/watchdog/lpc18xx_wdt.c +++ b/drivers/watchdog/lpc18xx_wdt.c @@ -78,10 +78,10 @@ static int lpc18xx_wdt_feed(struct watchdog_device *wdt_dev) return 0; } -static void lpc18xx_wdt_timer_feed(unsigned long data) +static void lpc18xx_wdt_timer_feed(struct timer_list *t) { - struct watchdog_device *wdt_dev = (struct watchdog_device *)data; - struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev); + struct lpc18xx_wdt_dev *lpc18xx_wdt = from_timer(lpc18xx_wdt, t, timer); + struct watchdog_device *wdt_dev = &lpc18xx_wdt->wdt_dev; lpc18xx_wdt_feed(wdt_dev); @@ -96,7 +96,9 @@ static void lpc18xx_wdt_timer_feed(unsigned long data) */ static int lpc18xx_wdt_stop(struct watchdog_device *wdt_dev) { - lpc18xx_wdt_timer_feed((unsigned long)wdt_dev); + struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev); + + lpc18xx_wdt_timer_feed(&lpc18xx_wdt->timer); return 0; } @@ -267,8 +269,7 @@ static int lpc18xx_wdt_probe(struct platform_device *pdev) __lpc18xx_wdt_set_timeout(lpc18xx_wdt); - setup_timer(&lpc18xx_wdt->timer, lpc18xx_wdt_timer_feed, - (unsigned long)&lpc18xx_wdt->wdt_dev); + timer_setup(&lpc18xx_wdt->timer, lpc18xx_wdt_timer_feed, 0); watchdog_set_nowayout(&lpc18xx_wdt->wdt_dev, nowayout); watchdog_set_restart_priority(&lpc18xx_wdt->wdt_dev, 128); From 5943cf4a59c831944474803223d5febec58deaad Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Oct 2017 15:47:30 -0700 Subject: [PATCH 0879/1085] powerpc/watchdog: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
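TIMER_PINNED tells the timer core not to migrate this timer to another CPU, which matters here because each watchdog timer must fire on the CPU it monitors; pairing the flag with add_timer_on() keeps both arming and expiry local. A minimal sketch of that pairing, mirroring the diff below (the HZ period is illustrative):

    static DEFINE_PER_CPU(struct timer_list, wd_timer);

    static void start_watchdog_timer_on(unsigned int cpu)
    {
            struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);

            timer_setup(t, wd_timer_fn, TIMER_PINNED);
            t->expires = jiffies + HZ;
            add_timer_on(t, cpu);   /* arm on the target CPU */
    }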
Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Kees Cook --- arch/powerpc/kernel/watchdog.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2f6eadd9408d..224a16438baa 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -261,9 +261,8 @@ static void wd_timer_reset(unsigned int cpu, struct timer_list *t) add_timer_on(t, cpu); } -static void wd_timer_fn(unsigned long data) +static void wd_timer_fn(struct timer_list *t) { - struct timer_list *t = this_cpu_ptr(&wd_timer); int cpu = smp_processor_id(); watchdog_timer_interrupt(cpu); @@ -287,7 +286,7 @@ static void start_watchdog_timer_on(unsigned int cpu) per_cpu(wd_timer_tb, cpu) = get_tb(); - setup_pinned_timer(t, wd_timer_fn, 0); + timer_setup(t, wd_timer_fn, TIMER_PINNED); wd_timer_reset(cpu, t); } From 3142692a5e94a45a50abd78b4ae1c52f75eae41e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 16:37:43 -0700 Subject: [PATCH 0880/1085] x86, calgary: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Muli Ben-Yehuda Cc: Jon Mason Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: iommu@lists.linux-foundation.org Signed-off-by: Kees Cook --- arch/x86/kernel/pci-calgary_64.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5286a4a92cf7..35c461f21815 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -898,10 +898,9 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl) PHB_ROOT_COMPLEX_STATUS); } -static void calgary_watchdog(unsigned long data) +static void calgary_watchdog(struct timer_list *t) { - struct pci_dev *dev = (struct pci_dev *)data; - struct iommu_table *tbl = pci_iommu(dev->bus); + struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer); void __iomem *bbar = tbl->bbar; u32 val32; void __iomem *target; @@ -1016,8 +1015,7 @@ static void __init calgary_enable_translation(struct pci_dev *dev) writel(cpu_to_be32(val32), target); readl(target); /* flush */ - setup_timer(&tbl->watchdog_timer, &calgary_watchdog, - (unsigned long)dev); + timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0); mod_timer(&tbl->watchdog_timer, jiffies); } From d8479a21a98baf1bb50426986d15605cee96ec36 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 16:40:49 -0700 Subject: [PATCH 0881/1085] xtensa: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
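Note that from_timer() is container_of() in disguise; at the time of this series include/linux/timer.h defined it as:

    #define from_timer(var, callback_timer, timer_fieldname) \
            container_of(callback_timer, typeof(*var), timer_fieldname)

so it can only recover a structure that physically embeds the timer. That is why the calgary conversion above hands the callback the iommu_table (which embeds watchdog_timer) rather than the pci_dev that the old integer cookie carried.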
Cc: Chris Zankel Cc: Max Filippov Cc: linux-xtensa@linux-xtensa.org Signed-off-by: Kees Cook --- arch/xtensa/platforms/iss/console.c | 9 ++++----- arch/xtensa/platforms/iss/network.c | 13 +++++-------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c index 0140a22551c8..464c2684c4f1 100644 --- a/arch/xtensa/platforms/iss/console.c +++ b/arch/xtensa/platforms/iss/console.c @@ -47,15 +47,14 @@ static char *serial_name = "ISS serial driver"; * initialization for the tty structure. */ -static void rs_poll(unsigned long); +static void rs_poll(struct timer_list *); static int rs_open(struct tty_struct *tty, struct file * filp) { tty->port = &serial_port; spin_lock_bh(&timer_lock); if (tty->count == 1) { - setup_timer(&serial_timer, rs_poll, - (unsigned long)&serial_port); + timer_setup(&serial_timer, rs_poll, 0); mod_timer(&serial_timer, jiffies + SERIAL_TIMER_VALUE); } spin_unlock_bh(&timer_lock); @@ -92,9 +91,9 @@ static int rs_write(struct tty_struct * tty, return count; } -static void rs_poll(unsigned long priv) +static void rs_poll(struct timer_list *unused) { - struct tty_port *port = (struct tty_port *)priv; + struct tty_port *port = &serial_port; int i = 0; int rd = 1; unsigned char c; diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index 66a5d15a9e0e..6363b18e5b8c 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -349,9 +349,9 @@ static int iss_net_poll(void) } -static void iss_net_timer(unsigned long priv) +static void iss_net_timer(struct timer_list *t) { - struct iss_net_private *lp = (struct iss_net_private *)priv; + struct iss_net_private *lp = from_timer(lp, t, timer); iss_net_poll(); spin_lock(&lp->lock); @@ -386,10 +386,8 @@ static int iss_net_open(struct net_device *dev) spin_unlock_bh(&opened_lock); spin_lock_bh(&lp->lock); - init_timer(&lp->timer); + timer_setup(&lp->timer, iss_net_timer, 0); lp->timer_val = ISS_NET_TIMER_VALUE; - lp->timer.data = (unsigned long) lp; - lp->timer.function = iss_net_timer; mod_timer(&lp->timer, jiffies + lp->timer_val); out: @@ -482,7 +480,7 @@ static int iss_net_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; } -void iss_net_user_timer_expire(unsigned long _conn) +void iss_net_user_timer_expire(struct timer_list *unused) { } @@ -582,8 +580,7 @@ static int iss_net_configure(int index, char *init) return 1; } - init_timer(&lp->tl); - lp->tl.function = iss_net_user_timer_expire; + timer_setup(&lp->tl, iss_net_user_timer_expire, 0); return 0; From 2c513d4f7da7d5616d9e19232376edd4e18fef24 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 14:24:01 -0700 Subject: [PATCH 0882/1085] ia64: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. One less trivial change was removing the repeated casting for callers of bte_error_handler() by fixing its function declaration and adding a small wrapper for the timer callback instead. 
Cc: Tony Luck Cc: Fenghua Yu Cc: Ingo Molnar Cc: Joe Perches Cc: Andrew Morton Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: linux-ia64@vger.kernel.org Signed-off-by: Kees Cook --- arch/ia64/include/asm/sn/bte.h | 4 +++- arch/ia64/kernel/mca.c | 8 ++++---- arch/ia64/kernel/salinfo.c | 5 ++--- arch/ia64/sn/kernel/bte.c | 12 ++++++++---- arch/ia64/sn/kernel/bte_error.c | 17 ++++++----------- arch/ia64/sn/kernel/huberror.c | 2 +- arch/ia64/sn/kernel/mca.c | 5 ++--- 7 files changed, 26 insertions(+), 27 deletions(-) diff --git a/arch/ia64/include/asm/sn/bte.h b/arch/ia64/include/asm/sn/bte.h index cc6c4dbf53af..cd71ab5faf62 100644 --- a/arch/ia64/include/asm/sn/bte.h +++ b/arch/ia64/include/asm/sn/bte.h @@ -17,6 +17,8 @@ #include #include +struct nodepda_s; + #define IBCT_NOTIFY (0x1UL << 4) #define IBCT_ZFIL_MODE (0x1UL << 0) @@ -210,7 +212,7 @@ struct bteinfo_s { */ extern bte_result_t bte_copy(u64, u64, u64, u64, void *); extern bte_result_t bte_unaligned_copy(u64, u64, u64, u64); -extern void bte_error_handler(unsigned long); +extern void bte_error_handler(struct nodepda_s *); #define bte_zero(dest, len, mode, notification) \ bte_copy(0, dest, len, ((mode) | BTE_ZERO_FILL), notification) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 555b11180156..6115464d5f03 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1513,7 +1513,7 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg) * */ static void -ia64_mca_cmc_poll (unsigned long dummy) +ia64_mca_cmc_poll (struct timer_list *unused) { /* Trigger a CMC interrupt cascade */ platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CMCP_VECTOR, @@ -1590,7 +1590,7 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg) * */ static void -ia64_mca_cpe_poll (unsigned long dummy) +ia64_mca_cpe_poll (struct timer_list *unused) { /* Trigger a CPE interrupt cascade */ platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CPEP_VECTOR, @@ -2098,7 +2098,7 @@ ia64_mca_late_init(void) return 0; /* Setup the CMCI/P vector and handler */ - setup_timer(&cmc_poll_timer, ia64_mca_cmc_poll, 0UL); + timer_setup(&cmc_poll_timer, ia64_mca_cmc_poll, 0); /* Unmask/enable the vector */ cmc_polling_enabled = 0; @@ -2109,7 +2109,7 @@ ia64_mca_late_init(void) #ifdef CONFIG_ACPI /* Setup the CPEI/P vector and handler */ cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI); - setup_timer(&cpe_poll_timer, ia64_mca_cpe_poll, 0UL); + timer_setup(&cpe_poll_timer, ia64_mca_cpe_poll, 0); { unsigned int irq; diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index 63dc9cdc95c5..52c404b08904 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -263,7 +263,7 @@ salinfo_timeout_check(struct salinfo_data *data) } static void -salinfo_timeout (unsigned long arg) +salinfo_timeout(struct timer_list *unused) { ia64_mlogbuf_dump(); salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA); @@ -623,9 +623,8 @@ salinfo_init(void) *sdir++ = salinfo_dir; - init_timer(&salinfo_timer); + timer_setup(&salinfo_timer, salinfo_timeout, 0); salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY; - salinfo_timer.function = &salinfo_timeout; add_timer(&salinfo_timer); i = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/salinfo:online", diff --git a/arch/ia64/sn/kernel/bte.c b/arch/ia64/sn/kernel/bte.c index b2eb48490754..9146192b86f5 100644 --- a/arch/ia64/sn/kernel/bte.c +++ b/arch/ia64/sn/kernel/bte.c @@ -219,7 +219,7 @@ retry_bteop: BTE_LNSTAT_LOAD(bte), *bte->most_rcnt_na) ); bte->bte_error_count++; bte->bh_error = IBLS_ERROR; 
- bte_error_handler((unsigned long)NODEPDA(bte->bte_cnode)); + bte_error_handler(NODEPDA(bte->bte_cnode)); *bte->most_rcnt_na = BTE_WORD_AVAILABLE; goto retry_bteop; } @@ -414,6 +414,12 @@ EXPORT_SYMBOL(bte_unaligned_copy); * Block Transfer Engine initialization functions. * ***********************************************************************/ +static void bte_recovery_timeout(struct timer_list *t) +{ + struct nodepda_s *nodepda = from_timer(nodepda, t, bte_recovery_timer); + + bte_error_handler(nodepda); +} /* * bte_init_node(nodepda, cnode) @@ -436,9 +442,7 @@ void bte_init_node(nodepda_t * mynodepda, cnodeid_t cnode) * will point at this one bte_recover structure to get the lock. */ spin_lock_init(&mynodepda->bte_recovery_lock); - init_timer(&mynodepda->bte_recovery_timer); - mynodepda->bte_recovery_timer.function = bte_error_handler; - mynodepda->bte_recovery_timer.data = (unsigned long)mynodepda; + timer_setup(&mynodepda->bte_recovery_timer, bte_recovery_timeout, 0); for (i = 0; i < BTES_PER_NODE; i++) { u64 *base_addr; diff --git a/arch/ia64/sn/kernel/bte_error.c b/arch/ia64/sn/kernel/bte_error.c index 4cb09f3f1efc..d92786c09b34 100644 --- a/arch/ia64/sn/kernel/bte_error.c +++ b/arch/ia64/sn/kernel/bte_error.c @@ -27,15 +27,12 @@ * transfers to be queued. */ -void bte_error_handler(unsigned long); - /* * Wait until all BTE related CRBs are completed * and then reset the interfaces. */ -int shub1_bte_error_handler(unsigned long _nodepda) +static int shub1_bte_error_handler(struct nodepda_s *err_nodepda) { - struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda; struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer; nasid_t nasid; int i; @@ -131,9 +128,8 @@ int shub1_bte_error_handler(unsigned long _nodepda) * Wait until all BTE related CRBs are completed * and then reset the interfaces. */ -int shub2_bte_error_handler(unsigned long _nodepda) +static int shub2_bte_error_handler(struct nodepda_s *err_nodepda) { - struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda; struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer; struct bteinfo_s *bte; nasid_t nasid; @@ -170,9 +166,8 @@ int shub2_bte_error_handler(unsigned long _nodepda) * Wait until all BTE related CRBs are completed * and then reset the interfaces. 
*/ -void bte_error_handler(unsigned long _nodepda) +void bte_error_handler(struct nodepda_s *err_nodepda) { - struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda; spinlock_t *recovery_lock = &err_nodepda->bte_recovery_lock; int i; unsigned long irq_flags; @@ -199,12 +194,12 @@ void bte_error_handler(unsigned long _nodepda) } if (is_shub1()) { - if (shub1_bte_error_handler(_nodepda)) { + if (shub1_bte_error_handler(err_nodepda)) { spin_unlock_irqrestore(recovery_lock, irq_flags); return; } } else { - if (shub2_bte_error_handler(_nodepda)) { + if (shub2_bte_error_handler(err_nodepda)) { spin_unlock_irqrestore(recovery_lock, irq_flags); return; } @@ -255,6 +250,6 @@ bte_crb_error_handler(cnodeid_t cnode, int btenum, BTE_PRINTK(("Got an error on cnode %d bte %d: HW error type 0x%x\n", bte->bte_cnode, bte->bte_num, ioe->ie_errortype)); - bte_error_handler((unsigned long) NODEPDA(cnode)); + bte_error_handler(NODEPDA(cnode)); } diff --git a/arch/ia64/sn/kernel/huberror.c b/arch/ia64/sn/kernel/huberror.c index f925dec2da92..97fa56dddf50 100644 --- a/arch/ia64/sn/kernel/huberror.c +++ b/arch/ia64/sn/kernel/huberror.c @@ -50,7 +50,7 @@ static irqreturn_t hub_eint_handler(int irq, void *arg) if ((int)ret_stuff.v0) panic("%s: Fatal TIO Error", __func__); } else - bte_error_handler((unsigned long)NODEPDA(nasid_to_cnodeid(nasid))); + bte_error_handler(NODEPDA(nasid_to_cnodeid(nasid))); return IRQ_HANDLED; } diff --git a/arch/ia64/sn/kernel/mca.c b/arch/ia64/sn/kernel/mca.c index 5b799d4deb74..bc3bd930c74c 100644 --- a/arch/ia64/sn/kernel/mca.c +++ b/arch/ia64/sn/kernel/mca.c @@ -72,7 +72,7 @@ static void sn_cpei_handler(int irq, void *devid, struct pt_regs *regs) ia64_sn_plat_cpei_handler(); } -static void sn_cpei_timer_handler(unsigned long dummy) +static void sn_cpei_timer_handler(struct timer_list *unused) { sn_cpei_handler(-1, NULL, NULL); mod_timer(&sn_cpei_timer, jiffies + CPEI_INTERVAL); @@ -80,9 +80,8 @@ static void sn_cpei_timer_handler(unsigned long dummy) void sn_init_cpei_timer(void) { - init_timer(&sn_cpei_timer); + timer_setup(&sn_cpei_timer, sn_cpei_timer_handler, 0); sn_cpei_timer.expires = jiffies + CPEI_INTERVAL; - sn_cpei_timer.function = sn_cpei_timer_handler; add_timer(&sn_cpei_timer); } From b7bea32f0cc42b155eac995e60c97cff56390376 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 16:30:38 -0700 Subject: [PATCH 0883/1085] ARM: footbridge: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Kees Cook --- arch/arm/mach-footbridge/dc21285.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/arm/mach-footbridge/dc21285.c b/arch/arm/mach-footbridge/dc21285.c index 96a3d73ef4bf..8407e4a07c77 100644 --- a/arch/arm/mach-footbridge/dc21285.c +++ b/arch/arm/mach-footbridge/dc21285.c @@ -136,19 +136,14 @@ struct pci_ops dc21285_ops = { static struct timer_list serr_timer; static struct timer_list perr_timer; -static void dc21285_enable_error(unsigned long __data) +static void dc21285_enable_error(struct timer_list *timer) { - switch (__data) { - case IRQ_PCI_SERR: - del_timer(&serr_timer); - break; + del_timer(timer); - case IRQ_PCI_PERR: - del_timer(&perr_timer); - break; - } - - enable_irq(__data); + if (timer == &serr_timer) + enable_irq(IRQ_PCI_SERR); + else if (timer == &perr_timer) + enable_irq(IRQ_PCI_PERR); } /* @@ -323,13 +318,8 @@ void __init dc21285_preinit(void) *CSR_PCICMD = (*CSR_PCICMD & 0xffff) | PCICMD_ERROR_BITS; } - init_timer(&serr_timer); - init_timer(&perr_timer); - - serr_timer.data = IRQ_PCI_SERR; - serr_timer.function = dc21285_enable_error; - perr_timer.data = IRQ_PCI_PERR; - perr_timer.function = dc21285_enable_error; + timer_setup(&serr_timer, dc21285_enable_error, 0); + timer_setup(&perr_timer, dc21285_enable_error, 0); /* * We don't care if these fail. From 96d130824f6f965418c2c36061842c354ab60178 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Oct 2017 17:13:01 -0700 Subject: [PATCH 0884/1085] arm: pxa: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Adds a static variable to hold the interrupt private data pointer.
Cc: Daniel Mack Cc: Haojian Zhuang Cc: Robert Jarzmik Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Kees Cook --- arch/arm/mach-pxa/lubbock.c | 15 ++++++--------- arch/arm/mach-pxa/sharpsl_pm.c | 8 ++++---- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c index d6159f8ef0c2..df45682e99a5 100644 --- a/arch/arm/mach-pxa/lubbock.c +++ b/arch/arm/mach-pxa/lubbock.c @@ -381,14 +381,11 @@ static struct pxafb_mach_info sharp_lm8v31 = { #define MMC_POLL_RATE msecs_to_jiffies(1000) -static void lubbock_mmc_poll(unsigned long); static irq_handler_t mmc_detect_int; +static void *mmc_detect_int_data; +static struct timer_list mmc_timer; -static struct timer_list mmc_timer = { - .function = lubbock_mmc_poll, -}; - -static void lubbock_mmc_poll(unsigned long data) +static void lubbock_mmc_poll(struct timer_list *unused) { unsigned long flags; @@ -401,7 +398,7 @@ static void lubbock_mmc_poll(unsigned long data) if (LUB_IRQ_SET_CLR & (1 << 0)) mod_timer(&mmc_timer, jiffies + MMC_POLL_RATE); else { - (void) mmc_detect_int(LUBBOCK_SD_IRQ, (void *)data); + (void) mmc_detect_int(LUBBOCK_SD_IRQ, mmc_detect_int_data); enable_irq(LUBBOCK_SD_IRQ); } } @@ -421,8 +418,8 @@ static int lubbock_mci_init(struct device *dev, { /* detect card insert/eject */ mmc_detect_int = detect_int; - init_timer(&mmc_timer); - mmc_timer.data = (unsigned long) data; + mmc_detect_int_data = data; + timer_setup(&mmc_timer, lubbock_mmc_poll, 0); return request_irq(LUBBOCK_SD_IRQ, lubbock_detect_int, 0, "lubbock-sd-detect", data); } diff --git a/arch/arm/mach-pxa/sharpsl_pm.c b/arch/arm/mach-pxa/sharpsl_pm.c index 249b7bd5fbc4..398ba9ba2632 100644 --- a/arch/arm/mach-pxa/sharpsl_pm.c +++ b/arch/arm/mach-pxa/sharpsl_pm.c @@ -341,7 +341,7 @@ static void sharpsl_charge_toggle(struct work_struct *private_) sharpsl_pm.charge_start_time = jiffies; } -static void sharpsl_ac_timer(unsigned long data) +static void sharpsl_ac_timer(struct timer_list *unused) { int acin = sharpsl_pm.machinfo->read_devdata(SHARPSL_STATUS_ACIN); @@ -366,7 +366,7 @@ static irqreturn_t sharpsl_ac_isr(int irq, void *dev_id) return IRQ_HANDLED; } -static void sharpsl_chrg_full_timer(unsigned long data) +static void sharpsl_chrg_full_timer(struct timer_list *unused) { dev_dbg(sharpsl_pm.dev, "Charge Full at time: %lx\n", jiffies); @@ -841,9 +841,9 @@ static int sharpsl_pm_probe(struct platform_device *pdev) sharpsl_pm.charge_mode = CHRG_OFF; sharpsl_pm.flags = 0; - setup_timer(&sharpsl_pm.ac_timer, sharpsl_ac_timer, 0UL); + timer_setup(&sharpsl_pm.ac_timer, sharpsl_ac_timer, 0); - setup_timer(&sharpsl_pm.chrg_full_timer, sharpsl_chrg_full_timer, 0UL); + timer_setup(&sharpsl_pm.chrg_full_timer, sharpsl_chrg_full_timer, 0); led_trigger_register_simple("sharpsl-charge", &sharpsl_charge_led_trigger); From a66b899dfbb51201e6ee176d77f27885dc362fc6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Oct 2017 21:42:26 -0700 Subject: [PATCH 0885/1085] mips: ip22/32: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Adds a static variable to hold timeout value. 
Cc: Ralf Baechle Cc: Paul Gortmaker Cc: Ingo Molnar Cc: James Hogan Cc: Arnd Bergmann Cc: linux-mips@linux-mips.org Signed-off-by: Kees Cook --- arch/mips/sgi-ip22/ip22-reset.c | 26 ++++++++++++-------------- arch/mips/sgi-ip32/ip32-reset.c | 21 ++++++++++----------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c index 03a39ac5ead9..c374f3ceec38 100644 --- a/arch/mips/sgi-ip22/ip22-reset.c +++ b/arch/mips/sgi-ip22/ip22-reset.c @@ -38,6 +38,7 @@ #define PANIC_FREQ (HZ / 8) static struct timer_list power_timer, blink_timer, debounce_timer; +static unsigned long blink_timer_timeout; #define MACHINE_PANICED 1 #define MACHINE_SHUTTING_DOWN 2 @@ -81,21 +82,21 @@ static void __noreturn sgi_machine_halt(void) ArcEnterInteractiveMode(); } -static void power_timeout(unsigned long data) +static void power_timeout(struct timer_list *unused) { sgi_machine_power_off(); } -static void blink_timeout(unsigned long data) +static void blink_timeout(struct timer_list *unused) { /* XXX fix this for fullhouse */ sgi_ioc_reset ^= (SGIOC_RESET_LC0OFF|SGIOC_RESET_LC1OFF); sgioc->reset = sgi_ioc_reset; - mod_timer(&blink_timer, jiffies + data); + mod_timer(&blink_timer, jiffies + blink_timer_timeout); } -static void debounce(unsigned long data) +static void debounce(struct timer_list *unused) { del_timer(&debounce_timer); if (sgint->istat1 & SGINT_ISTAT1_PWR) { @@ -128,11 +129,10 @@ static inline void power_button(void) } machine_state |= MACHINE_SHUTTING_DOWN; - blink_timer.data = POWERDOWN_FREQ; - blink_timeout(POWERDOWN_FREQ); + blink_timer_timeout = POWERDOWN_FREQ; + blink_timeout(&blink_timer); - init_timer(&power_timer); - power_timer.function = power_timeout; + timer_setup(&power_timer, power_timeout, 0); power_timer.expires = jiffies + POWERDOWN_TIMEOUT * HZ; add_timer(&power_timer); } @@ -147,8 +147,7 @@ static irqreturn_t panel_int(int irq, void *dev_id) if (sgint->istat1 & SGINT_ISTAT1_PWR) { /* Wait until interrupt goes away */ disable_irq_nosync(SGI_PANEL_IRQ); - init_timer(&debounce_timer); - debounce_timer.function = debounce; + timer_setup(&debounce_timer, debounce, 0); debounce_timer.expires = jiffies + 5; add_timer(&debounce_timer); } @@ -171,8 +170,8 @@ static int panic_event(struct notifier_block *this, unsigned long event, return NOTIFY_DONE; machine_state |= MACHINE_PANICED; - blink_timer.data = PANIC_FREQ; - blink_timeout(PANIC_FREQ); + blink_timer_timeout = PANIC_FREQ; + blink_timeout(&blink_timer); return NOTIFY_DONE; } @@ -195,8 +194,7 @@ static int __init reboot_setup(void) return res; } - init_timer(&blink_timer); - blink_timer.function = blink_timeout; + timer_setup(&blink_timer, blink_timeout, 0); atomic_notifier_chain_register(&panic_notifier_list, &panic_block); return 0; diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c index b3b442def423..20d8637340be 100644 --- a/arch/mips/sgi-ip32/ip32-reset.c +++ b/arch/mips/sgi-ip32/ip32-reset.c @@ -38,6 +38,7 @@ extern struct platform_device ip32_rtc_device; static struct timer_list power_timer, blink_timer; +static unsigned long blink_timer_timeout; static int has_panicked, shutting_down; static __noreturn void ip32_poweroff(void *data) @@ -71,11 +72,11 @@ static void ip32_machine_restart(char *cmd) unreachable(); } -static void blink_timeout(unsigned long data) +static void blink_timeout(struct timer_list *unused) { unsigned long led = mace->perif.ctrl.misc ^ MACEISA_LED_RED; mace->perif.ctrl.misc = led; - mod_timer(&blink_timer, 
jiffies + data); + mod_timer(&blink_timer, jiffies + blink_timer_timeout); } static void ip32_machine_halt(void) @@ -83,7 +84,7 @@ static void ip32_machine_halt(void) ip32_poweroff(&ip32_rtc_device); } -static void power_timeout(unsigned long data) +static void power_timeout(struct timer_list *unused) { ip32_poweroff(&ip32_rtc_device); } @@ -99,11 +100,10 @@ void ip32_prepare_poweroff(void) } shutting_down = 1; - blink_timer.data = POWERDOWN_FREQ; - blink_timeout(POWERDOWN_FREQ); + blink_timer_timeout = POWERDOWN_FREQ; + blink_timeout(&blink_timer); - init_timer(&power_timer); - power_timer.function = power_timeout; + timer_setup(&power_timer, power_timeout, 0); power_timer.expires = jiffies + POWERDOWN_TIMEOUT * HZ; add_timer(&power_timer); } @@ -121,8 +121,8 @@ static int panic_event(struct notifier_block *this, unsigned long event, led = mace->perif.ctrl.misc | MACEISA_LED_GREEN; mace->perif.ctrl.misc = led; - blink_timer.data = PANIC_FREQ; - blink_timeout(PANIC_FREQ); + blink_timer_timeout = PANIC_FREQ; + blink_timeout(&blink_timer); return NOTIFY_DONE; } @@ -143,8 +143,7 @@ static __init int ip32_reboot_setup(void) _machine_halt = ip32_machine_halt; pm_power_off = ip32_machine_halt; - init_timer(&blink_timer); - blink_timer.function = blink_timeout; + timer_setup(&blink_timer, blink_timeout, 0); atomic_notifier_chain_register(&panic_notifier_list, &panic_block); return 0; From db275f2a02cd35d7a53564100ccf18ccb8a1130d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Oct 2017 21:26:26 -0700 Subject: [PATCH 0886/1085] sparc/led: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Adds a static variable to hold timeout value. Cc: "David S. 
Miller" Cc: Geliang Tang Cc: Ingo Molnar Cc: sparclinux@vger.kernel.org Signed-off-by: Kees Cook --- arch/sparc/kernel/led.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/sparc/kernel/led.c b/arch/sparc/kernel/led.c index e278bf52963b..519f5ba7ed7e 100644 --- a/arch/sparc/kernel/led.c +++ b/arch/sparc/kernel/led.c @@ -31,19 +31,20 @@ static inline void led_toggle(void) } static struct timer_list led_blink_timer; +static unsigned long led_blink_timer_timeout; -static void led_blink(unsigned long timeout) +static void led_blink(struct timer_list *unused) { + unsigned long timeout = led_blink_timer_timeout; + led_toggle(); /* reschedule */ if (!timeout) { /* blink according to load */ led_blink_timer.expires = jiffies + ((1 + (avenrun[0] >> FSHIFT)) * HZ); - led_blink_timer.data = 0; } else { /* blink at user specified interval */ led_blink_timer.expires = jiffies + (timeout * HZ); - led_blink_timer.data = timeout; } add_timer(&led_blink_timer); } @@ -88,9 +89,11 @@ static ssize_t led_proc_write(struct file *file, const char __user *buffer, } else if (!strcmp(buf, "toggle")) { led_toggle(); } else if ((*buf > '0') && (*buf <= '9')) { - led_blink(simple_strtoul(buf, NULL, 10)); + led_blink_timer_timeout = simple_strtoul(buf, NULL, 10); + led_blink(&led_blink_timer); } else if (!strcmp(buf, "load")) { - led_blink(0); + led_blink_timer_timeout = 0; + led_blink(&led_blink_timer); } else { auxio_set_led(AUXIO_LED_OFF); } @@ -115,8 +118,7 @@ static struct proc_dir_entry *led; static int __init led_init(void) { - init_timer(&led_blink_timer); - led_blink_timer.function = led_blink; + timer_setup(&led_blink_timer, led_blink, 0); led = proc_create("led", 0, NULL, &led_proc_fops); if (!led) From 607a6301074fecfb855f08ecdd65b18d089e4143 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 14:08:22 -0700 Subject: [PATCH 0887/1085] auxdisplay: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Paul Burton Cc: Miguel Ojeda Sandonis Signed-off-by: Kees Cook Reviewed-by: Paul Burton Tested-by: Paul Burton # for img-ascii-lcd --- drivers/auxdisplay/img-ascii-lcd.c | 10 ++++------ drivers/auxdisplay/panel.c | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c index 25306fa27251..c9e32d29ec81 100644 --- a/drivers/auxdisplay/img-ascii-lcd.c +++ b/drivers/auxdisplay/img-ascii-lcd.c @@ -229,9 +229,9 @@ MODULE_DEVICE_TABLE(of, img_ascii_lcd_matches); * Scroll the current message along the LCD by one character, rearming the * timer if required. 
*/ -static void img_ascii_lcd_scroll(unsigned long arg) +static void img_ascii_lcd_scroll(struct timer_list *t) { - struct img_ascii_lcd_ctx *ctx = (struct img_ascii_lcd_ctx *)arg; + struct img_ascii_lcd_ctx *ctx = from_timer(ctx, t, timer); unsigned int i, ch = ctx->scroll_pos; unsigned int num_chars = ctx->cfg->num_chars; @@ -299,7 +299,7 @@ static int img_ascii_lcd_display(struct img_ascii_lcd_ctx *ctx, ctx->scroll_pos = 0; /* update the LCD */ - img_ascii_lcd_scroll((unsigned long)ctx); + img_ascii_lcd_scroll(&ctx->timer); return 0; } @@ -395,9 +395,7 @@ static int img_ascii_lcd_probe(struct platform_device *pdev) ctx->scroll_rate = HZ / 2; /* initialise a timer for scrolling the message */ - init_timer(&ctx->timer); - ctx->timer.function = img_ascii_lcd_scroll; - ctx->timer.data = (unsigned long)ctx; + timer_setup(&ctx->timer, img_ascii_lcd_scroll, 0); platform_set_drvdata(pdev, ctx); diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index 6911acd896d9..ea7869c0d7f9 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -1396,7 +1396,7 @@ static void panel_process_inputs(void) } } -static void panel_scan_timer(void) +static void panel_scan_timer(struct timer_list *unused) { if (keypad.enabled && keypad_initialized) { if (spin_trylock_irq(&pprt_lock)) { @@ -1421,7 +1421,7 @@ static void init_scan_timer(void) if (scan_timer.function) return; /* already started */ - setup_timer(&scan_timer, (void *)&panel_scan_timer, 0); + timer_setup(&scan_timer, panel_scan_timer, 0); scan_timer.expires = jiffies + INPUT_POLL_TIME; add_timer(&scan_timer); } From 200d24d63303cc2131c9da571f4ef72887e3aa82 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2017 15:55:48 -0700 Subject: [PATCH 0888/1085] hwrng/xgene-rng: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
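One effect of the conversion, visible in the hunks below: where .function and .data used to be reassigned every time the timer was (re)armed, timer_setup() now runs once at init and the arm path touches only the deadline. A sketch with illustrative names:

	#include <linux/timer.h>

	struct my_rng {
		struct timer_list failure_timer;
		int failure_cnt;
	};

	static void failure_expired(struct timer_list *t)
	{
		struct my_rng *ctx = from_timer(ctx, t, failure_timer);

		ctx->failure_cnt = 0;	/* window lapsed, clear the counter */
	}

	static void my_rng_init(struct my_rng *ctx)
	{
		timer_setup(&ctx->failure_timer, failure_expired, 0);
	}

	static void my_rng_start_timer(struct my_rng *ctx)
	{
		/* no more .function/.data assignments on each start */
		ctx->failure_timer.expires = jiffies + 120 * HZ;
		add_timer(&ctx->failure_timer);
	}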
Cc: Matt Mackall Cc: Herbert Xu Cc: linux-crypto@vger.kernel.org Signed-off-by: Kees Cook --- drivers/char/hw_random/xgene-rng.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/char/hw_random/xgene-rng.c b/drivers/char/hw_random/xgene-rng.c index 3c77645405e5..71755790c32b 100644 --- a/drivers/char/hw_random/xgene-rng.c +++ b/drivers/char/hw_random/xgene-rng.c @@ -100,9 +100,9 @@ struct xgene_rng_dev { struct clk *clk; }; -static void xgene_rng_expired_timer(unsigned long arg) +static void xgene_rng_expired_timer(struct timer_list *t) { - struct xgene_rng_dev *ctx = (struct xgene_rng_dev *) arg; + struct xgene_rng_dev *ctx = from_timer(ctx, t, failure_timer); /* Clear failure counter as timer expired */ disable_irq(ctx->irq); @@ -113,8 +113,6 @@ static void xgene_rng_expired_timer(unsigned long arg) static void xgene_rng_start_timer(struct xgene_rng_dev *ctx) { - ctx->failure_timer.data = (unsigned long) ctx; - ctx->failure_timer.function = xgene_rng_expired_timer; ctx->failure_timer.expires = jiffies + 120 * HZ; add_timer(&ctx->failure_timer); } @@ -292,7 +290,7 @@ static int xgene_rng_init(struct hwrng *rng) struct xgene_rng_dev *ctx = (struct xgene_rng_dev *) rng->priv; ctx->failure_cnt = 0; - init_timer(&ctx->failure_timer); + timer_setup(&ctx->failure_timer, xgene_rng_expired_timer, 0); ctx->revision = readl(ctx->csr_base + RNG_EIP_REV); From 0788f28575801dadbddbfbe11e0bcc8174fffee3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 21 Oct 2017 13:43:53 -0700 Subject: [PATCH 0889/1085] drivers/macintosh: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
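A small side effect of the new prototype: when a callback ignores its argument, call sites that used to invoke it directly with a dummy cookie can simply pass NULL, as the hunk below does. Sketched with hypothetical names:

	#include <linux/timer.h>

	extern void resend_current_command(void);	/* hypothetical */

	static struct timer_list retry_timer;

	static void retry(struct timer_list *unused)
	{
		resend_current_command();
	}

	static void kick_retry(bool deferred)
	{
		if (deferred)
			mod_timer(&retry_timer, jiffies + msecs_to_jiffies(5));
		else
			retry(NULL);	/* direct, synchronous invocation */
	}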
Cc: Benjamin Herrenschmidt Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Kees Cook --- drivers/macintosh/smu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index ea9bdc85a21d..899ec1f4c833 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -103,7 +103,7 @@ static DEFINE_MUTEX(smu_part_access); static int smu_irq_inited; static unsigned long smu_cmdbuf_abs; -static void smu_i2c_retry(unsigned long data); +static void smu_i2c_retry(struct timer_list *t); /* * SMU driver low level stuff @@ -582,9 +582,7 @@ static int smu_late_init(void) if (!smu) return 0; - init_timer(&smu->i2c_timer); - smu->i2c_timer.function = smu_i2c_retry; - smu->i2c_timer.data = (unsigned long)smu; + timer_setup(&smu->i2c_timer, smu_i2c_retry, 0); if (smu->db_node) { smu->db_irq = irq_of_parse_and_map(smu->db_node, 0); @@ -755,7 +753,7 @@ static void smu_i2c_complete_command(struct smu_i2c_cmd *cmd, int fail) } -static void smu_i2c_retry(unsigned long data) +static void smu_i2c_retry(struct timer_list *unused) { struct smu_i2c_cmd *cmd = smu->cmd_i2c_cur; @@ -795,7 +793,7 @@ static void smu_i2c_low_completion(struct smu_cmd *scmd, void *misc) BUG_ON(cmd != smu->cmd_i2c_cur); if (!smu_irq_inited) { mdelay(5); - smu_i2c_retry(0); + smu_i2c_retry(NULL); return; } mod_timer(&smu->i2c_timer, jiffies + msecs_to_jiffies(5)); From 6243d38fc0775e9dc1a38835aef55efb273ac4d5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 14:43:33 -0700 Subject: [PATCH 0890/1085] drivers/memstick: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
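When the callback needs a handle other than the structure that embeds the timer, the old .data cookie becomes an explicit back-pointer set once at allocation; the jmb38x_ms hunk below adds exactly such a field. A sketch with illustrative names:

	#include <linux/timer.h>

	struct outer_handle;
	extern void handle_abort(struct outer_handle *outer);	/* hypothetical */

	struct my_host {
		struct timer_list timer;
		struct outer_handle *outer;	/* back-pointer, was timer.data */
	};

	static void my_abort(struct timer_list *t)
	{
		struct my_host *host = from_timer(host, t, timer);

		handle_abort(host->outer);
	}

	static void my_host_init(struct my_host *host, struct outer_handle *outer)
	{
		host->outer = outer;
		timer_setup(&host->timer, my_abort, 0);
	}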
Cc: Maxim Levitsky Cc: Alex Dubov Signed-off-by: Kees Cook --- drivers/memstick/host/jmb38x_ms.c | 10 ++++++---- drivers/memstick/host/r592.c | 7 +++---- drivers/memstick/host/tifm_ms.c | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index 48db922075e2..bcdca9fbef51 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -59,6 +59,7 @@ struct jmb38x_ms_host { unsigned int block_pos; unsigned long timeout_jiffies; struct timer_list timer; + struct memstick_host *msh; struct memstick_request *req; unsigned char cmd_flags; unsigned char io_pos; @@ -592,10 +593,10 @@ static irqreturn_t jmb38x_ms_isr(int irq, void *dev_id) return IRQ_HANDLED; } -static void jmb38x_ms_abort(unsigned long data) +static void jmb38x_ms_abort(struct timer_list *t) { - struct memstick_host *msh = (struct memstick_host *)data; - struct jmb38x_ms_host *host = memstick_priv(msh); + struct jmb38x_ms_host *host = from_timer(host, t, timer); + struct memstick_host *msh = host->msh; unsigned long flags; dev_dbg(&host->chip->pdev->dev, "abort\n"); @@ -878,6 +879,7 @@ static struct memstick_host *jmb38x_ms_alloc_host(struct jmb38x_ms *jm, int cnt) return NULL; host = memstick_priv(msh); + host->msh = msh; host->chip = jm; host->addr = ioremap(pci_resource_start(jm->pdev, cnt), pci_resource_len(jm->pdev, cnt)); @@ -897,7 +899,7 @@ static struct memstick_host *jmb38x_ms_alloc_host(struct jmb38x_ms *jm, int cnt) msh->caps = MEMSTICK_CAP_PAR4 | MEMSTICK_CAP_PAR8; - setup_timer(&host->timer, jmb38x_ms_abort, (unsigned long)msh); + timer_setup(&host->timer, jmb38x_ms_abort, 0); if (!request_irq(host->irq, jmb38x_ms_isr, IRQF_SHARED, host->host_id, msh)) diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c index d5cfb503b9d6..627d6e62fe31 100644 --- a/drivers/memstick/host/r592.c +++ b/drivers/memstick/host/r592.c @@ -616,9 +616,9 @@ static void r592_update_card_detect(struct r592_device *dev) } /* Timer routine that fires 1 second after last card detection event, */ -static void r592_detect_timer(long unsigned int data) +static void r592_detect_timer(struct timer_list *t) { - struct r592_device *dev = (struct r592_device *)data; + struct r592_device *dev = from_timer(dev, t, detect_timer); r592_update_card_detect(dev); memstick_detect_change(dev->host); } @@ -770,8 +770,7 @@ static int r592_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_lock_init(&dev->io_thread_lock); init_completion(&dev->dma_done); INIT_KFIFO(dev->pio_fifo); - setup_timer(&dev->detect_timer, - r592_detect_timer, (long unsigned int)dev); + timer_setup(&dev->detect_timer, r592_detect_timer, 0); /* Host initialization */ host->caps = MEMSTICK_CAP_PAR4; diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c index 7bafa72f8f57..bed205849d02 100644 --- a/drivers/memstick/host/tifm_ms.c +++ b/drivers/memstick/host/tifm_ms.c @@ -538,9 +538,9 @@ static int tifm_ms_set_param(struct memstick_host *msh, return 0; } -static void tifm_ms_abort(unsigned long data) +static void tifm_ms_abort(struct timer_list *t) { - struct tifm_ms *host = (struct tifm_ms *)data; + struct tifm_ms *host = from_timer(host, t, timer); dev_dbg(&host->dev->dev, "status %x\n", readl(host->dev->addr + SOCK_MS_STATUS)); @@ -575,7 +575,7 @@ static int tifm_ms_probe(struct tifm_dev *sock) host->dev = sock; host->timeout_jiffies = msecs_to_jiffies(1000); - setup_timer(&host->timer, tifm_ms_abort, (unsigned 
long)host); + timer_setup(&host->timer, tifm_ms_abort, 0); tasklet_init(&host->notify, tifm_ms_req_tasklet, (unsigned long)msh); msh->request = tifm_ms_submit_req; From 41760d0e0f1a013e607956eaf22a60ff6dd03784 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 21 Oct 2017 12:09:17 -0700 Subject: [PATCH 0891/1085] drivers/pcmcia: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Florian Fainelli Cc: bcm-kernel-feedback-list@broadcom.com Cc: David Howells Cc: Arnd Bergmann Cc: linux-pcmcia@lists.infradead.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Kees Cook Acked-by: Russell King # for soc_common.c --- drivers/pcmcia/bcm63xx_pcmcia.c | 6 +++--- drivers/pcmcia/bfin_cf_pcmcia.c | 6 +++--- drivers/pcmcia/i82365.c | 6 ++---- drivers/pcmcia/omap_cf.c | 8 ++++---- drivers/pcmcia/pd6729.c | 7 +++---- drivers/pcmcia/soc_common.c | 7 +++---- drivers/pcmcia/tcic.c | 8 +++----- drivers/pcmcia/yenta_socket.c | 7 +++---- 8 files changed, 24 insertions(+), 31 deletions(-) diff --git a/drivers/pcmcia/bcm63xx_pcmcia.c b/drivers/pcmcia/bcm63xx_pcmcia.c index 0802e0bc7d0c..16f573173471 100644 --- a/drivers/pcmcia/bcm63xx_pcmcia.c +++ b/drivers/pcmcia/bcm63xx_pcmcia.c @@ -263,12 +263,12 @@ static int bcm63xx_pcmcia_get_status(struct pcmcia_socket *sock, /* * socket polling timer callback */ -static void bcm63xx_pcmcia_poll(unsigned long data) +static void bcm63xx_pcmcia_poll(struct timer_list *t) { struct bcm63xx_pcmcia_socket *skt; unsigned int stat, events; - skt = (struct bcm63xx_pcmcia_socket *)data; + skt = from_timer(skt, t, timer); spin_lock_bh(&skt->lock); @@ -392,7 +392,7 @@ static int bcm63xx_drv_pcmcia_probe(struct platform_device *pdev) sock->map_size = resource_size(skt->common_res); /* initialize polling timer */ - setup_timer(&skt->timer, bcm63xx_pcmcia_poll, (unsigned long)skt); + timer_setup(&skt->timer, bcm63xx_pcmcia_poll, 0); /* initialize pcmcia control register, drive VS[12] to 0, * leave CB IDSEL to the old value since it is set by the PCI diff --git a/drivers/pcmcia/bfin_cf_pcmcia.c b/drivers/pcmcia/bfin_cf_pcmcia.c index 8b0923fd76c6..00a296d431ba 100644 --- a/drivers/pcmcia/bfin_cf_pcmcia.c +++ b/drivers/pcmcia/bfin_cf_pcmcia.c @@ -86,9 +86,9 @@ static int bfin_cf_ss_init(struct pcmcia_socket *s) } /* the timer is primarily to kick this socket's pccardd */ -static void bfin_cf_timer(unsigned long _cf) +static void bfin_cf_timer(struct timer_list *t) { - struct bfin_cf_socket *cf = (void *)_cf; + struct bfin_cf_socket *cf = from_timer(cf, t, timer); unsigned short present = bfin_cf_present(cf->cd_pfx); if (present != cf->present) { @@ -227,7 +227,7 @@ static int bfin_cf_probe(struct platform_device *pdev) cf->cd_pfx = cd_pfx; - setup_timer(&cf->timer, bfin_cf_timer, (unsigned long)cf); + timer_setup(&cf->timer, bfin_cf_timer, 0); cf->pdev = pdev; platform_set_drvdata(pdev, cf); diff --git a/drivers/pcmcia/i82365.c b/drivers/pcmcia/i82365.c index fb38cc01859f..891ccea2cccb 100644 --- a/drivers/pcmcia/i82365.c +++ b/drivers/pcmcia/i82365.c @@ -875,7 +875,7 @@ static irqreturn_t pcic_interrupt(int irq, void *dev) return IRQ_RETVAL(handled); } /* pcic_interrupt */ -static void pcic_interrupt_wrapper(u_long data) +static void pcic_interrupt_wrapper(struct timer_list *unused) { pcic_interrupt(0, NULL); poll_timer.expires = jiffies + poll_interval; @@ -1289,9 +1289,7 @@ static int __init 
init_i82365(void) /* Finally, schedule a polling interrupt */ if (poll_interval != 0) { - poll_timer.function = pcic_interrupt_wrapper; - poll_timer.data = 0; - init_timer(&poll_timer); + timer_setup(&poll_timer, pcic_interrupt_wrapper, 0); poll_timer.expires = jiffies + poll_interval; add_timer(&poll_timer); } diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c index 4e2f501e5548..8216ceb51b18 100644 --- a/drivers/pcmcia/omap_cf.c +++ b/drivers/pcmcia/omap_cf.c @@ -80,9 +80,9 @@ static int omap_cf_ss_init(struct pcmcia_socket *s) } /* the timer is primarily to kick this socket's pccardd */ -static void omap_cf_timer(unsigned long _cf) +static void omap_cf_timer(struct timer_list *t) { - struct omap_cf_socket *cf = (void *) _cf; + struct omap_cf_socket *cf = from_timer(cf, t, timer); unsigned present = omap_cf_present(); if (present != cf->present) { @@ -102,7 +102,7 @@ static void omap_cf_timer(unsigned long _cf) */ static irqreturn_t omap_cf_irq(int irq, void *_cf) { - omap_cf_timer((unsigned long)_cf); + omap_cf_timer(&((struct omap_cf_socket *)_cf)->timer); return IRQ_HANDLED; } @@ -220,7 +220,7 @@ static int __init omap_cf_probe(struct platform_device *pdev) cf = kzalloc(sizeof *cf, GFP_KERNEL); if (!cf) return -ENOMEM; - setup_timer(&cf->timer, omap_cf_timer, (unsigned long)cf); + timer_setup(&cf->timer, omap_cf_timer, 0); cf->pdev = pdev; platform_set_drvdata(pdev, cf); diff --git a/drivers/pcmcia/pd6729.c b/drivers/pcmcia/pd6729.c index 0f70b4d58f9e..959ae3e65ef8 100644 --- a/drivers/pcmcia/pd6729.c +++ b/drivers/pcmcia/pd6729.c @@ -234,9 +234,9 @@ static irqreturn_t pd6729_interrupt(int irq, void *dev) /* socket functions */ -static void pd6729_interrupt_wrapper(unsigned long data) +static void pd6729_interrupt_wrapper(struct timer_list *t) { - struct pd6729_socket *socket = (struct pd6729_socket *) data; + struct pd6729_socket *socket = from_timer(socket, t, poll_timer); pd6729_interrupt(0, (void *)socket); mod_timer(&socket->poll_timer, jiffies + HZ); @@ -707,8 +707,7 @@ static int pd6729_pci_probe(struct pci_dev *dev, } } else { /* poll Card status change */ - setup_timer(&socket->poll_timer, pd6729_interrupt_wrapper, - (unsigned long)socket); + timer_setup(&socket->poll_timer, pd6729_interrupt_wrapper, 0); mod_timer(&socket->poll_timer, jiffies + HZ); } diff --git a/drivers/pcmcia/soc_common.c b/drivers/pcmcia/soc_common.c index b6b316de055c..764650eb8897 100644 --- a/drivers/pcmcia/soc_common.c +++ b/drivers/pcmcia/soc_common.c @@ -456,9 +456,9 @@ static void soc_common_check_status(struct soc_pcmcia_socket *skt) } /* Let's poll for events in addition to IRQs since IRQ only is unreliable...
*/ -static void soc_common_pcmcia_poll_event(unsigned long dummy) +static void soc_common_pcmcia_poll_event(struct timer_list *t) { - struct soc_pcmcia_socket *skt = (struct soc_pcmcia_socket *)dummy; + struct soc_pcmcia_socket *skt = from_timer(skt, t, poll_timer); debug(skt, 4, "polling for events\n"); mod_timer(&skt->poll_timer, jiffies + SOC_PCMCIA_POLL_PERIOD); @@ -794,8 +794,7 @@ int soc_pcmcia_add_one(struct soc_pcmcia_socket *skt) skt->cs_state = dead_socket; - setup_timer(&skt->poll_timer, soc_common_pcmcia_poll_event, - (unsigned long)skt); + timer_setup(&skt->poll_timer, soc_common_pcmcia_poll_event, 0); skt->poll_timer.expires = jiffies + SOC_PCMCIA_POLL_PERIOD; ret = request_resource(&iomem_resource, &skt->res_skt); diff --git a/drivers/pcmcia/tcic.c b/drivers/pcmcia/tcic.c index a1ac72d51d70..1a0e3f098759 100644 --- a/drivers/pcmcia/tcic.c +++ b/drivers/pcmcia/tcic.c @@ -98,7 +98,7 @@ module_param(cycle_time, int, 0444); /*====================================================================*/ static irqreturn_t tcic_interrupt(int irq, void *dev); -static void tcic_timer(u_long data); +static void tcic_timer(struct timer_list *unused); static struct pccard_operations tcic_operations; struct tcic_socket { @@ -435,9 +435,7 @@ static int __init init_tcic(void) } /* Set up polling */ - poll_timer.function = &tcic_timer; - poll_timer.data = 0; - init_timer(&poll_timer); + timer_setup(&poll_timer, &tcic_timer, 0); /* Build interrupt mask */ printk(KERN_CONT ", %d sockets\n", sockets); @@ -583,7 +581,7 @@ static irqreturn_t tcic_interrupt(int irq, void *dev) return IRQ_HANDLED; } /* tcic_interrupt */ -static void tcic_timer(u_long data) +static void tcic_timer(struct timer_list *unused) { pr_debug("tcic_timer()\n"); tcic_timer_pending = 0; diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index 5d6d9b1549bc..ab3da2262f0f 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -534,9 +534,9 @@ static irqreturn_t yenta_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static void yenta_interrupt_wrapper(unsigned long data) +static void yenta_interrupt_wrapper(struct timer_list *t) { - struct yenta_socket *socket = (struct yenta_socket *) data; + struct yenta_socket *socket = from_timer(socket, t, poll_timer); yenta_interrupt(0, (void *)socket); socket->poll_timer.expires = jiffies + HZ; @@ -1233,8 +1233,7 @@ static int yenta_probe(struct pci_dev *dev, const struct pci_device_id *id) if (!socket->cb_irq || request_irq(socket->cb_irq, yenta_interrupt, IRQF_SHARED, "yenta", socket)) { /* No IRQ or request_irq failed. Poll */ socket->cb_irq = 0; /* But zero is a valid IRQ number. */ - setup_timer(&socket->poll_timer, yenta_interrupt_wrapper, - (unsigned long)socket); + timer_setup(&socket->poll_timer, yenta_interrupt_wrapper, 0); mod_timer(&socket->poll_timer, jiffies + HZ); dev_info(&dev->dev, "no PCI IRQ, CardBus support disabled for this socket.\n"); From 25b42fa8f8a45d863f704bbfe3c79a46958cf4dc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 14:49:58 -0700 Subject: [PATCH 0892/1085] drivers/sgi-xp: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
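Two details in the sn2 hunk below generalize: from_timer() accepts a nested member path (offsetof() resolves it), and a self-rearming callback can rearm through the pointer it was handed. A sketch with illustrative names:

	#include <linux/timer.h>

	struct part {
		struct {
			struct timer_list irq_timer;
		} sn2;
	};

	extern void recheck(struct part *part);	/* hypothetical */

	static void check_dropped_irq(struct timer_list *t)
	{
		struct part *part = from_timer(part, t, sn2.irq_timer);

		recheck(part);
		t->expires = jiffies + HZ;	/* rearm via the handed-in pointer */
		add_timer(t);
	}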
Cc: Cliff Whickman Signed-off-by: Kees Cook Acked-by: Robin Holt --- drivers/misc/sgi-xp/xpc_main.c | 15 ++++++--------- drivers/misc/sgi-xp/xpc_sn2.c | 15 ++++++--------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c index 7f327121e6d7..0c775d6fcf59 100644 --- a/drivers/misc/sgi-xp/xpc_main.c +++ b/drivers/misc/sgi-xp/xpc_main.c @@ -172,9 +172,9 @@ struct xpc_arch_operations xpc_arch_ops; * Timer function to enforce the timelimit on the partition disengage. */ static void -xpc_timeout_partition_disengage(unsigned long data) +xpc_timeout_partition_disengage(struct timer_list *t) { - struct xpc_partition *part = (struct xpc_partition *)data; + struct xpc_partition *part = from_timer(part, t, disengage_timer); DBUG_ON(time_is_after_jiffies(part->disengage_timeout)); @@ -190,7 +190,7 @@ xpc_timeout_partition_disengage(unsigned long data) * specify when the next timeout should occur. */ static void -xpc_hb_beater(unsigned long dummy) +xpc_hb_beater(struct timer_list *unused) { xpc_arch_ops.increment_heartbeat(); @@ -205,8 +205,7 @@ static void xpc_start_hb_beater(void) { xpc_arch_ops.heartbeat_init(); - init_timer(&xpc_hb_timer); - xpc_hb_timer.function = xpc_hb_beater; + timer_setup(&xpc_hb_timer, xpc_hb_beater, 0); xpc_hb_beater(0); } @@ -931,10 +930,8 @@ xpc_setup_partitions(void) part->act_state = XPC_P_AS_INACTIVE; XPC_SET_REASON(part, 0, 0); - init_timer(&part->disengage_timer); - part->disengage_timer.function = - xpc_timeout_partition_disengage; - part->disengage_timer.data = (unsigned long)part; + timer_setup(&part->disengage_timer, + xpc_timeout_partition_disengage, 0); part->setup_state = XPC_P_SS_UNSET; init_waitqueue_head(&part->teardown_wq); diff --git a/drivers/misc/sgi-xp/xpc_sn2.c b/drivers/misc/sgi-xp/xpc_sn2.c index 7d71c04fc938..5a12d2a54049 100644 --- a/drivers/misc/sgi-xp/xpc_sn2.c +++ b/drivers/misc/sgi-xp/xpc_sn2.c @@ -323,16 +323,16 @@ xpc_handle_notify_IRQ_sn2(int irq, void *dev_id) * was received. */ static void -xpc_check_for_dropped_notify_IRQ_sn2(struct xpc_partition *part) +xpc_check_for_dropped_notify_IRQ_sn2(struct timer_list *t) { - struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + struct xpc_partition *part = + from_timer(part, t, sn.sn2.dropped_notify_IRQ_timer); if (xpc_part_ref(part)) { xpc_check_for_sent_chctl_flags_sn2(part); - part_sn2->dropped_notify_IRQ_timer.expires = jiffies + - XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL; - add_timer(&part_sn2->dropped_notify_IRQ_timer); + t->expires = jiffies + XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL; + add_timer(t); xpc_part_deref(part); } } @@ -1232,10 +1232,7 @@ xpc_setup_ch_structures_sn2(struct xpc_partition *part) /* Setup a timer to check for dropped notify IRQs */ timer = &part_sn2->dropped_notify_IRQ_timer; - init_timer(timer); - timer->function = - (void (*)(unsigned long))xpc_check_for_dropped_notify_IRQ_sn2; - timer->data = (unsigned long)part; + timer_setup(timer, xpc_check_for_dropped_notify_IRQ_sn2, 0); timer->expires = jiffies + XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL; add_timer(timer); From ddc92bec6d7d7e8a07794a8dbeade19476891b53 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Sat, 21 Oct 2017 22:39:35 +0900 Subject: [PATCH 0893/1085] dt-bindings: openrisc: Add OpenRISC platform SoC Add devicetree binding documentation for the OpenRISC platform opencores,or1ksim. This is the main OpenRISC reference platform supporting multiple FPGA SoC's. 
This format is based on some of the mips binding docs as we have similar requirements. Also, update MAINTAINERS so openrisc-related binding changes are visible to the openrisc team. Acked-by: Rob Herring Suggested-by: Pavel Machek Signed-off-by: Stafford Horne --- .../bindings/openrisc/opencores/or1ksim.txt | 39 +++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 40 insertions(+) create mode 100644 Documentation/devicetree/bindings/openrisc/opencores/or1ksim.txt diff --git a/Documentation/devicetree/bindings/openrisc/opencores/or1ksim.txt b/Documentation/devicetree/bindings/openrisc/opencores/or1ksim.txt new file mode 100644 index 000000000000..4950c794ecbb --- /dev/null +++ b/Documentation/devicetree/bindings/openrisc/opencores/or1ksim.txt @@ -0,0 +1,39 @@ +OpenRISC Generic SoC +==================== + +Boards and FPGA SoC's which support the OpenRISC standard platform. The +platform essentially follows the conventions of the OpenRISC architecture +specification, however some aspects, such as the boot protocol have been defined +by the Linux port. + +Required properties +------------------- + - compatible: Must include "opencores,or1ksim" + +CPU nodes: +---------- +A "cpus" node is required. Required properties: + - #address-cells: Must be 1. + - #size-cells: Must be 0. +A CPU sub-node is also required for at least CPU 0. Since the topology may +be probed via CPS, it is not necessary to specify secondary CPUs. Required +properties: + - compatible: Must be "opencores,or1200-rtlsvn481". + - reg: CPU number. + - clock-frequency: The CPU clock frequency in Hz. +Example: + cpus { + #address-cells = <1>; + #size-cells = <0>; + cpu@0 { + compatible = "opencores,or1200-rtlsvn481"; + reg = <0>; + clock-frequency = <20000000>; + }; + }; + + +Boot protocol +------------- +The bootloader may pass the following arguments to the kernel: + - r3: address of a flattened device-tree blob or 0x0. diff --git a/MAINTAINERS b/MAINTAINERS index 4e2b720ca661..bcb7e2525f54 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10031,6 +10031,7 @@ T: git git://github.com/openrisc/linux.git L: openrisc@lists.librecores.org W: http://openrisc.io S: Maintained +F: Documentation/devicetree/bindings/openrisc/ F: Documentation/openrisc/ F: arch/openrisc/ F: drivers/irqchip/irq-or1k-* From 91993c8c2ed52781a0f42bf7f40e28adc96e2bb2 Mon Sep 17 00:00:00 2001 From: Stefan Kristiansson Date: Sun, 11 May 2014 12:08:37 +0300 Subject: [PATCH 0894/1085] openrisc: use shadow registers to save regs on exception Previously, the area between 0x0-0x100 has been used as a "scratch" memory area to temporarily store regs during exception entry. In a multi-core environment, this will not work. This change is to use shadow registers for nested context. Currently only the "critical" temp load/stores are covered, the EMERGENCY_PRINT ones are left as is (when they are used, it's game over anyway); they need to be handled as well in the future. Signed-off-by: Stefan Kristiansson Signed-off-by: Stafford Horne --- arch/openrisc/Kconfig | 11 +++++ arch/openrisc/kernel/head.S | 95 +++++++++++++++++++++++++++------- 2 files changed, 80 insertions(+), 26 deletions(-) diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index a0f2e4a323c1..356dd67a86ea 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -124,6 +124,17 @@ config OPENRISC_NO_SPR_SR_DSX Say N here if you know that your OpenRISC processor has SPR_SR_DSX bit implemented. Say Y if you are unsure.
+config OPENRISC_HAVE_SHADOW_GPRS + bool "Support for shadow gpr files" if !SMP + default y if SMP + help + Say Y here if your OpenRISC processor features shadowed + register files. They will in such case be used as a + scratch reg storage on exception entry. + + On SMP systems, this feature is mandatory. + On a unicore system it's safe to say N here if you are unsure. + config CMDLINE string "Default kernel command string" default "" diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index 1e87913576e3..1e49895408f4 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -49,9 +49,31 @@ /* ============================================[ tmp store locations ]=== */ +#define SPR_SHADOW_GPR(x) ((x) + SPR_GPR_BASE + 32) + /* * emergency_print temporary stores */ +#ifdef CONFIG_OPENRISC_HAVE_SHADOW_GPRS +#define EMERGENCY_PRINT_STORE_GPR4 l.mtspr r0,r4,SPR_SHADOW_GPR(14) +#define EMERGENCY_PRINT_LOAD_GPR4 l.mfspr r4,r0,SPR_SHADOW_GPR(14) + +#define EMERGENCY_PRINT_STORE_GPR5 l.mtspr r0,r5,SPR_SHADOW_GPR(15) +#define EMERGENCY_PRINT_LOAD_GPR5 l.mfspr r5,r0,SPR_SHADOW_GPR(15) + +#define EMERGENCY_PRINT_STORE_GPR6 l.mtspr r0,r6,SPR_SHADOW_GPR(16) +#define EMERGENCY_PRINT_LOAD_GPR6 l.mfspr r6,r0,SPR_SHADOW_GPR(16) + +#define EMERGENCY_PRINT_STORE_GPR7 l.mtspr r0,r7,SPR_SHADOW_GPR(7) +#define EMERGENCY_PRINT_LOAD_GPR7 l.mfspr r7,r0,SPR_SHADOW_GPR(7) + +#define EMERGENCY_PRINT_STORE_GPR8 l.mtspr r0,r8,SPR_SHADOW_GPR(8) +#define EMERGENCY_PRINT_LOAD_GPR8 l.mfspr r8,r0,SPR_SHADOW_GPR(8) + +#define EMERGENCY_PRINT_STORE_GPR9 l.mtspr r0,r9,SPR_SHADOW_GPR(9) +#define EMERGENCY_PRINT_LOAD_GPR9 l.mfspr r9,r0,SPR_SHADOW_GPR(9) + +#else /* !CONFIG_OPENRISC_HAVE_SHADOW_GPRS */ #define EMERGENCY_PRINT_STORE_GPR4 l.sw 0x20(r0),r4 #define EMERGENCY_PRINT_LOAD_GPR4 l.lwz r4,0x20(r0) @@ -70,13 +92,28 @@ #define EMERGENCY_PRINT_STORE_GPR9 l.sw 0x34(r0),r9 #define EMERGENCY_PRINT_LOAD_GPR9 l.lwz r9,0x34(r0) +#endif /* * TLB miss handlers temorary stores */ -#define EXCEPTION_STORE_GPR9 l.sw 0x10(r0),r9 -#define EXCEPTION_LOAD_GPR9 l.lwz r9,0x10(r0) +#ifdef CONFIG_OPENRISC_HAVE_SHADOW_GPRS +#define EXCEPTION_STORE_GPR2 l.mtspr r0,r2,SPR_SHADOW_GPR(2) +#define EXCEPTION_LOAD_GPR2 l.mfspr r2,r0,SPR_SHADOW_GPR(2) +#define EXCEPTION_STORE_GPR3 l.mtspr r0,r3,SPR_SHADOW_GPR(3) +#define EXCEPTION_LOAD_GPR3 l.mfspr r3,r0,SPR_SHADOW_GPR(3) + +#define EXCEPTION_STORE_GPR4 l.mtspr r0,r4,SPR_SHADOW_GPR(4) +#define EXCEPTION_LOAD_GPR4 l.mfspr r4,r0,SPR_SHADOW_GPR(4) + +#define EXCEPTION_STORE_GPR5 l.mtspr r0,r5,SPR_SHADOW_GPR(5) +#define EXCEPTION_LOAD_GPR5 l.mfspr r5,r0,SPR_SHADOW_GPR(5) + +#define EXCEPTION_STORE_GPR6 l.mtspr r0,r6,SPR_SHADOW_GPR(6) +#define EXCEPTION_LOAD_GPR6 l.mfspr r6,r0,SPR_SHADOW_GPR(6) + +#else /* !CONFIG_OPENRISC_HAVE_SHADOW_GPRS */ #define EXCEPTION_STORE_GPR2 l.sw 0x64(r0),r2 #define EXCEPTION_LOAD_GPR2 l.lwz r2,0x64(r0) @@ -92,26 +129,32 @@ #define EXCEPTION_STORE_GPR6 l.sw 0x74(r0),r6 #define EXCEPTION_LOAD_GPR6 l.lwz r6,0x74(r0) +#endif /* * EXCEPTION_HANDLE temporary stores */ +#ifdef CONFIG_OPENRISC_HAVE_SHADOW_GPRS +#define EXCEPTION_T_STORE_GPR30 l.mtspr r0,r30,SPR_SHADOW_GPR(30) +#define EXCEPTION_T_LOAD_GPR30(reg) l.mfspr reg,r0,SPR_SHADOW_GPR(30) + +#define EXCEPTION_T_STORE_GPR10 l.mtspr r0,r10,SPR_SHADOW_GPR(10) +#define EXCEPTION_T_LOAD_GPR10(reg) l.mfspr reg,r0,SPR_SHADOW_GPR(10) + +#define EXCEPTION_T_STORE_SP l.mtspr r0,r1,SPR_SHADOW_GPR(1) +#define EXCEPTION_T_LOAD_SP(reg) l.mfspr reg,r0,SPR_SHADOW_GPR(1) + +#else /* 
!CONFIG_OPENRISC_HAVE_SHADOW_GPRS */ #define EXCEPTION_T_STORE_GPR30 l.sw 0x78(r0),r30 #define EXCEPTION_T_LOAD_GPR30(reg) l.lwz reg,0x78(r0) #define EXCEPTION_T_STORE_GPR10 l.sw 0x7c(r0),r10 #define EXCEPTION_T_LOAD_GPR10(reg) l.lwz reg,0x7c(r0) -#define EXCEPTION_T_STORE_SP l.sw 0x80(r0),r1 +#define EXCEPTION_T_STORE_SP l.sw 0x80(r0),r1 #define EXCEPTION_T_LOAD_SP(reg) l.lwz reg,0x80(r0) - -/* - * For UNHANLDED_EXCEPTION - */ - -#define EXCEPTION_T_STORE_GPR31 l.sw 0x84(r0),r31 -#define EXCEPTION_T_LOAD_GPR31(reg) l.lwz reg,0x84(r0) +#endif /* =========================================================[ macros ]=== */ @@ -226,7 +269,7 @@ * */ #define UNHANDLED_EXCEPTION(handler) \ - EXCEPTION_T_STORE_GPR31 ;\ + EXCEPTION_T_STORE_GPR30 ;\ EXCEPTION_T_STORE_GPR10 ;\ EXCEPTION_T_STORE_SP ;\ /* temporary store r3, r9 into r1, r10 */ ;\ @@ -255,35 +298,35 @@ /* r1: KSP, r10: current, r31: __pa(KSP) */ ;\ /* r12: temp, syscall indicator, r13 temp */ ;\ l.addi r1,r1,-(INT_FRAME_SIZE) ;\ - /* r1 is KSP, r31 is __pa(KSP) */ ;\ - tophys (r31,r1) ;\ - l.sw PT_GPR12(r31),r12 ;\ + /* r1 is KSP, r30 is __pa(KSP) */ ;\ + tophys (r30,r1) ;\ + l.sw PT_GPR12(r30),r12 ;\ l.mfspr r12,r0,SPR_EPCR_BASE ;\ - l.sw PT_PC(r31),r12 ;\ + l.sw PT_PC(r30),r12 ;\ l.mfspr r12,r0,SPR_ESR_BASE ;\ - l.sw PT_SR(r31),r12 ;\ + l.sw PT_SR(r30),r12 ;\ /* save r31 */ ;\ - EXCEPTION_T_LOAD_GPR31(r12) ;\ - l.sw PT_GPR31(r31),r12 ;\ + EXCEPTION_T_LOAD_GPR30(r12) ;\ + l.sw PT_GPR30(r30),r12 ;\ /* save r10 as was prior to exception */ ;\ EXCEPTION_T_LOAD_GPR10(r12) ;\ - l.sw PT_GPR10(r31),r12 ;\ + l.sw PT_GPR10(r30),r12 ;\ /* save PT_SP as was prior to exception */ ;\ EXCEPTION_T_LOAD_SP(r12) ;\ - l.sw PT_SP(r31),r12 ;\ - l.sw PT_GPR13(r31),r13 ;\ + l.sw PT_SP(r30),r12 ;\ + l.sw PT_GPR13(r30),r13 ;\ /* --> */ ;\ /* save exception r4, set r4 = EA */ ;\ - l.sw PT_GPR4(r31),r4 ;\ + l.sw PT_GPR4(r30),r4 ;\ l.mfspr r4,r0,SPR_EEAR_BASE ;\ /* r12 == 1 if we come from syscall */ ;\ CLEAR_GPR(r12) ;\ /* ----- play a MMU trick ----- */ ;\ - l.ori r31,r0,(EXCEPTION_SR) ;\ - l.mtspr r0,r31,SPR_ESR_BASE ;\ + l.ori r30,r0,(EXCEPTION_SR) ;\ + l.mtspr r0,r30,SPR_ESR_BASE ;\ /* r31: EA address of handler */ ;\ - LOAD_SYMBOL_2_GPR(r31,handler) ;\ - l.mtspr r0,r31,SPR_EPCR_BASE ;\ + LOAD_SYMBOL_2_GPR(r30,handler) ;\ + l.mtspr r0,r30,SPR_EPCR_BASE ;\ l.rfe /* =====================================================[ exceptions] === */ From 489e0f802db708c69004f64d92a3e1b70731614a Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Thu, 23 Mar 2017 23:27:12 +0900 Subject: [PATCH 0895/1085] openrisc: add 1 and 2 byte cmpxchg support OpenRISC only supports hardware instructions that perform 4 byte atomic operations. For enabling qrwlocks for upcoming SMP support 1 and 2 byte implementations are needed. To do this we leverage the 4 byte atomic operations and shift/mask the 1 and 2 byte areas as needed. This heavily borrows ideas and routines from sh and mips, which do something similar. Cc: Peter Zijlstra Signed-off-by: Stafford Horne --- arch/openrisc/include/asm/cmpxchg.h | 155 +++++++++++++++++++++------- 1 file changed, 119 insertions(+), 36 deletions(-) diff --git a/arch/openrisc/include/asm/cmpxchg.h b/arch/openrisc/include/asm/cmpxchg.h index f0a5d8b844d6..d29f7db53906 100644 --- a/arch/openrisc/include/asm/cmpxchg.h +++ b/arch/openrisc/include/asm/cmpxchg.h @@ -1,32 +1,29 @@ /* + * 1,2 and 4 byte cmpxchg and xchg implementations for OpenRISC. 
+ * * Copyright (C) 2014 Stefan Kristiansson + * Copyright (C) 2017 Stafford Horne * * This file is licensed under the terms of the GNU General Public License * version 2. This program is licensed "as is" without any warranty of any * kind, whether express or implied. + * + * Note: + * The portable implementations of 1 and 2 byte xchg and cmpxchg using a 4 + * byte cmpxchg is sourced heavily from the sh and mips implementations. */ #ifndef __ASM_OPENRISC_CMPXCHG_H #define __ASM_OPENRISC_CMPXCHG_H #include - -/* - * This function doesn't exist, so you'll get a linker error - * if something tries to do an invalid cmpxchg(). - */ -extern void __cmpxchg_called_with_bad_pointer(void); +#include #define __HAVE_ARCH_CMPXCHG 1 -static inline unsigned long -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) +static inline unsigned long cmpxchg_u32(volatile void *ptr, + unsigned long old, unsigned long new) { - if (size != 4) { - __cmpxchg_called_with_bad_pointer(); - return old; - } - __asm__ __volatile__( "1: l.lwa %0, 0(%1) \n" " l.sfeq %0, %2 \n" @@ -43,28 +40,9 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) return old; } -#define cmpxchg(ptr, o, n) \ - ({ \ - (__typeof__(*(ptr))) __cmpxchg((ptr), \ - (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr))); \ - }) - -/* - * This function doesn't exist, so you'll get a linker error if - * something tries to do an invalidly-sized xchg(). - */ -extern void __xchg_called_with_bad_pointer(void); - -static inline unsigned long __xchg(unsigned long val, volatile void *ptr, - int size) +static inline unsigned long xchg_u32(volatile void *ptr, + unsigned long val) { - if (size != 4) { - __xchg_called_with_bad_pointer(); - return val; - } - __asm__ __volatile__( "1: l.lwa %0, 0(%1) \n" " l.swa 0(%1), %2 \n" @@ -77,10 +55,115 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr, return val; } +static inline u32 cmpxchg_small(volatile void *ptr, u32 old, u32 new, + int size) +{ + int off = (unsigned long)ptr % sizeof(u32); + volatile u32 *p = ptr - off; +#ifdef __BIG_ENDIAN + int bitoff = (sizeof(u32) - size - off) * BITS_PER_BYTE; +#else + int bitoff = off * BITS_PER_BYTE; +#endif + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; + u32 load32, old32, new32; + u32 ret; + + load32 = READ_ONCE(*p); + + while (true) { + ret = (load32 & bitmask) >> bitoff; + if (old != ret) + return ret; + + old32 = (load32 & ~bitmask) | (old << bitoff); + new32 = (load32 & ~bitmask) | (new << bitoff); + + /* Do 32 bit cmpxchg */ + load32 = cmpxchg_u32(p, old32, new32); + if (load32 == old32) + return old; + } +} + +/* xchg */ + +static inline u32 xchg_small(volatile void *ptr, u32 x, int size) +{ + int off = (unsigned long)ptr % sizeof(u32); + volatile u32 *p = ptr - off; +#ifdef __BIG_ENDIAN + int bitoff = (sizeof(u32) - size - off) * BITS_PER_BYTE; +#else + int bitoff = off * BITS_PER_BYTE; +#endif + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; + u32 oldv, newv; + u32 ret; + + do { + oldv = READ_ONCE(*p); + ret = (oldv & bitmask) >> bitoff; + newv = (oldv & ~bitmask) | (x << bitoff); + } while (cmpxchg_u32(p, oldv, newv) != oldv); + + return ret; +} + +/* + * This function doesn't exist, so you'll get a linker error + * if something tries to do an invalid cmpxchg(). 
+ */ +extern unsigned long __cmpxchg_called_with_bad_pointer(void) + __compiletime_error("Bad argument size for cmpxchg"); + +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 1: + case 2: + return cmpxchg_small(ptr, old, new, size); + case 4: + return cmpxchg_u32(ptr, old, new); + default: + return __cmpxchg_called_with_bad_pointer(); + } +} + +#define cmpxchg(ptr, o, n) \ + ({ \ + (__typeof__(*(ptr))) __cmpxchg((ptr), \ + (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr))); \ + }) + +/* + * This function doesn't exist, so you'll get a linker error if + * something tries to do an invalidly-sized xchg(). + */ +extern unsigned long __xchg_called_with_bad_pointer(void) + __compiletime_error("Bad argument size for xchg"); + +static inline unsigned long __xchg(volatile void *ptr, unsigned long with, + int size) +{ + switch (size) { + case 1: + case 2: + return xchg_small(ptr, with, size); + case 4: + return xchg_u32(ptr, with); + default: + return __xchg_called_with_bad_pointer(); + } +} + #define xchg(ptr, with) \ ({ \ - (__typeof__(*(ptr))) __xchg((unsigned long)(with), \ - (ptr), \ + (__typeof__(*(ptr))) __xchg((ptr), \ + (unsigned long)(with), \ sizeof(*(ptr))); \ }) From b5f8217615bb197af33c0cc71b14783b194fed6e Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Fri, 24 Mar 2017 07:13:03 +0900 Subject: [PATCH 0896/1085] openrisc: use qspinlocks and qrwlocks Enable OpenRISC to use qspinlocks and qrwlocks for upcoming SMP support. Signed-off-by: Stafford Horne --- arch/openrisc/Kconfig | 2 ++ arch/openrisc/include/asm/Kbuild | 4 ++++ arch/openrisc/include/asm/spinlock.h | 12 +++++++++++- arch/openrisc/include/asm/spinlock_types.h | 7 +++++++ 4 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 arch/openrisc/include/asm/spinlock_types.h diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 356dd67a86ea..b49acda5e8f4 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -28,6 +28,8 @@ config OPENRISC select OR1K_PIC select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1 select NO_BOOTMEM + select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_USE_QUEUED_RWLOCKS config CPU_BIG_ENDIAN def_bool y diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 5bea416a7792..5f066780d870 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -28,6 +28,10 @@ generic-y += module.h generic-y += pci.h generic-y += percpu.h generic-y += preempt.h +generic-y += qspinlock_types.h +generic-y += qspinlock.h +generic-y += qrwlock_types.h +generic-y += qrwlock.h generic-y += sections.h generic-y += segment.h generic-y += string.h diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h index fd00a3a24123..9b761e0e22c3 100644 --- a/arch/openrisc/include/asm/spinlock.h +++ b/arch/openrisc/include/asm/spinlock.h @@ -19,6 +19,16 @@ #ifndef __ASM_OPENRISC_SPINLOCK_H #define __ASM_OPENRISC_SPINLOCK_H -#error "or32 doesn't do SMP yet" +#include + +#include + +#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) +#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) + +#define arch_spin_relax(lock) cpu_relax() +#define arch_read_relax(lock) cpu_relax() +#define arch_write_relax(lock) cpu_relax() + #endif diff --git a/arch/openrisc/include/asm/spinlock_types.h b/arch/openrisc/include/asm/spinlock_types.h new file mode 100644 index 000000000000..7c6fb1208c88 --- /dev/null +++ 
b/arch/openrisc/include/asm/spinlock_types.h @@ -0,0 +1,7 @@ +#ifndef _ASM_OPENRISC_SPINLOCK_TYPES_H +#define _ASM_OPENRISC_SPINLOCK_TYPES_H + +#include +#include + +#endif /* _ASM_OPENRISC_SPINLOCK_TYPES_H */ From fab8be88ac0478b0157859f74fad5088c292356b Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Thu, 7 Sep 2017 22:55:18 +0900 Subject: [PATCH 0897/1085] dt-bindings: add openrisc to vendor prefixes list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add OpenRISC.io to vendor prefixes. This is reserved for softcores developed by the OpenRISC community. The OpenRISC community has separated from OpenCores.org requiring a new prefix. Reviewed-by: Andreas Färber Acked-by: Rob Herring Signed-off-by: Stafford Horne --- Documentation/devicetree/bindings/vendor-prefixes.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index 1afd298eddd7..b1eeca851d6f 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -246,6 +246,7 @@ onion Onion Corporation onnn ON Semiconductor Corp. ontat On Tat Industrial Company opencores OpenCores.org +openrisc OpenRISC.io option Option NV ORCL Oracle Corporation ortustech Ortus Technology Co., Ltd. From 9b54470afd836278a7e6f0f08194e2e2dca4b6eb Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Mon, 30 Oct 2017 21:38:35 +0900 Subject: [PATCH 0898/1085] irqchip: add initial support for ompic IPI driver for the Open Multi-Processor Interrupt Controller (ompic) as described in the Multi-core support section of the OpenRISC 1.2 architecture specification: https://github.com/openrisc/doc/raw/master/openrisc-arch-1.2-rev0.pdf Each OpenRISC core contains a full interrupt controller which is used in the SMP architecture for interrupt balancing. This IPI device, the ompic, is the only external device required for enabling SMP on OpenRISC. Pending ops are stored in a memory bit mask which can allow multiple pending operations to be set and serviced at a time. This is mostly borrowed from the alpha IPI implementation. Reviewed-by: Marc Zyngier Acked-by: Rob Herring Signed-off-by: Stefan Kristiansson [shorne@gmail.com: converted ops to bitmask, wrote commit message] Signed-off-by: Stafford Horne --- .../interrupt-controller/openrisc,ompic.txt | 22 ++ MAINTAINERS | 1 + arch/openrisc/Kconfig | 1 + drivers/irqchip/Kconfig | 3 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-ompic.c | 202 ++++++++++++++++++ 6 files changed, 230 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/openrisc,ompic.txt create mode 100644 drivers/irqchip/irq-ompic.c diff --git a/Documentation/devicetree/bindings/interrupt-controller/openrisc,ompic.txt b/Documentation/devicetree/bindings/interrupt-controller/openrisc,ompic.txt new file mode 100644 index 000000000000..caec07cc7149 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/openrisc,ompic.txt @@ -0,0 +1,22 @@ +Open Multi-Processor Interrupt Controller + +Required properties: + +- compatible : This should be "openrisc,ompic" +- reg : Specifies base physical address and size of the register space. The + size is based on the number of cores the controller has been configured + to handle, this should be set to 8 bytes per cpu core. +- interrupt-controller : Identifies the node as an interrupt controller. 
+- #interrupt-cells : This should be set to 0 as this will not be an irq + parent. +- interrupts : Specifies the interrupt line to which the ompic is wired. + +Example: + +ompic: interrupt-controller@98000000 { + compatible = "openrisc,ompic"; + reg = <0x98000000 16>; + interrupt-controller; + #interrupt-cells = <0>; + interrupts = <1>; +}; diff --git a/MAINTAINERS b/MAINTAINERS index bcb7e2525f54..bb129d5f0d3a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10034,6 +10034,7 @@ S: Maintained F: Documentation/devicetree/bindings/openrisc/ F: Documentation/openrisc/ F: arch/openrisc/ +F: drivers/irqchip/irq-ompic.c F: drivers/irqchip/irq-or1k-* OPENVSWITCH diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index b49acda5e8f4..34eb4e90f56c 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -30,6 +30,7 @@ config OPENRISC select NO_BOOTMEM select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_RWLOCKS + select OMPIC if SMP config CPU_BIG_ENDIAN def_bool y diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 9d8a1dd2e2c2..a2ca82f6c2dd 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -151,6 +151,9 @@ config CLPS711X_IRQCHIP select SPARSE_IRQ default y +config OMPIC + bool + config OR1K_PIC bool select IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 845abc107ad5..771f8e7f46f8 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_DW_APB_ICTL) += irq-dw-apb-ictl.o obj-$(CONFIG_METAG) += irq-metag-ext.o obj-$(CONFIG_METAG_PERFCOUNTER_IRQS) += irq-metag.o obj-$(CONFIG_CLPS711X_IRQCHIP) += irq-clps711x.o +obj-$(CONFIG_OMPIC) += irq-ompic.o obj-$(CONFIG_OR1K_PIC) += irq-or1k-pic.o obj-$(CONFIG_ORION_IRQCHIP) += irq-orion.o obj-$(CONFIG_OMAP_IRQCHIP) += irq-omap-intc.o diff --git a/drivers/irqchip/irq-ompic.c b/drivers/irqchip/irq-ompic.c new file mode 100644 index 000000000000..cf6d0c455518 --- /dev/null +++ b/drivers/irqchip/irq-ompic.c @@ -0,0 +1,202 @@ +/* + * Open Multi-Processor Interrupt Controller driver + * + * Copyright (C) 2014 Stefan Kristiansson + * Copyright (C) 2017 Stafford Horne + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + * + * The ompic device handles IPI communication between cores in multi-core + * OpenRISC systems. + * + * Registers + * + * For each CPU the ompic has 2 registers. The control register for sending + * and acking IPIs and the status register for receiving IPIs. The register + * layouts are as follows: + * + * Control register + * +---------+---------+----------+---------+ + * | 31 | 30 | 29 .. 16 | 15 .. 0 | + * ----------+---------+----------+---------- + * | IRQ ACK | IRQ GEN | DST CORE | DATA | + * +---------+---------+----------+---------+ + * + * Status register + * +----------+-------------+----------+---------+ + * | 31 | 30 | 29 .. 16 | 15 .. 0 | + * -----------+-------------+----------+---------+ + * | Reserved | IRQ Pending | SRC CORE | DATA | + * +----------+-------------+----------+---------+ + * + * Architecture + * + * - The ompic generates a level interrupt to the CPU PIC when a message is + * ready. Messages are delivered via the memory bus. + * - The ompic does not have any interrupt input lines. + * - The ompic is wired to the same irq line on each core. + * - Devices are wired to the same irq line on each core. 
+ * + * +---------+ +---------+ + * | CPU | | CPU | + * | Core 0 |<==\ (memory access) /==>| Core 1 | + * | [ PIC ]| | | | [ PIC ]| + * +----^-^--+ | | +----^-^--+ + * | | v v | | + * <====|=|=================================|=|==> (memory bus) + * | | ^ ^ | | + * (ipi | +------|---------+--------|-------|-+ (device irq) + * irq | | | | | + * core0)| +------|---------|--------|-------+ (ipi irq core1) + * | | | | | + * +----o-o-+ | +--------+ | + * | ompic |<===/ | Device |<===/ + * | IPI | +--------+ + * +--------+* + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define OMPIC_CPUBYTES 8 +#define OMPIC_CTRL(cpu) (0x0 + (cpu * OMPIC_CPUBYTES)) +#define OMPIC_STAT(cpu) (0x4 + (cpu * OMPIC_CPUBYTES)) + +#define OMPIC_CTRL_IRQ_ACK (1 << 31) +#define OMPIC_CTRL_IRQ_GEN (1 << 30) +#define OMPIC_CTRL_DST(cpu) (((cpu) & 0x3fff) << 16) + +#define OMPIC_STAT_IRQ_PENDING (1 << 30) + +#define OMPIC_DATA(x) ((x) & 0xffff) + +DEFINE_PER_CPU(unsigned long, ops); + +static void __iomem *ompic_base; + +static inline u32 ompic_readreg(void __iomem *base, loff_t offset) +{ + return ioread32be(base + offset); +} + +static void ompic_writereg(void __iomem *base, loff_t offset, u32 data) +{ + iowrite32be(data, base + offset); +} + +static void ompic_raise_softirq(const struct cpumask *mask, + unsigned int ipi_msg) +{ + unsigned int dst_cpu; + unsigned int src_cpu = smp_processor_id(); + + for_each_cpu(dst_cpu, mask) { + set_bit(ipi_msg, &per_cpu(ops, dst_cpu)); + + /* + * On OpenRISC the atomic set_bit() call implies a memory + * barrier. Otherwise we would need: smp_wmb(); paired + * with the read in ompic_ipi_handler. + */ + + ompic_writereg(ompic_base, OMPIC_CTRL(src_cpu), + OMPIC_CTRL_IRQ_GEN | + OMPIC_CTRL_DST(dst_cpu) | + OMPIC_DATA(1)); + } +} + +static irqreturn_t ompic_ipi_handler(int irq, void *dev_id) +{ + unsigned int cpu = smp_processor_id(); + unsigned long *pending_ops = &per_cpu(ops, cpu); + unsigned long ops; + + ompic_writereg(ompic_base, OMPIC_CTRL(cpu), OMPIC_CTRL_IRQ_ACK); + while ((ops = xchg(pending_ops, 0)) != 0) { + + /* + * On OpenRISC the atomic xchg() call implies a memory + * barrier. Otherwise we may need an smp_rmb(); paired + * with the write in ompic_raise_softirq. 
+	 */
+
+		do {
+			unsigned long ipi_msg;
+
+			ipi_msg = __ffs(ops);
+			ops &= ~(1UL << ipi_msg);
+
+			handle_IPI(ipi_msg);
+		} while (ops);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int __init ompic_of_init(struct device_node *node,
+				struct device_node *parent)
+{
+	struct resource res;
+	int irq;
+	int ret;
+
+	/* Validate the DT */
+	if (ompic_base) {
+		pr_err("ompic: duplicate ompic's are not supported");
+		return -EEXIST;
+	}
+
+	if (of_address_to_resource(node, 0, &res)) {
+		pr_err("ompic: reg property requires an address and size");
+		return -EINVAL;
+	}
+
+	if (resource_size(&res) < (num_possible_cpus() * OMPIC_CPUBYTES)) {
+		pr_err("ompic: reg size, currently %d must be at least %d",
+			resource_size(&res),
+			(num_possible_cpus() * OMPIC_CPUBYTES));
+		return -EINVAL;
+	}
+
+	/* Setup the device */
+	ompic_base = ioremap(res.start, resource_size(&res));
+	if (!ompic_base) {
+		pr_err("ompic: unable to map registers");
+		return -ENOMEM;
+	}
+
+	irq = irq_of_parse_and_map(node, 0);
+	if (irq <= 0) {
+		pr_err("ompic: unable to parse device irq");
+		ret = -EINVAL;
+		goto out_unmap;
+	}
+
+	ret = request_irq(irq, ompic_ipi_handler, IRQF_PERCPU,
+			  "ompic_ipi", NULL);
+	if (ret)
+		goto out_irq_disp;
+
+	set_smp_cross_call(ompic_raise_softirq);
+
+	return 0;
+
+out_irq_disp:
+	irq_dispose_mapping(irq);
+out_unmap:
+	iounmap(ompic_base);
+	ompic_base = NULL;
+	return ret;
+}
+IRQCHIP_DECLARE(ompic, "openrisc,ompic", ompic_of_init);

From 8e6d08e0a15e7d4d4b608b56597350d4cdd77710 Mon Sep 17 00:00:00 2001
From: Stefan Kristiansson
Date: Sun, 11 May 2014 21:49:34 +0300
Subject: [PATCH 0899/1085] openrisc: initial SMP support

This patch introduces SMP support for the OpenRISC architecture. The
SMP architecture requires cores with the multi-core features that were
introduced a few years back, including:

 - New SPRs: SPR_COREID, SPR_NUMCORES
 - Shadow SPRs
 - Atomic Instructions
 - Cache Coherency
 - A wired-in IPI controller

This patch adds all of the SMP specific changes to core infrastructure;
it looks big, but it needs to go in all together as it's hard to split
this one up.

Boot loader spinning of secondary cpus is not supported yet; it's
assumed that Linux is booted straight after cpu reset.

The bulk of these changes are trivial refactorings to use per cpu data
structures throughout. The substantive additions are smp.c and the
changes in time.c.

Some specific notes:

MM changes
----------

The reason current_pgd is created as an array, and not with
DEFINE_PER_CPU, is that doing it this way saves a load in the tlb-miss
handler (the load from __per_cpu_offset).

TLB Flush
---------

The SMP implementation of flush_tlb_* works by sending out a
function-call IPI to all the non-local cpus by using the generic
on_each_cpu() function.

Currently, all flush_tlb_* functions will result in a flush_tlb_all(),
which has always been the behaviour in the UP case.

CPU INFO
--------

This creates a per cpu cpuinfo struct and fills it out accordingly for
each activated cpu. show_cpuinfo is also updated to reflect the new
version information in later versions of the spec.

SMP API
-------

This imitates the arm64 implementation by having a smp_cross_call
callback that can be set by set_smp_cross_call to initiate an IPI and a
handle_IPI function that is expected to be called from an IPI irqchip
driver.
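As an illustration only (this is not code from the patch; the my_ipi_*
names are invented for the example, and the real consumer is the ompic
driver above), an IPI irqchip driver is expected to plug into this API
roughly like:

	#include <linux/cpumask.h>
	#include <linux/interrupt.h>
	#include <asm/smp.h>

	/* called by the generic SMP code whenever an IPI is raised */
	static void my_ipi_raise(const struct cpumask *mask, unsigned int msg)
	{
		/* latch 'msg' in the device for every cpu in 'mask' */
	}

	/* device interrupt handler, runs on the receiving cpu */
	static irqreturn_t my_ipi_isr(int irq, void *dev_id)
	{
		unsigned int msg = 0;	/* read pending message from device */

		handle_IPI(msg);	/* dispatch into the generic SMP code */
		return IRQ_HANDLED;
	}

	static int __init my_ipi_init(void)
	{
		/* once request_irq() on the device has succeeded: */
		set_smp_cross_call(my_ipi_raise);
		return 0;
	}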
Signed-off-by: Stefan Kristiansson [shorne@gmail.com: added cpu stop, checkpatch fixes, wrote commit message] Signed-off-by: Stafford Horne --- arch/openrisc/Kconfig | 17 +- arch/openrisc/include/asm/cpuinfo.h | 7 +- arch/openrisc/include/asm/mmu_context.h | 2 +- arch/openrisc/include/asm/pgtable.h | 2 +- arch/openrisc/include/asm/serial.h | 2 +- arch/openrisc/include/asm/smp.h | 26 +++ arch/openrisc/include/asm/spr_defs.h | 14 ++ arch/openrisc/include/asm/time.h | 15 ++ arch/openrisc/include/asm/tlbflush.h | 25 ++- arch/openrisc/kernel/Makefile | 1 + arch/openrisc/kernel/dma.c | 14 +- arch/openrisc/kernel/head.S | 97 +++++++++- arch/openrisc/kernel/setup.c | 163 ++++++++++------ arch/openrisc/kernel/smp.c | 235 ++++++++++++++++++++++++ arch/openrisc/kernel/time.c | 51 +++-- arch/openrisc/lib/delay.c | 2 +- arch/openrisc/mm/fault.c | 4 +- arch/openrisc/mm/init.c | 2 +- arch/openrisc/mm/tlb.c | 16 +- 19 files changed, 583 insertions(+), 112 deletions(-) create mode 100644 arch/openrisc/include/asm/smp.h create mode 100644 arch/openrisc/include/asm/time.h create mode 100644 arch/openrisc/kernel/smp.c diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 34eb4e90f56c..2b3898ede888 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -21,8 +21,10 @@ config OPENRISC select HAVE_UID16 select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS + select GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select GENERIC_SMP_IDLE_THREAD select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW select OR1K_PIC @@ -107,8 +109,19 @@ config OPENRISC_HAVE_INST_DIV endmenu config NR_CPUS - int - default "1" + int "Maximum number of CPUs (2-32)" + range 2 32 + depends on SMP + default "2" + +config SMP + bool "Symmetric Multi-Processing support" + help + This enables support for systems with more than one CPU. If you have + a system with only one CPU, say N. If you have a system with more + than one CPU, say Y. + + If you don't know what to do here, say N. 
source kernel/Kconfig.hz source kernel/Kconfig.preempt diff --git a/arch/openrisc/include/asm/cpuinfo.h b/arch/openrisc/include/asm/cpuinfo.h index ec10679d6429..4ea0a33eba6c 100644 --- a/arch/openrisc/include/asm/cpuinfo.h +++ b/arch/openrisc/include/asm/cpuinfo.h @@ -19,7 +19,7 @@ #ifndef __ASM_OPENRISC_CPUINFO_H #define __ASM_OPENRISC_CPUINFO_H -struct cpuinfo { +struct cpuinfo_or1k { u32 clock_frequency; u32 icache_size; @@ -29,8 +29,11 @@ struct cpuinfo { u32 dcache_size; u32 dcache_block_size; u32 dcache_ways; + + u16 coreid; }; -extern struct cpuinfo cpuinfo; +extern struct cpuinfo_or1k cpuinfo_or1k[NR_CPUS]; +extern void setup_cpuinfo(void); #endif /* __ASM_OPENRISC_CPUINFO_H */ diff --git a/arch/openrisc/include/asm/mmu_context.h b/arch/openrisc/include/asm/mmu_context.h index e94b814d2e3c..c380d8caf84f 100644 --- a/arch/openrisc/include/asm/mmu_context.h +++ b/arch/openrisc/include/asm/mmu_context.h @@ -34,7 +34,7 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, * registers like cr3 on the i386 */ -extern volatile pgd_t *current_pgd; /* defined in arch/openrisc/mm/fault.c */ +extern volatile pgd_t *current_pgd[]; /* defined in arch/openrisc/mm/fault.c */ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 71a6f08de8f2..eff5ba2a5af2 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -94,7 +94,7 @@ extern void paging_init(void); * 64 MB of vmalloc area is comparable to what's available on other arches. */ -#define VMALLOC_START (PAGE_OFFSET-0x04000000) +#define VMALLOC_START (PAGE_OFFSET-0x04000000UL) #define VMALLOC_END (PAGE_OFFSET) #define VMALLOC_VMADDR(x) ((unsigned long)(x)) diff --git a/arch/openrisc/include/asm/serial.h b/arch/openrisc/include/asm/serial.h index 270a45241639..cb5932f5447a 100644 --- a/arch/openrisc/include/asm/serial.h +++ b/arch/openrisc/include/asm/serial.h @@ -29,7 +29,7 @@ * it needs to be correct to get the early console working. */ -#define BASE_BAUD (cpuinfo.clock_frequency/16) +#define BASE_BAUD (cpuinfo_or1k[smp_processor_id()].clock_frequency/16) #endif /* __KERNEL__ */ diff --git a/arch/openrisc/include/asm/smp.h b/arch/openrisc/include/asm/smp.h new file mode 100644 index 000000000000..e21d2f12b5b6 --- /dev/null +++ b/arch/openrisc/include/asm/smp.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2014 Stefan Kristiansson + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. 
+ */ + +#ifndef __ASM_OPENRISC_SMP_H +#define __ASM_OPENRISC_SMP_H + +#include +#include + +#define raw_smp_processor_id() (current_thread_info()->cpu) +#define hard_smp_processor_id() mfspr(SPR_COREID) + +extern void smp_init_cpus(void); + +extern void arch_send_call_function_single_ipi(int cpu); +extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); + +extern void set_smp_cross_call(void (*)(const struct cpumask *, unsigned int)); +extern void handle_IPI(unsigned int ipi_msg); + +#endif /* __ASM_OPENRISC_SMP_H */ diff --git a/arch/openrisc/include/asm/spr_defs.h b/arch/openrisc/include/asm/spr_defs.h index 367dac70326a..154b5a1ee579 100644 --- a/arch/openrisc/include/asm/spr_defs.h +++ b/arch/openrisc/include/asm/spr_defs.h @@ -51,6 +51,11 @@ #define SPR_ICCFGR (SPRGROUP_SYS + 6) #define SPR_DCFGR (SPRGROUP_SYS + 7) #define SPR_PCCFGR (SPRGROUP_SYS + 8) +#define SPR_VR2 (SPRGROUP_SYS + 9) +#define SPR_AVR (SPRGROUP_SYS + 10) +#define SPR_EVBAR (SPRGROUP_SYS + 11) +#define SPR_AECR (SPRGROUP_SYS + 12) +#define SPR_AESR (SPRGROUP_SYS + 13) #define SPR_NPC (SPRGROUP_SYS + 16) /* CZ 21/06/01 */ #define SPR_SR (SPRGROUP_SYS + 17) /* CZ 21/06/01 */ #define SPR_PPC (SPRGROUP_SYS + 18) /* CZ 21/06/01 */ @@ -61,6 +66,8 @@ #define SPR_EEAR_LAST (SPRGROUP_SYS + 63) #define SPR_ESR_BASE (SPRGROUP_SYS + 64) #define SPR_ESR_LAST (SPRGROUP_SYS + 79) +#define SPR_COREID (SPRGROUP_SYS + 128) +#define SPR_NUMCORES (SPRGROUP_SYS + 129) #define SPR_GPR_BASE (SPRGROUP_SYS + 1024) /* Data MMU group */ @@ -135,11 +142,18 @@ #define SPR_VR_CFG 0x00ff0000 /* Processor configuration */ #define SPR_VR_RES 0x0000ffc0 /* Reserved */ #define SPR_VR_REV 0x0000003f /* Processor revision */ +#define SPR_VR_UVRP 0x00000040 /* Updated Version Registers Present */ #define SPR_VR_VER_OFF 24 #define SPR_VR_CFG_OFF 16 #define SPR_VR_REV_OFF 0 +/* + * Bit definitions for the Version Register 2 + */ +#define SPR_VR2_CPUID 0xff000000 /* Processor ID */ +#define SPR_VR2_VER 0x00ffffff /* Processor version */ + /* * Bit definitions for the Unit Present Register * diff --git a/arch/openrisc/include/asm/time.h b/arch/openrisc/include/asm/time.h new file mode 100644 index 000000000000..fe83a34a7d68 --- /dev/null +++ b/arch/openrisc/include/asm/time.h @@ -0,0 +1,15 @@ +/* + * OpenRISC timer API + * + * Copyright (C) 2017 by Stafford Horne (shorne@gmail.com) + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. 
+ */ +#ifndef __ASM_OR1K_TIME_H +#define __ASM_OR1K_TIME_H + +extern void openrisc_clockevent_init(void); + +#endif /* __ASM_OR1K_TIME_H */ diff --git a/arch/openrisc/include/asm/tlbflush.h b/arch/openrisc/include/asm/tlbflush.h index 6a2accd6cb67..94227f0eaf6d 100644 --- a/arch/openrisc/include/asm/tlbflush.h +++ b/arch/openrisc/include/asm/tlbflush.h @@ -33,13 +33,26 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(mm, start, end) flushes a range of pages */ +extern void local_flush_tlb_all(void); +extern void local_flush_tlb_mm(struct mm_struct *mm); +extern void local_flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr); +extern void local_flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end); -void flush_tlb_all(void); -void flush_tlb_mm(struct mm_struct *mm); -void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr); -void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, - unsigned long end); +#ifndef CONFIG_SMP +#define flush_tlb_all local_flush_tlb_all +#define flush_tlb_mm local_flush_tlb_mm +#define flush_tlb_page local_flush_tlb_page +#define flush_tlb_range local_flush_tlb_range +#else +extern void flush_tlb_all(void); +extern void flush_tlb_mm(struct mm_struct *mm); +extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr); +extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end); +#endif static inline void flush_tlb(void) { diff --git a/arch/openrisc/kernel/Makefile b/arch/openrisc/kernel/Makefile index ec6d9d37cefd..7d94643c878d 100644 --- a/arch/openrisc/kernel/Makefile +++ b/arch/openrisc/kernel/Makefile @@ -8,6 +8,7 @@ obj-y := setup.o or32_ksyms.o process.o dma.o \ traps.o time.o irq.o entry.o ptrace.o signal.o \ sys_call_table.o +obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_OF) += prom.o diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index b10369b7e31b..a945f00011b4 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -32,6 +32,7 @@ page_set_nocache(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { unsigned long cl; + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; pte_val(*pte) |= _PAGE_CI; @@ -42,7 +43,7 @@ page_set_nocache(pte_t *pte, unsigned long addr, flush_tlb_page(NULL, addr); /* Flush page out of dcache */ - for (cl = __pa(addr); cl < __pa(next); cl += cpuinfo.dcache_block_size) + for (cl = __pa(addr); cl < __pa(next); cl += cpuinfo->dcache_block_size) mtspr(SPR_DCBFR, cl); return 0; @@ -140,6 +141,7 @@ or1k_map_page(struct device *dev, struct page *page, { unsigned long cl; dma_addr_t addr = page_to_phys(page) + offset; + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; if (attrs & DMA_ATTR_SKIP_CPU_SYNC) return addr; @@ -148,13 +150,13 @@ or1k_map_page(struct device *dev, struct page *page, case DMA_TO_DEVICE: /* Flush the dcache for the requested range */ for (cl = addr; cl < addr + size; - cl += cpuinfo.dcache_block_size) + cl += cpuinfo->dcache_block_size) mtspr(SPR_DCBFR, cl); break; case DMA_FROM_DEVICE: /* Invalidate the dcache for the requested range */ for (cl = addr; cl < addr + size; - cl += cpuinfo.dcache_block_size) + cl += cpuinfo->dcache_block_size) mtspr(SPR_DCBIR, cl); break; default: @@ -213,9 +215,10 @@ or1k_sync_single_for_cpu(struct device *dev, { unsigned long cl; dma_addr_t addr = dma_handle; + struct cpuinfo_or1k *cpuinfo = 
&cpuinfo_or1k[smp_processor_id()]; /* Invalidate the dcache for the requested range */ - for (cl = addr; cl < addr + size; cl += cpuinfo.dcache_block_size) + for (cl = addr; cl < addr + size; cl += cpuinfo->dcache_block_size) mtspr(SPR_DCBIR, cl); } @@ -226,9 +229,10 @@ or1k_sync_single_for_device(struct device *dev, { unsigned long cl; dma_addr_t addr = dma_handle; + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; /* Flush the dcache for the requested range */ - for (cl = addr; cl < addr + size; cl += cpuinfo.dcache_block_size) + for (cl = addr; cl < addr + size; cl += cpuinfo->dcache_block_size) mtspr(SPR_DCBFR, cl); } diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index 1e49895408f4..a9972dc103f8 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -158,12 +158,38 @@ /* =========================================================[ macros ]=== */ - +#ifdef CONFIG_SMP +#define GET_CURRENT_PGD(reg,t1) \ + LOAD_SYMBOL_2_GPR(reg,current_pgd) ;\ + l.mfspr t1,r0,SPR_COREID ;\ + l.slli t1,t1,2 ;\ + l.add reg,reg,t1 ;\ + tophys (t1,reg) ;\ + l.lwz reg,0(t1) +#else #define GET_CURRENT_PGD(reg,t1) \ LOAD_SYMBOL_2_GPR(reg,current_pgd) ;\ tophys (t1,reg) ;\ l.lwz reg,0(t1) +#endif +/* Load r10 from current_thread_info_set - clobbers r1 and r30 */ +#ifdef CONFIG_SMP +#define GET_CURRENT_THREAD_INFO \ + LOAD_SYMBOL_2_GPR(r1,current_thread_info_set) ;\ + tophys (r30,r1) ;\ + l.mfspr r10,r0,SPR_COREID ;\ + l.slli r10,r10,2 ;\ + l.add r30,r30,r10 ;\ + /* r10: current_thread_info */ ;\ + l.lwz r10,0(r30) +#else +#define GET_CURRENT_THREAD_INFO \ + LOAD_SYMBOL_2_GPR(r1,current_thread_info_set) ;\ + tophys (r30,r1) ;\ + /* r10: current_thread_info */ ;\ + l.lwz r10,0(r30) +#endif /* * DSCR: this is a common hook for handling exceptions. it will save @@ -206,10 +232,7 @@ l.bnf 2f /* kernel_mode */ ;\ EXCEPTION_T_STORE_SP /* delay slot */ ;\ 1: /* user_mode: */ ;\ - LOAD_SYMBOL_2_GPR(r1,current_thread_info_set) ;\ - tophys (r30,r1) ;\ - /* r10: current_thread_info */ ;\ - l.lwz r10,0(r30) ;\ + GET_CURRENT_THREAD_INFO ;\ tophys (r30,r10) ;\ l.lwz r1,(TI_KSP)(r30) ;\ /* fall through */ ;\ @@ -530,6 +553,12 @@ _start: CLEAR_GPR(r30) CLEAR_GPR(r31) +#ifdef CONFIG_SMP + l.mfspr r26,r0,SPR_COREID + l.sfeq r26,r0 + l.bnf secondary_wait + l.nop +#endif /* * set up initial ksp and current */ @@ -681,6 +710,64 @@ _flush_tlb: l.jr r9 l.nop +#ifdef CONFIG_SMP +secondary_wait: + l.mfspr r25,r0,SPR_COREID + l.movhi r3,hi(secondary_release) + l.ori r3,r3,lo(secondary_release) + tophys(r4, r3) + l.lwz r3,0(r4) + l.sfeq r25,r3 + l.bnf secondary_wait + l.nop + /* fall through to secondary_init */ + +secondary_init: + /* + * set up initial ksp and current + */ + LOAD_SYMBOL_2_GPR(r10, secondary_thread_info) + tophys (r30,r10) + l.lwz r10,0(r30) + l.addi r1,r10,THREAD_SIZE + tophys (r30,r10) + l.sw TI_KSP(r30),r1 + + l.jal _ic_enable + l.nop + + l.jal _dc_enable + l.nop + + l.jal _flush_tlb + l.nop + + /* + * enable dmmu & immu + */ + l.mfspr r30,r0,SPR_SR + l.movhi r28,hi(SPR_SR_DME | SPR_SR_IME) + l.ori r28,r28,lo(SPR_SR_DME | SPR_SR_IME) + l.or r30,r30,r28 + /* + * This is a bit tricky, we need to switch over from physical addresses + * to virtual addresses on the fly. + * To do that, we first set up ESR with the IME and DME bits set. + * Then EPCR is set to secondary_start and then a l.rfe is issued to + * "jump" to that. 
+ */ + l.mtspr r0,r30,SPR_ESR_BASE + LOAD_SYMBOL_2_GPR(r30, secondary_start) + l.mtspr r0,r30,SPR_EPCR_BASE + l.rfe + +secondary_start: + LOAD_SYMBOL_2_GPR(r30, secondary_start_kernel) + l.jr r30 + l.nop + +#endif + /* ========================================[ cache ]=== */ /* alignment here so we don't change memory offsets with diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index dbf5ee95a0d5..9d28ab14d139 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -93,7 +93,7 @@ static void __init setup_memory(void) memblock_dump_all(); } -struct cpuinfo cpuinfo; +struct cpuinfo_or1k cpuinfo_or1k[NR_CPUS]; static void print_cpuinfo(void) { @@ -101,12 +101,13 @@ static void print_cpuinfo(void) unsigned long vr = mfspr(SPR_VR); unsigned int version; unsigned int revision; + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; version = (vr & SPR_VR_VER) >> 24; revision = (vr & SPR_VR_REV); printk(KERN_INFO "CPU: OpenRISC-%x (revision %d) @%d MHz\n", - version, revision, cpuinfo.clock_frequency / 1000000); + version, revision, cpuinfo->clock_frequency / 1000000); if (!(upr & SPR_UPR_UP)) { printk(KERN_INFO @@ -117,15 +118,15 @@ static void print_cpuinfo(void) if (upr & SPR_UPR_DCP) printk(KERN_INFO "-- dcache: %4d bytes total, %2d bytes/line, %d way(s)\n", - cpuinfo.dcache_size, cpuinfo.dcache_block_size, - cpuinfo.dcache_ways); + cpuinfo->dcache_size, cpuinfo->dcache_block_size, + cpuinfo->dcache_ways); else printk(KERN_INFO "-- dcache disabled\n"); if (upr & SPR_UPR_ICP) printk(KERN_INFO "-- icache: %4d bytes total, %2d bytes/line, %d way(s)\n", - cpuinfo.icache_size, cpuinfo.icache_block_size, - cpuinfo.icache_ways); + cpuinfo->icache_size, cpuinfo->icache_block_size, + cpuinfo->icache_ways); else printk(KERN_INFO "-- icache disabled\n"); @@ -153,38 +154,58 @@ static void print_cpuinfo(void) printk(KERN_INFO "-- custom unit(s)\n"); } +static struct device_node *setup_find_cpu_node(int cpu) +{ + u32 hwid; + struct device_node *cpun; + struct device_node *cpus = of_find_node_by_path("/cpus"); + + for_each_available_child_of_node(cpus, cpun) { + if (of_property_read_u32(cpun, "reg", &hwid)) + continue; + if (hwid == cpu) + return cpun; + } + + return NULL; +} + void __init setup_cpuinfo(void) { struct device_node *cpu; unsigned long iccfgr, dccfgr; unsigned long cache_set_size; + int cpu_id = smp_processor_id(); + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[cpu_id]; - cpu = of_find_compatible_node(NULL, NULL, "opencores,or1200-rtlsvn481"); + cpu = setup_find_cpu_node(cpu_id); if (!cpu) - panic("No compatible CPU found in device tree...\n"); + panic("Couldn't find CPU%d in device tree...\n", cpu_id); iccfgr = mfspr(SPR_ICCFGR); - cpuinfo.icache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW); + cpuinfo->icache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW); cache_set_size = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3); - cpuinfo.icache_block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7); - cpuinfo.icache_size = - cache_set_size * cpuinfo.icache_ways * cpuinfo.icache_block_size; + cpuinfo->icache_block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7); + cpuinfo->icache_size = + cache_set_size * cpuinfo->icache_ways * cpuinfo->icache_block_size; dccfgr = mfspr(SPR_DCCFGR); - cpuinfo.dcache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW); + cpuinfo->dcache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW); cache_set_size = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3); - cpuinfo.dcache_block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7); - cpuinfo.dcache_size = - cache_set_size * 
cpuinfo.dcache_ways * cpuinfo.dcache_block_size;
+	cpuinfo->dcache_block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7);
+	cpuinfo->dcache_size =
+	    cache_set_size * cpuinfo->dcache_ways * cpuinfo->dcache_block_size;

 	if (of_property_read_u32(cpu, "clock-frequency",
-				 &cpuinfo.clock_frequency)) {
+				 &cpuinfo->clock_frequency)) {
 		printk(KERN_WARNING
 		       "Device tree missing CPU 'clock-frequency' parameter."
 		       "Assuming frequency 25MHZ"
 		       "This is probably not what you want.");
 	}

+	cpuinfo->coreid = mfspr(SPR_COREID);
+
 	of_node_put(cpu);

 	print_cpuinfo();
@@ -251,8 +272,8 @@ void __init detect_unit_config(unsigned long upr, unsigned long mask,
 void calibrate_delay(void)
 {
 	const int *val;
-	struct device_node *cpu = NULL;
-	cpu = of_find_compatible_node(NULL, NULL, "opencores,or1200-rtlsvn481");
+	struct device_node *cpu = setup_find_cpu_node(smp_processor_id());
+
 	val = of_get_property(cpu, "clock-frequency", NULL);
 	if (!val)
 		panic("no cpu 'clock-frequency' parameter in device tree");
@@ -268,6 +289,10 @@ void __init setup_arch(char **cmdline_p)

 	setup_cpuinfo();

+#ifdef CONFIG_SMP
+	smp_init_cpus();
+#endif
+
 	/* process 1's initial memory region is the kernel code/data */
 	init_mm.start_code = (unsigned long)_stext;
 	init_mm.end_code = (unsigned long)_etext;
@@ -302,54 +327,78 @@ void __init setup_arch(char **cmdline_p)

 static int show_cpuinfo(struct seq_file *m, void *v)
 {
-	unsigned long vr;
-	int version, revision;
+	unsigned int vr, cpucfgr;
+	unsigned int avr;
+	unsigned int version;
+	struct cpuinfo_or1k *cpuinfo = v;

 	vr = mfspr(SPR_VR);
-	version = (vr & SPR_VR_VER) >> 24;
-	revision = vr & SPR_VR_REV;
+	cpucfgr = mfspr(SPR_CPUCFGR);
+
+#ifdef CONFIG_SMP
+	seq_printf(m, "processor\t\t: %d\n", cpuinfo->coreid);
+#endif
+	if (vr & SPR_VR_UVRP) {
+		vr = mfspr(SPR_VR2);
+		version = vr & SPR_VR2_VER;
+		avr = mfspr(SPR_AVR);
+		seq_printf(m, "cpu architecture\t: "
+			   "OpenRISC 1000 (%d.%d-rev%d)\n",
+			   (avr >> 24) & 0xff,
+			   (avr >> 16) & 0xff,
+			   (avr >> 8) & 0xff);
+		seq_printf(m, "cpu implementation id\t: 0x%x\n",
+			   (vr & SPR_VR2_CPUID) >> 24);
+		seq_printf(m, "cpu version\t\t: 0x%x\n", version);
+	} else {
+		version = (vr & SPR_VR_VER) >> 24;
+		seq_printf(m, "cpu\t\t\t: OpenRISC-%x\n", version);
+		seq_printf(m, "revision\t\t: %d\n", vr & SPR_VR_REV);
+	}
+	seq_printf(m, "frequency\t\t: %ld\n", loops_per_jiffy * HZ);
+	seq_printf(m, "dcache size\t\t: %d bytes\n", cpuinfo->dcache_size);
+	seq_printf(m, "dcache block size\t: %d bytes\n",
+		   cpuinfo->dcache_block_size);
+	seq_printf(m, "dcache ways\t\t: %d\n", cpuinfo->dcache_ways);
+	seq_printf(m, "icache size\t\t: %d bytes\n", cpuinfo->icache_size);
+	seq_printf(m, "icache block size\t: %d bytes\n",
+		   cpuinfo->icache_block_size);
+	seq_printf(m, "icache ways\t\t: %d\n", cpuinfo->icache_ways);
+	seq_printf(m, "immu\t\t\t: %d entries, %lu ways\n",
+		   1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2),
+		   1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW));
+	seq_printf(m, "dmmu\t\t\t: %d entries, %lu ways\n",
+		   1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
+		   1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW));
+	seq_printf(m, "bogomips\t\t: %lu.%02lu\n",
+		   (loops_per_jiffy * HZ) / 500000,
+		   ((loops_per_jiffy * HZ) / 5000) % 100);
+
+	seq_puts(m, "features\t\t: ");
+	seq_printf(m, "%s ", cpucfgr & SPR_CPUCFGR_OB32S ? "orbis32" : "");
+	seq_printf(m, "%s ", cpucfgr & SPR_CPUCFGR_OB64S ? "orbis64" : "");
+	seq_printf(m, "%s ", cpucfgr & SPR_CPUCFGR_OF32S ? "orfpx32" : "");
+	seq_printf(m, "%s ", cpucfgr & SPR_CPUCFGR_OF64S ?
"orfpx64" : ""); + seq_printf(m, "%s ", cpucfgr & SPR_CPUCFGR_OV64S ? "orvdx64" : ""); + seq_puts(m, "\n"); + + seq_puts(m, "\n"); - seq_printf(m, - "cpu\t\t: OpenRISC-%x\n" - "revision\t: %d\n" - "frequency\t: %ld\n" - "dcache size\t: %d bytes\n" - "dcache block size\t: %d bytes\n" - "dcache ways\t: %d\n" - "icache size\t: %d bytes\n" - "icache block size\t: %d bytes\n" - "icache ways\t: %d\n" - "immu\t\t: %d entries, %lu ways\n" - "dmmu\t\t: %d entries, %lu ways\n" - "bogomips\t: %lu.%02lu\n", - version, - revision, - loops_per_jiffy * HZ, - cpuinfo.dcache_size, - cpuinfo.dcache_block_size, - cpuinfo.dcache_ways, - cpuinfo.icache_size, - cpuinfo.icache_block_size, - cpuinfo.icache_ways, - 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2), - 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW), - 1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2), - 1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW), - (loops_per_jiffy * HZ) / 500000, - ((loops_per_jiffy * HZ) / 5000) % 100); return 0; } -static void *c_start(struct seq_file *m, loff_t * pos) +static void *c_start(struct seq_file *m, loff_t *pos) { - /* We only have one CPU... */ - return *pos < 1 ? (void *)1 : NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); + if ((*pos) < nr_cpu_ids) + return &cpuinfo_or1k[*pos]; + return NULL; } -static void *c_next(struct seq_file *m, void *v, loff_t * pos) +static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - ++*pos; - return NULL; + (*pos)++; + return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c new file mode 100644 index 000000000000..fd724123229a --- /dev/null +++ b/arch/openrisc/kernel/smp.c @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2014 Stefan Kristiansson + * Copyright (C) 2017 Stafford Horne + * + * Based on arm64 and arc implementations + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void (*smp_cross_call)(const struct cpumask *, unsigned int); + +unsigned long secondary_release = -1; +struct thread_info *secondary_thread_info; + +enum ipi_msg_type { + IPI_RESCHEDULE, + IPI_CALL_FUNC, + IPI_CALL_FUNC_SINGLE, +}; + +static DEFINE_SPINLOCK(boot_lock); + +static void boot_secondary(unsigned int cpu, struct task_struct *idle) +{ + /* + * set synchronisation state between this boot processor + * and the secondary one + */ + spin_lock(&boot_lock); + + secondary_release = cpu; + + /* + * now the secondary core is starting up let it run its + * calibrations, then wait for it to finish + */ + spin_unlock(&boot_lock); +} + +void __init smp_prepare_boot_cpu(void) +{ +} + +void __init smp_init_cpus(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + set_cpu_possible(i, true); +} + +void __init smp_prepare_cpus(unsigned int max_cpus) +{ + int i; + + /* + * Initialise the present map, which describes the set of CPUs + * actually populated at the present time. 
+ */ + for (i = 0; i < max_cpus; i++) + set_cpu_present(i, true); +} + +void __init smp_cpus_done(unsigned int max_cpus) +{ +} + +static DECLARE_COMPLETION(cpu_running); + +int __cpu_up(unsigned int cpu, struct task_struct *idle) +{ + if (smp_cross_call == NULL) { + pr_warn("CPU%u: failed to start, IPI controller missing", + cpu); + return -EIO; + } + + secondary_thread_info = task_thread_info(idle); + current_pgd[cpu] = init_mm.pgd; + + boot_secondary(cpu, idle); + if (!wait_for_completion_timeout(&cpu_running, + msecs_to_jiffies(1000))) { + pr_crit("CPU%u: failed to start\n", cpu); + return -EIO; + } + + return 0; +} + +asmlinkage __init void secondary_start_kernel(void) +{ + struct mm_struct *mm = &init_mm; + unsigned int cpu = smp_processor_id(); + /* + * All kernel threads share the same mm context; grab a + * reference and switch to it. + */ + atomic_inc(&mm->mm_count); + current->active_mm = mm; + cpumask_set_cpu(cpu, mm_cpumask(mm)); + + pr_info("CPU%u: Booted secondary processor\n", cpu); + + setup_cpuinfo(); + openrisc_clockevent_init(); + + notify_cpu_starting(cpu); + + /* + * OK, now it's safe to let the boot CPU continue + */ + set_cpu_online(cpu, true); + complete(&cpu_running); + + local_irq_enable(); + + /* + * OK, it's off to the idle thread for us + */ + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +void handle_IPI(unsigned int ipi_msg) +{ + unsigned int cpu = smp_processor_id(); + + switch (ipi_msg) { + case IPI_RESCHEDULE: + scheduler_ipi(); + break; + + case IPI_CALL_FUNC: + generic_smp_call_function_interrupt(); + break; + + case IPI_CALL_FUNC_SINGLE: + generic_smp_call_function_single_interrupt(); + break; + + default: + WARN(1, "CPU%u: Unknown IPI message 0x%x\n", cpu, ipi_msg); + break; + } +} + +void smp_send_reschedule(int cpu) +{ + smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE); +} + +static void stop_this_cpu(void *dummy) +{ + /* Remove this CPU */ + set_cpu_online(smp_processor_id(), false); + + local_irq_disable(); + /* CPU Doze */ + if (mfspr(SPR_UPR) & SPR_UPR_PMP) + mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME); + /* If that didn't work, infinite loop */ + while (1) + ; +} + +void smp_send_stop(void) +{ + smp_call_function(stop_this_cpu, NULL, 0); +} + +/* not supported, yet */ +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int)) +{ + smp_cross_call = fn; +} + +void arch_send_call_function_single_ipi(int cpu) +{ + smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); +} + +void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + smp_cross_call(mask, IPI_CALL_FUNC); +} + +/* TLB flush operations - Performed on each CPU*/ +static inline void ipi_flush_tlb_all(void *ignored) +{ + local_flush_tlb_all(); +} + +void flush_tlb_all(void) +{ + on_each_cpu(ipi_flush_tlb_all, NULL, 1); +} + +/* + * FIXME: implement proper functionality instead of flush_tlb_all. + * *But*, as things currently stands, the local_tlb_flush_* functions will + * all boil down to local_tlb_flush_all anyway. 
+ */ +void flush_tlb_mm(struct mm_struct *mm) +{ + on_each_cpu(ipi_flush_tlb_all, NULL, 1); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) +{ + on_each_cpu(ipi_flush_tlb_all, NULL, 1); +} + +void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + on_each_cpu(ipi_flush_tlb_all, NULL, 1); +} diff --git a/arch/openrisc/kernel/time.c b/arch/openrisc/kernel/time.c index 687c11d048d7..ab04eaedbf8d 100644 --- a/arch/openrisc/kernel/time.c +++ b/arch/openrisc/kernel/time.c @@ -53,13 +53,32 @@ static int openrisc_timer_set_next_event(unsigned long delta, * timers) we cannot enable the PERIODIC feature. The tick timer can run using * one-shot events, so no problem. */ +DEFINE_PER_CPU(struct clock_event_device, clockevent_openrisc_timer); -static struct clock_event_device clockevent_openrisc_timer = { - .name = "openrisc_timer_clockevent", - .features = CLOCK_EVT_FEAT_ONESHOT, - .rating = 300, - .set_next_event = openrisc_timer_set_next_event, -}; +void openrisc_clockevent_init(void) +{ + unsigned int cpu = smp_processor_id(); + struct clock_event_device *evt = + &per_cpu(clockevent_openrisc_timer, cpu); + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[cpu]; + + mtspr(SPR_TTMR, SPR_TTMR_CR); + +#ifdef CONFIG_SMP + evt->broadcast = tick_broadcast; +#endif + evt->name = "openrisc_timer_clockevent", + evt->features = CLOCK_EVT_FEAT_ONESHOT, + evt->rating = 300, + evt->set_next_event = openrisc_timer_set_next_event, + + evt->cpumask = cpumask_of(cpu); + + /* We only have 28 bits */ + clockevents_config_and_register(evt, cpuinfo->clock_frequency, + 100, 0x0fffffff); + +} static inline void timer_ack(void) { @@ -83,7 +102,9 @@ static inline void timer_ack(void) irqreturn_t __irq_entry timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - struct clock_event_device *evt = &clockevent_openrisc_timer; + unsigned int cpu = smp_processor_id(); + struct clock_event_device *evt = + &per_cpu(clockevent_openrisc_timer, cpu); timer_ack(); @@ -99,24 +120,12 @@ irqreturn_t __irq_entry timer_interrupt(struct pt_regs *regs) return IRQ_HANDLED; } -static __init void openrisc_clockevent_init(void) -{ - clockevent_openrisc_timer.cpumask = cpumask_of(0); - - /* We only have 28 bits */ - clockevents_config_and_register(&clockevent_openrisc_timer, - cpuinfo.clock_frequency, - 100, 0x0fffffff); - -} - /** * Clocksource: Based on OpenRISC timer/counter * * This sets up the OpenRISC Tick Timer as a clock source. The tick timer * is 32 bits wide and runs at the CPU clock frequency. 
 */
-
 static u64 openrisc_timer_read(struct clocksource *cs)
 {
 	return (u64) mfspr(SPR_TTCR);
 }
@@ -132,7 +141,9 @@ static struct clocksource openrisc_timer = {

 static int __init openrisc_timer_init(void)
 {
-	if (clocksource_register_hz(&openrisc_timer, cpuinfo.clock_frequency))
+	struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()];
+
+	if (clocksource_register_hz(&openrisc_timer, cpuinfo->clock_frequency))
 		panic("failed to register clocksource");

 	/* Enable the incrementer: 'continuous' mode with interrupt disabled */
diff --git a/arch/openrisc/lib/delay.c b/arch/openrisc/lib/delay.c
index 8b13fdf43ec6..a92bd621aa1f 100644
--- a/arch/openrisc/lib/delay.c
+++ b/arch/openrisc/lib/delay.c
@@ -25,7 +25,7 @@

 int read_current_timer(unsigned long *timer_value)
 {
-	*timer_value = mfspr(SPR_TTCR);
+	*timer_value = get_cycles();
 	return 0;
 }
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index e310ab499385..d0021dfae20a 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -33,7 +33,7 @@ unsigned long pte_errors;	/* updated by do_page_fault() */

 /* __PHX__ :: - check the vmalloc_fault in do_page_fault()
  *	      - also look into include/asm-or32/mmu_context.h
  */
-volatile pgd_t *current_pgd;
+volatile pgd_t *current_pgd[NR_CPUS];

 extern void die(char *, struct pt_regs *, long);
@@ -319,7 +319,7 @@ vmalloc_fault:
 	   phx_mmu("vmalloc_fault");
 	*/

-	pgd = (pgd_t *)current_pgd + offset;
+	pgd = (pgd_t *)current_pgd[smp_processor_id()] + offset;
 	pgd_k = init_mm.pgd + offset;

 	/* Since we're two-level, we don't need to do both
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index f67d82b9d22f..6972d5d6f23f 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -147,7 +147,7 @@ void __init paging_init(void)
 	 * (even if it is most probably not used until the next
 	 *  switch_mm)
 	 */
-	current_pgd = init_mm.pgd;
+	current_pgd[smp_processor_id()] = init_mm.pgd;

 	end = (unsigned long)__va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/openrisc/mm/tlb.c b/arch/openrisc/mm/tlb.c
index 683bd4d31c7c..6c253a2e86bc 100644
--- a/arch/openrisc/mm/tlb.c
+++ b/arch/openrisc/mm/tlb.c
@@ -49,7 +49,7 @@
  *
  */

-void flush_tlb_all(void)
+void local_flush_tlb_all(void)
 {
 	int i;
 	unsigned long num_tlb_sets;
@@ -86,7 +86,7 @@ void flush_tlb_all(void)
 #define flush_itlb_page_no_eir(addr) \
 	mtspr_off(SPR_ITLBMR_BASE(0), ITLB_OFFSET(addr), 0);

-void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (have_dtlbeir)
 		flush_dtlb_page_eir(addr);
@@ -99,8 +99,8 @@
 		flush_itlb_page_no_eir(addr);
 }

-void flush_tlb_range(struct vm_area_struct *vma,
-		     unsigned long start, unsigned long end)
+void local_flush_tlb_range(struct vm_area_struct *vma,
+			   unsigned long start, unsigned long end)
 {
 	int addr;
 	bool dtlbeir;
@@ -129,13 +129,13 @@ void flush_tlb_range(struct vm_area_struct *vma,
  * This should be changed to loop over mm and call flush_tlb_range.
  */

-void flush_tlb_mm(struct mm_struct *mm)
+void local_flush_tlb_mm(struct mm_struct *mm)
 {
 	/* Was seeing bugs with the mm struct passed to us. Scrapped most of
 	   this function. */
 	/* Several architectures do this */
-	flush_tlb_all();
+	local_flush_tlb_all();
 }

 /* called in schedule() just before actually doing the switch_to */
@@ -149,14 +149,14 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	 * might be invalid at points where we still need to dereference
 	 * the pgd.
	 */
-	current_pgd = next->pgd;
+	current_pgd[smp_processor_id()] = next->pgd;

 	/* We don't have context support implemented, so flush all
 	 * entries belonging to previous map
 	 */

 	if (prev != next)
-		flush_tlb_mm(prev);
+		local_flush_tlb_mm(prev);
 }

From b441aab7aa0e15955c432736b08a218a6a4c77f0 Mon Sep 17 00:00:00 2001
From: Stafford Horne
Date: Wed, 12 Jul 2017 17:20:38 +0900
Subject: [PATCH 0900/1085] openrisc: fix initial preempt state for secondary
 cpu tasks

During SMP testing we were getting the below warning after booting the
secondary cpu:

[    0.060000] BUG: scheduling while atomic: swapper/1/0/0x00000000

This change follows similar patterns from other architectures to start
the scheduler with preempt disabled.

Signed-off-by: Stafford Horne
---
 arch/openrisc/include/asm/thread_info.h | 2 +-
 arch/openrisc/kernel/smp.c              | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h
index 6e619a79a401..c229aa6bb502 100644
--- a/arch/openrisc/include/asm/thread_info.h
+++ b/arch/openrisc/include/asm/thread_info.h
@@ -74,7 +74,7 @@ struct thread_info {
 	.task		= &tsk,			\
 	.flags		= 0,			\
 	.cpu		= 0,			\
-	.preempt_count	= 1,			\
+	.preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
 	.ksp            = 0,                    \
 }

diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index fd724123229a..154c94a0cfbc 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -128,6 +128,7 @@ asmlinkage __init void secondary_start_kernel(void)

 	local_irq_enable();

+	preempt_disable();
 	/*
 	 * OK, it's off to the idle thread for us
 	 */

From c056718464512da06d7f65a27d5e4f1707b24c80 Mon Sep 17 00:00:00 2001
From: Stafford Horne
Date: Sat, 24 Jun 2017 07:09:59 +0900
Subject: [PATCH 0901/1085] openrisc: sleep instead of spin on secondary wait

Currently we do a spin on secondary cpus when waiting to boot. This
theoretically causes issues with power consumption and does cause
issues with qemu cycle burning (it starves cpu 0 from actually being
able to boot).

This change puts each secondary cpu to sleep if it has a power
management unit, then signals it to wake via IPI when it's time to
boot. If the cpus have no power management unit they will loop as
before.

Note: The wakeup IPI requires a special interrupt handler as on
secondary cpus the interrupt infrastructure is not yet established.
This interrupt handler is set and reset by updating SPR_EVBAR.
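As a rough C-level sketch only (the real implementation is the
hand-written assembly in head.S below; secondary_wait there is a label,
not a function, and this pseudocode elides the EVBAR handler setup):

	/* illustrative park/wake protocol for a secondary cpu */
	void secondary_wait(void)
	{
		unsigned int me = mfspr(SPR_COREID);

		while (secondary_release != me) {
			if (mfspr(SPR_UPR) & SPR_UPR_PMP) {
				/* point SPR_EVBAR at a minimal wakeup
				 * handler, enable and unmask interrupts,
				 * then doze until the IPI arrives */
				mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME);
				/* on wakeup, restore SPR_EVBAR to 0 */
			}
			/* without a PMU we simply spin, as before */
		}
	}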
Signed-off-by: Stafford Horne --- arch/openrisc/kernel/head.S | 51 +++++++++++++++++++++++++++++++++++-- arch/openrisc/kernel/smp.c | 5 ++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index a9972dc103f8..fb02b2a1d6f2 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -712,9 +712,45 @@ _flush_tlb: #ifdef CONFIG_SMP secondary_wait: + /* Doze the cpu until we are asked to run */ + /* If we dont have power management skip doze */ + l.mfspr r25,r0,SPR_UPR + l.andi r25,r25,SPR_UPR_PMP + l.sfeq r25,r0 + l.bf secondary_check_release + l.nop + + /* Setup special secondary exception handler */ + LOAD_SYMBOL_2_GPR(r3, _secondary_evbar) + tophys(r25,r3) + l.mtspr r0,r25,SPR_EVBAR + + /* Enable Interrupts */ + l.mfspr r25,r0,SPR_SR + l.ori r25,r25,SPR_SR_IEE + l.mtspr r0,r25,SPR_SR + + /* Unmask interrupts interrupts */ + l.mfspr r25,r0,SPR_PICMR + l.ori r25,r25,0xffff + l.mtspr r0,r25,SPR_PICMR + + /* Doze */ + l.mfspr r25,r0,SPR_PMR + LOAD_SYMBOL_2_GPR(r3, SPR_PMR_DME) + l.or r25,r25,r3 + l.mtspr r0,r25,SPR_PMR + + /* Wakeup - Restore exception handler */ + l.mtspr r0,r0,SPR_EVBAR + +secondary_check_release: + /* + * Check if we actually got the release signal, if not go-back to + * sleep. + */ l.mfspr r25,r0,SPR_COREID - l.movhi r3,hi(secondary_release) - l.ori r3,r3,lo(secondary_release) + LOAD_SYMBOL_2_GPR(r3, secondary_release) tophys(r4, r3) l.lwz r3,0(r4) l.sfeq r25,r3 @@ -1663,6 +1699,17 @@ ENTRY(_early_uart_init) l.jr r9 l.nop + .align 0x1000 + .global _secondary_evbar +_secondary_evbar: + + .space 0x800 + /* Just disable interrupts and Return */ + l.ori r3,r0,SPR_SR_SM + l.mtspr r0,r3,SPR_ESR_BASE + l.rfe + + .section .rodata _string_unhandled_exception: .string "\n\rRunarunaround: Unhandled exception 0x\0" diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 154c94a0cfbc..685b4934fa39 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -26,6 +26,7 @@ unsigned long secondary_release = -1; struct thread_info *secondary_thread_info; enum ipi_msg_type { + IPI_WAKEUP, IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CALL_FUNC_SINGLE, @@ -42,6 +43,7 @@ static void boot_secondary(unsigned int cpu, struct task_struct *idle) spin_lock(&boot_lock); secondary_release = cpu; + smp_cross_call(cpumask_of(cpu), IPI_WAKEUP); /* * now the secondary core is starting up let it run its @@ -140,6 +142,9 @@ void handle_IPI(unsigned int ipi_msg) unsigned int cpu = smp_processor_id(); switch (ipi_msg) { + case IPI_WAKEUP: + break; + case IPI_RESCHEDULE: scheduler_ipi(); break; From 4ee93d80ad73980826d582c7c37caa9597822001 Mon Sep 17 00:00:00 2001 From: Jan Henrik Weinstock Date: Wed, 4 Nov 2015 17:26:10 +0100 Subject: [PATCH 0902/1085] openrisc: add cacheflush support to fix icache aliasing On OpenRISC the icache does not snoop data stores. This can cause aliasing as reported by Jan. This patch fixes the issue to ensure icache is properly synchronized when code is written to memory. It supports both SMP and UP flushing. This supports dcache flush as well for architectures that do not support write-through caches; most OpenRISC implementations do implement write-through cache however. Dcache flushes are done only on a single core as OpenRISC dcaches all support snooping of bus stores. 
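For orientation (a condensed sketch of the protocol added below, not
literal code from a single place in the patch): writers mark a page
dirty, and the expensive sync happens lazily, only for executable
mappings, and only once per dirtying:

	/* kernel writes into a page-cache page: mark it not-clean */
	flush_dcache_page(page);	/* clears PG_dc_clean */

	/* later, in update_mmu_cache() -> update_cache(): */
	dirty = !test_and_set_bit(PG_dc_clean, &page->flags);
	if ((vma->vm_flags & VM_EXEC) && dirty)
		sync_icache_dcache(page);	/* dcache flush (local) plus
						 * icache invalidate (IPI
						 * broadcast on SMP) */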
Signed-off-by: Jan Henrik Weinstock [shorne@gmail.com: Squashed patches and wrote commit message] Signed-off-by: Stafford Horne --- arch/openrisc/Kconfig | 11 +++ arch/openrisc/include/asm/Kbuild | 1 - arch/openrisc/include/asm/cacheflush.h | 96 ++++++++++++++++++++++++++ arch/openrisc/include/asm/pgtable.h | 16 +++-- arch/openrisc/kernel/smp.c | 15 ++++ arch/openrisc/mm/Makefile | 2 +- arch/openrisc/mm/cache.c | 61 ++++++++++++++++ 7 files changed, 194 insertions(+), 8 deletions(-) create mode 100644 arch/openrisc/include/asm/cacheflush.h create mode 100644 arch/openrisc/mm/cache.c diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 2b3898ede888..bfff04ae7f7d 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -77,6 +77,17 @@ config OR1K_1200 endchoice +config DCACHE_WRITETHROUGH + bool "Have write through data caches" + default n + help + Select this if your implementation features write through data caches. + Selecting 'N' here will allow the kernel to force flushing of data + caches at relevant times. Most OpenRISC implementations support write- + through data caches. + + If unsure say N here + config OPENRISC_BUILTIN_DTB string "Builtin DTB" default "" diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 5f066780d870..6eb16719549e 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -1,7 +1,6 @@ generic-y += barrier.h generic-y += bug.h generic-y += bugs.h -generic-y += cacheflush.h generic-y += checksum.h generic-y += clkdev.h generic-y += current.h diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h new file mode 100644 index 000000000000..70f46fd7a074 --- /dev/null +++ b/arch/openrisc/include/asm/cacheflush.h @@ -0,0 +1,96 @@ +/* + * OpenRISC Linux + * + * Linux architectural port borrowing liberally from similar works of + * others. All original copyrights apply as per the original source + * declaration. + * + * OpenRISC implementation: + * Copyright (C) Jan Henrik Weinstock + * et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __ASM_CACHEFLUSH_H +#define __ASM_CACHEFLUSH_H + +#include + +/* + * Helper function for flushing or invalidating entire pages from data + * and instruction caches. SMP needs a little extra work, since we need + * to flush the pages on all cpus. + */ +extern void local_dcache_page_flush(struct page *page); +extern void local_icache_page_inv(struct page *page); + +/* + * Data cache flushing always happen on the local cpu. Instruction cache + * invalidations need to be broadcasted to all other cpu in the system in + * case of SMP configurations. + */ +#ifndef CONFIG_SMP +#define dcache_page_flush(page) local_dcache_page_flush(page) +#define icache_page_inv(page) local_icache_page_inv(page) +#else /* CONFIG_SMP */ +#define dcache_page_flush(page) local_dcache_page_flush(page) +#define icache_page_inv(page) smp_icache_page_inv(page) +extern void smp_icache_page_inv(struct page *page); +#endif /* CONFIG_SMP */ + +/* + * Synchronizes caches. Whenever a cpu writes executable code to memory, this + * should be called to make sure the processor sees the newly written code. 
+ */ +static inline void sync_icache_dcache(struct page *page) +{ + if (!IS_ENABLED(CONFIG_DCACHE_WRITETHROUGH)) + dcache_page_flush(page); + icache_page_inv(page); +} + +/* + * Pages with this bit set need not be flushed/invalidated, since + * they have not changed since last flush. New pages start with + * PG_arch_1 not set and are therefore dirty by default. + */ +#define PG_dc_clean PG_arch_1 + +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 +static inline void flush_dcache_page(struct page *page) +{ + clear_bit(PG_dc_clean, &page->flags); +} + +/* + * Other interfaces are not required since we do not have virtually + * indexed or tagged caches. So we can use the default here. + */ +#define flush_cache_all() do { } while (0) +#define flush_cache_mm(mm) do { } while (0) +#define flush_cache_dup_mm(mm) do { } while (0) +#define flush_cache_range(vma, start, end) do { } while (0) +#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) +#define flush_icache_range(start, end) do { } while (0) +#define flush_icache_page(vma, pg) do { } while (0) +#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) +#define flush_cache_vmap(start, end) do { } while (0) +#define flush_cache_vunmap(start, end) do { } while (0) + +#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ + do { \ + memcpy(dst, src, len); \ + if (vma->vm_flags & VM_EXEC) \ + sync_icache_dcache(page); \ + } while (0) + +#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ + memcpy(dst, src, len) + +#endif /* __ASM_CACHEFLUSH_H */ diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index eff5ba2a5af2..21c71303012f 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -416,15 +416,19 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* defined in head.S */ struct vm_area_struct; -/* - * or32 doesn't have any external MMU info: the kernel page - * tables contain all the necessary information. - * - * Actually I am not sure on what this could be used for. 
- */ +static inline void update_tlb(struct vm_area_struct *vma, + unsigned long address, pte_t *pte) +{ +} + +extern void update_cache(struct vm_area_struct *vma, + unsigned long address, pte_t *pte); + static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { + update_tlb(vma, address, pte); + update_cache(vma, address, pte); } /* __PHX__ FIXME, SWAP, this probably doesn't work */ diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 685b4934fa39..4763b8b9161e 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include static void (*smp_cross_call)(const struct cpumask *, unsigned int); @@ -239,3 +240,17 @@ void flush_tlb_range(struct vm_area_struct *vma, { on_each_cpu(ipi_flush_tlb_all, NULL, 1); } + +/* Instruction cache invalidate - performed on each cpu */ +static void ipi_icache_page_inv(void *arg) +{ + struct page *page = arg; + + local_icache_page_inv(page); +} + +void smp_icache_page_inv(struct page *page) +{ + on_each_cpu(ipi_icache_page_inv, page, 1); +} +EXPORT_SYMBOL(smp_icache_page_inv); diff --git a/arch/openrisc/mm/Makefile b/arch/openrisc/mm/Makefile index 324ba2634529..a31b2a42e966 100644 --- a/arch/openrisc/mm/Makefile +++ b/arch/openrisc/mm/Makefile @@ -2,4 +2,4 @@ # Makefile for the linux openrisc-specific parts of the memory manager. # -obj-y := fault.o tlb.o init.o ioremap.o +obj-y := fault.o cache.o tlb.o init.o ioremap.o diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c new file mode 100644 index 000000000000..b747bf1fc1b6 --- /dev/null +++ b/arch/openrisc/mm/cache.c @@ -0,0 +1,61 @@ +/* + * OpenRISC cache.c + * + * Linux architectural port borrowing liberally from similar works of + * others. All original copyrights apply as per the original source + * declaration. + * + * Modifications for the OpenRISC architecture: + * Copyright (C) 2015 Jan Henrik Weinstock + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +static void cache_loop(struct page *page, const unsigned int reg) +{ + unsigned long paddr = page_to_pfn(page) << PAGE_SHIFT; + unsigned long line = paddr & ~(L1_CACHE_BYTES - 1); + + while (line < paddr + PAGE_SIZE) { + mtspr(reg, line); + line += L1_CACHE_BYTES; + } +} + +void local_dcache_page_flush(struct page *page) +{ + cache_loop(page, SPR_DCBFR); +} +EXPORT_SYMBOL(local_dcache_page_flush); + +void local_icache_page_inv(struct page *page) +{ + cache_loop(page, SPR_ICBIR); +} +EXPORT_SYMBOL(local_icache_page_inv); + +void update_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *pte) +{ + unsigned long pfn = pte_val(*pte) >> PAGE_SHIFT; + struct page *page = pfn_to_page(pfn); + int dirty = !test_and_set_bit(PG_dc_clean, &page->flags); + + /* + * Since icaches do not snoop for updated data on OpenRISC, we + * must write back and invalidate any dirty pages manually. We + * can skip data pages, since they will not end up in icaches. 
+ */ + if ((vma->vm_flags & VM_EXEC) && dirty) + sync_icache_dcache(page); +} + From 306e5e50a32140452849fff42fca104511fd6668 Mon Sep 17 00:00:00 2001 From: Stefan Kristiansson Date: Sun, 18 May 2014 10:54:47 +0300 Subject: [PATCH 0903/1085] openrisc: add simple_smp dts and defconfig for simulators Simple enough to be compatible with simulation environments, such as verilated systems, QEMU and other targets supporting OpenRISC SMP. This also supports our base FPGA SoC's if the cpu frequency is upped to 50Mhz. Signed-off-by: Stefan Kristiansson [shorne@gmail.com: Added defconfig] Signed-off-by: Stafford Horne --- arch/openrisc/boot/dts/simple_smp.dts | 63 +++++++++++++++++++++ arch/openrisc/configs/simple_smp_defconfig | 66 ++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 arch/openrisc/boot/dts/simple_smp.dts create mode 100644 arch/openrisc/configs/simple_smp_defconfig diff --git a/arch/openrisc/boot/dts/simple_smp.dts b/arch/openrisc/boot/dts/simple_smp.dts new file mode 100644 index 000000000000..defbb92714ec --- /dev/null +++ b/arch/openrisc/boot/dts/simple_smp.dts @@ -0,0 +1,63 @@ +/dts-v1/; +/ { + compatible = "opencores,or1ksim"; + #address-cells = <1>; + #size-cells = <1>; + interrupt-parent = <&pic>; + + aliases { + uart0 = &serial0; + }; + + chosen { + bootargs = "earlycon"; + stdout-path = "uart0:115200"; + }; + + memory@0 { + device_type = "memory"; + reg = <0x00000000 0x02000000>; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + cpu@0 { + compatible = "opencores,or1200-rtlsvn481"; + reg = <0>; + clock-frequency = <20000000>; + }; + cpu@1 { + compatible = "opencores,or1200-rtlsvn481"; + reg = <1>; + clock-frequency = <20000000>; + }; + }; + + ompic: ompic@98000000 { + compatible = "openrisc,ompic"; + reg = <0x98000000 16>; + interrupt-controller; + #interrupt-cells = <0>; + interrupts = <1>; + }; + + /* + * OR1K PIC is built into CPU and accessed via special purpose + * registers. It is not addressable and, hence, has no 'reg' + * property. 
+ */ + pic: pic { + compatible = "opencores,or1k-pic-level"; + #interrupt-cells = <1>; + interrupt-controller; + }; + + serial0: serial@90000000 { + compatible = "opencores,uart16550-rtlsvn105", "ns16550a"; + reg = <0x90000000 0x100>; + interrupts = <2>; + clock-frequency = <20000000>; + }; + +}; diff --git a/arch/openrisc/configs/simple_smp_defconfig b/arch/openrisc/configs/simple_smp_defconfig new file mode 100644 index 000000000000..b6e3c7e158e7 --- /dev/null +++ b/arch/openrisc/configs/simple_smp_defconfig @@ -0,0 +1,66 @@ +CONFIG_CROSS_COMPILE="or1k-linux-" +CONFIG_LOCALVERSION="-simple-smp" +CONFIG_NO_HZ=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_BLK_DEV_INITRD=y +# CONFIG_RD_GZIP is not set +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +CONFIG_EXPERT=y +# CONFIG_KALLSYMS is not set +# CONFIG_EPOLL is not set +# CONFIG_TIMERFD is not set +# CONFIG_EVENTFD is not set +# CONFIG_AIO is not set +# CONFIG_VM_EVENT_COUNTERS is not set +# CONFIG_COMPAT_BRK is not set +CONFIG_SLOB=y +CONFIG_MODULES=y +# CONFIG_BLOCK is not set +CONFIG_OPENRISC_BUILTIN_DTB="simple_smp" +CONFIG_SMP=y +CONFIG_HZ_100=y +CONFIG_OPENRISC_HAVE_SHADOW_GPRS=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_DIAG is not set +CONFIG_TCP_CONG_ADVANCED=y +# CONFIG_TCP_CONG_BIC is not set +# CONFIG_TCP_CONG_CUBIC is not set +# CONFIG_TCP_CONG_WESTWOOD is not set +# CONFIG_TCP_CONG_HTCP is not set +# CONFIG_IPV6 is not set +# CONFIG_WIRELESS is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +# CONFIG_PREVENT_FIRMWARE_BUILD is not set +# CONFIG_FW_LOADER is not set +CONFIG_NETDEVICES=y +CONFIG_ETHOC=y +CONFIG_MICREL_PHY=y +# CONFIG_WLAN is not set +# CONFIG_INPUT is not set +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_HW_RANDOM is not set +# CONFIG_HWMON is not set +# CONFIG_USB_SUPPORT is not set +# CONFIG_DNOTIFY is not set +CONFIG_TMPFS=y +CONFIG_NFS_FS=y +CONFIG_XZ_DEC=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +# CONFIG_RCU_TRACE is not set From eecac38b0423a69715073ecbde581dafd1abb28b Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Mon, 24 Jul 2017 21:44:35 +0900 Subject: [PATCH 0904/1085] openrisc: support framepointers and STACKTRACE_SUPPORT For lockdep support a reliable stack trace mechanism is needed. This patch adds support in OpenRISC for the stacktrace framework, implemented by a simple unwinder api. The unwinder api supports both framepointer and basic stack tracing. The unwinder is now used to replace the stack_dump() implementation as well. The new traces are inline with other architectures trace format: Call trace: [] show_stack+0x3c/0x58 [] dump_stack+0xa8/0xe4 [] __cpu_up+0x64/0x130 [] bringup_cpu+0x3c/0x178 [] cpuhp_invoke_callback+0xa8/0x1fc [] cpuhp_up_callbacks+0x44/0x14c [] cpu_up+0x14c/0x1bc [] smp_init+0x104/0x15c [] ? kernel_init+0x0/0x140 [] kernel_init_freeable+0xbc/0x25c [] ? kernel_init+0x0/0x140 [] kernel_init+0x1c/0x140 [] ? 
schedule_tail+0x18/0xa0 [] ret_from_fork+0x1c/0x9c Signed-off-by: Stafford Horne --- arch/openrisc/Kconfig | 4 + arch/openrisc/include/asm/unwinder.h | 20 +++++ arch/openrisc/kernel/Makefile | 3 +- arch/openrisc/kernel/stacktrace.c | 86 ++++++++++++++++++++++ arch/openrisc/kernel/traps.c | 54 ++------------ arch/openrisc/kernel/unwinder.c | 105 +++++++++++++++++++++++++++ 6 files changed, 224 insertions(+), 48 deletions(-) create mode 100644 arch/openrisc/include/asm/unwinder.h create mode 100644 arch/openrisc/kernel/stacktrace.c create mode 100644 arch/openrisc/kernel/unwinder.c diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index bfff04ae7f7d..399f55e82dcb 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -33,6 +33,7 @@ config OPENRISC select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_RWLOCKS select OMPIC if SMP + select ARCH_WANT_FRAME_POINTERS config CPU_BIG_ENDIAN def_bool y @@ -60,6 +61,9 @@ config TRACE_IRQFLAGS_SUPPORT config GENERIC_CSUM def_bool y +config STACKTRACE_SUPPORT + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/openrisc/include/asm/unwinder.h b/arch/openrisc/include/asm/unwinder.h new file mode 100644 index 000000000000..165ec6f02ab8 --- /dev/null +++ b/arch/openrisc/include/asm/unwinder.h @@ -0,0 +1,20 @@ +/* + * OpenRISC unwinder.h + * + * Architecture API for unwinding stacks. + * + * Copyright (C) 2017 Stafford Horne + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#ifndef __ASM_OPENRISC_UNWINDER_H +#define __ASM_OPENRISC_UNWINDER_H + +void unwind_stack(void *data, unsigned long *stack, + void (*trace)(void *data, unsigned long addr, + int reliable)); + +#endif /* __ASM_OPENRISC_UNWINDER_H */ diff --git a/arch/openrisc/kernel/Makefile b/arch/openrisc/kernel/Makefile index 7d94643c878d..b4b51a07016a 100644 --- a/arch/openrisc/kernel/Makefile +++ b/arch/openrisc/kernel/Makefile @@ -6,9 +6,10 @@ extra-y := head.o vmlinux.lds obj-y := setup.o or32_ksyms.o process.o dma.o \ traps.o time.o irq.o entry.o ptrace.o signal.o \ - sys_call_table.o + sys_call_table.o unwinder.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_OF) += prom.o diff --git a/arch/openrisc/kernel/stacktrace.c b/arch/openrisc/kernel/stacktrace.c new file mode 100644 index 000000000000..43f140a28bc7 --- /dev/null +++ b/arch/openrisc/kernel/stacktrace.c @@ -0,0 +1,86 @@ +/* + * Stack trace utility for OpenRISC + * + * Copyright (C) 2017 Stafford Horne + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + * + * Losely based on work from sh and powerpc. + */ + +#include +#include +#include +#include + +#include +#include + +/* + * Save stack-backtrace addresses into a stack_trace buffer. 
+ */ +static void +save_stack_address(void *data, unsigned long addr, int reliable) +{ + struct stack_trace *trace = data; + + if (!reliable) + return; + + if (trace->skip > 0) { + trace->skip--; + return; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = addr; +} + +void save_stack_trace(struct stack_trace *trace) +{ + unwind_stack(trace, (unsigned long *) &trace, save_stack_address); +} +EXPORT_SYMBOL_GPL(save_stack_trace); + +static void +save_stack_address_nosched(void *data, unsigned long addr, int reliable) +{ + struct stack_trace *trace = (struct stack_trace *)data; + + if (!reliable) + return; + + if (in_sched_functions(addr)) + return; + + if (trace->skip > 0) { + trace->skip--; + return; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = addr; +} + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + unsigned long *sp = NULL; + + if (tsk == current) + sp = (unsigned long *) &sp; + else + sp = (unsigned long *) KSTK_ESP(tsk); + + unwind_stack(trace, sp, save_stack_address_nosched); +} +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +void +save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +{ + unwind_stack(trace, (unsigned long *) regs->sp, + save_stack_address_nosched); +} +EXPORT_SYMBOL_GPL(save_stack_trace_regs); diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 803e9e756f77..4085d72fa5ae 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -38,6 +38,7 @@ #include #include #include +#include extern char _etext, _stext; @@ -45,61 +46,20 @@ int kstack_depth_to_print = 0x180; int lwa_flag; unsigned long __user *lwa_addr; -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +void print_trace(void *data, unsigned long addr, int reliable) { - return p > (void *)tinfo && p < (void *)tinfo + THREAD_SIZE - 3; -} - -void show_trace(struct task_struct *task, unsigned long *stack) -{ - struct thread_info *context; - unsigned long addr; - - context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - - while (valid_stack_ptr(context, stack)) { - addr = *stack++; - if (__kernel_text_address(addr)) { - printk(" [<%08lx>]", addr); - print_symbol(" %s", addr); - printk("\n"); - } - } - printk(" =======================\n"); + pr_emerg("[<%p>] %s%pS\n", (void *) addr, reliable ? "" : "? ", + (void *) addr); } /* displays a short stack trace */ void show_stack(struct task_struct *task, unsigned long *esp) { - unsigned long addr, *stack; - int i; - if (esp == NULL) esp = (unsigned long *)&esp; - stack = esp; - - printk("Stack dump [0x%08lx]:\n", (unsigned long)esp); - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if (__get_user(addr, stack)) { - /* This message matches "failing address" marked - s390 in ksymoops, so lines containing it will - not be filtered out by ksymoops. 
*/ - printk("Failing address 0x%lx\n", (unsigned long)stack); - break; - } - stack++; - - printk("sp + %02d: 0x%08lx\n", i * 4, addr); - } - printk("\n"); - - show_trace(task, esp); - - return; + pr_emerg("Call trace:\n"); + unwind_stack(NULL, esp, print_trace); } void show_trace_task(struct task_struct *tsk) @@ -115,7 +75,7 @@ void show_registers(struct pt_regs *regs) int in_kernel = 1; unsigned long esp; - esp = (unsigned long)(®s->sp); + esp = (unsigned long)(regs->sp); if (user_mode(regs)) in_kernel = 0; diff --git a/arch/openrisc/kernel/unwinder.c b/arch/openrisc/kernel/unwinder.c new file mode 100644 index 000000000000..8ae15c2c1845 --- /dev/null +++ b/arch/openrisc/kernel/unwinder.c @@ -0,0 +1,105 @@ +/* + * OpenRISC unwinder.c + * + * Reusable arch specific api for unwinding stacks. + * + * Copyright (C) 2017 Stafford Horne + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#include +#include + +#include + +#ifdef CONFIG_FRAME_POINTER +struct or1k_frameinfo { + unsigned long *fp; + unsigned long ra; + unsigned long top; +}; + +/* + * Verify a frameinfo structure. The return address should be a valid text + * address. The frame pointer may be null if its the last frame, otherwise + * the frame pointer should point to a location in the stack after the the + * top of the next frame up. + */ +static inline int or1k_frameinfo_valid(struct or1k_frameinfo *frameinfo) +{ + return (frameinfo->fp == NULL || + (!kstack_end(frameinfo->fp) && + frameinfo->fp > &frameinfo->top)) && + __kernel_text_address(frameinfo->ra); +} + +/* + * Create a stack trace doing scanning which is frame pointer aware. We can + * get reliable stack traces by matching the previously found frame + * pointer with the top of the stack address every time we find a valid + * or1k_frameinfo. + * + * Ideally the stack parameter will be passed as FP, but it can not be + * guaranteed. Therefore we scan each address looking for the first sign + * of a return address. + * + * The OpenRISC stack frame looks something like the following. The + * location SP is held in r1 and location FP is held in r2 when frame pointers + * enabled. + * + * SP -> (top of stack) + * - (callee saved registers) + * - (local variables) + * FP-8 -> previous FP \ + * FP-4 -> return address |- or1k_frameinfo + * FP -> (previous top of stack) / + */ +void unwind_stack(void *data, unsigned long *stack, + void (*trace)(void *data, unsigned long addr, int reliable)) +{ + unsigned long *next_fp = NULL; + struct or1k_frameinfo *frameinfo = NULL; + int reliable = 0; + + while (!kstack_end(stack)) { + frameinfo = container_of(stack, + struct or1k_frameinfo, + top); + + if (__kernel_text_address(frameinfo->ra)) { + if (or1k_frameinfo_valid(frameinfo) && + (next_fp == NULL || + next_fp == &frameinfo->top)) { + reliable = 1; + next_fp = frameinfo->fp; + } else + reliable = 0; + + trace(data, frameinfo->ra, reliable); + } + stack++; + } +} + +#else /* CONFIG_FRAME_POINTER */ + +/* + * Create a stack trace by doing a simple scan treating all text addresses + * as return addresses. 
+ */ +void unwind_stack(void *data, unsigned long *stack, + void (*trace)(void *data, unsigned long addr, int reliable)) +{ + unsigned long addr; + + while (!kstack_end(stack)) { + addr = *stack++; + if (__kernel_text_address(addr)) + trace(data, addr, 0); + } +} +#endif /* CONFIG_FRAME_POINTER */ + From 78cdfb5cf15e0f9fb4c2a9176a13a907a1d024f0 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Mon, 24 Jul 2017 21:55:16 +0900 Subject: [PATCH 0905/1085] openrisc: enable LOCKDEP_SUPPORT and irqflags tracing Lockdep is needed for proving the spinlocks and rwlocks work fine on our platform. It also requires calling the trace_hardirqs_off() and trace_hardirqs_on() pair of routines when entering and exiting an interrupt. For OpenRISC the interrupt stack frame does not support frame pointers, so to call trace_hardirqs_on() and trace_hardirqs_off() here the macro's build up a stack frame each time. There is one necessary small change in _sys_call_handler to move interrupt enabling later so they can get re-enabled during syscall restart. This was done to fix lockdep warnings that are now possible due to this patch. Signed-off-by: Stafford Horne --- arch/openrisc/Kconfig | 3 ++ arch/openrisc/kernel/entry.S | 74 ++++++++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 3 deletions(-) diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 399f55e82dcb..f8cfb3ba9e89 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -64,6 +64,9 @@ config GENERIC_CSUM config STACKTRACE_SUPPORT def_bool y +config LOCKDEP_SUPPORT + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index 1b7160c79646..690d55272ba6 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -42,6 +42,61 @@ /* =========================================================[ macros ]=== */ +#ifdef CONFIG_TRACE_IRQFLAGS +/* + * Trace irq on/off creating a stack frame. + */ +#define TRACE_IRQS_OP(trace_op) \ + l.sw -8(r1),r2 /* store frame pointer */ ;\ + l.sw -4(r1),r9 /* store return address */ ;\ + l.addi r2,r1,0 /* move sp to fp */ ;\ + l.jal trace_op ;\ + l.addi r1,r1,-8 ;\ + l.ori r1,r2,0 /* restore sp */ ;\ + l.lwz r9,-4(r1) /* restore return address */ ;\ + l.lwz r2,-8(r1) /* restore fp */ ;\ +/* + * Trace irq on/off and save registers we need that would otherwise be + * clobbered. 
+ */ +#define TRACE_IRQS_SAVE(t1,trace_op) \ + l.sw -12(r1),t1 /* save extra reg */ ;\ + l.sw -8(r1),r2 /* store frame pointer */ ;\ + l.sw -4(r1),r9 /* store return address */ ;\ + l.addi r2,r1,0 /* move sp to fp */ ;\ + l.jal trace_op ;\ + l.addi r1,r1,-12 ;\ + l.ori r1,r2,0 /* restore sp */ ;\ + l.lwz r9,-4(r1) /* restore return address */ ;\ + l.lwz r2,-8(r1) /* restore fp */ ;\ + l.lwz t1,-12(r1) /* restore extra reg */ + +#define TRACE_IRQS_OFF TRACE_IRQS_OP(trace_hardirqs_off) +#define TRACE_IRQS_ON TRACE_IRQS_OP(trace_hardirqs_on) +#define TRACE_IRQS_ON_SYSCALL \ + TRACE_IRQS_SAVE(r10,trace_hardirqs_on) ;\ + l.lwz r3,PT_GPR3(r1) ;\ + l.lwz r4,PT_GPR4(r1) ;\ + l.lwz r5,PT_GPR5(r1) ;\ + l.lwz r6,PT_GPR6(r1) ;\ + l.lwz r7,PT_GPR7(r1) ;\ + l.lwz r8,PT_GPR8(r1) ;\ + l.lwz r11,PT_GPR11(r1) +#define TRACE_IRQS_OFF_ENTRY \ + l.lwz r5,PT_SR(r1) ;\ + l.andi r3,r5,(SPR_SR_IEE|SPR_SR_TEE) ;\ + l.sfeq r5,r0 /* skip trace if irqs were already off */;\ + l.bf 1f ;\ + l.nop ;\ + TRACE_IRQS_SAVE(r4,trace_hardirqs_off) ;\ +1: +#else +#define TRACE_IRQS_OFF +#define TRACE_IRQS_ON +#define TRACE_IRQS_OFF_ENTRY +#define TRACE_IRQS_ON_SYSCALL +#endif + /* * We need to disable interrupts at beginning of RESTORE_ALL * since interrupt might come in after we've loaded EPC return address @@ -124,6 +179,7 @@ handler: ;\ /* r30 already save */ ;\ /* l.sw PT_GPR30(r1),r30*/ ;\ l.sw PT_GPR31(r1),r31 ;\ + TRACE_IRQS_OFF_ENTRY ;\ /* Store -1 in orig_gpr11 for non-syscall exceptions */ ;\ l.addi r30,r0,-1 ;\ l.sw PT_ORIG_GPR11(r1),r30 @@ -557,9 +613,6 @@ _string_syscall_return: .align 4 ENTRY(_sys_call_handler) - /* syscalls run with interrupts enabled */ - ENABLE_INTERRUPTS(r29) // enable interrupts, r29 is temp - /* r1, EPCR, ESR a already saved */ l.sw PT_GPR2(r1),r2 /* r3-r8 must be saved because syscall restart relies @@ -597,6 +650,10 @@ ENTRY(_sys_call_handler) /* l.sw PT_GPR30(r1),r30 */ _syscall_check_trace_enter: + /* syscalls run with interrupts enabled */ + TRACE_IRQS_ON_SYSCALL + ENABLE_INTERRUPTS(r29) // enable interrupts, r29 is temp + /* If TIF_SYSCALL_TRACE is set, then we want to do syscall tracing */ l.lwz r30,TI_FLAGS(r10) l.andi r30,r30,_TIF_SYSCALL_TRACE @@ -657,6 +714,7 @@ _syscall_check_trace_leave: _syscall_check_work: /* Here we need to disable interrupts */ DISABLE_INTERRUPTS(r27,r29) + TRACE_IRQS_OFF l.lwz r30,TI_FLAGS(r10) l.andi r30,r30,_TIF_WORK_MASK l.sfne r30,r0 @@ -871,6 +929,7 @@ UNHANDLED_EXCEPTION(_vector_0x1f00,0x1f00) _resume_userspace: DISABLE_INTERRUPTS(r3,r4) + TRACE_IRQS_OFF l.lwz r4,TI_FLAGS(r10) l.andi r13,r4,_TIF_WORK_MASK l.sfeqi r13,0 @@ -909,6 +968,15 @@ _work_pending: l.lwz r8,PT_GPR8(r1) _restore_all: +#ifdef CONFIG_TRACE_IRQFLAGS + l.lwz r4,PT_SR(r1) + l.andi r3,r4,(SPR_SR_IEE|SPR_SR_TEE) + l.sfeq r3,r0 /* skip trace if irqs were off */ + l.bf skip_hardirqs_on + l.nop + TRACE_IRQS_ON +skip_hardirqs_on: +#endif RESTORE_ALL /* This returns to userspace code */ From 4553474d977d1ee8a81067cfbc588f1df84ce3e9 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Fri, 7 Jul 2017 06:06:30 +0900 Subject: [PATCH 0906/1085] openrisc: add tick timer multi-core sync logic In case timers are not in sync when cpus start (i.e. hot plug / offset resets) we need to synchronize the secondary cpus internal timer with the main cpu. This is needed as in OpenRISC SMP there is only one clocksource registered which reads from the same ttcr register on each cpu. This synchronization routine heavily borrows from mips implementation that does something similar. 
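The rendezvous itself is a pair of counter-based barriers run NR_LOOPS times, with both CPUs writing the tick timer on the final pass. As a rough sketch of one iteration (simplified pseudo-C, not the actual code; the real loop, including the cache-priming rationale, is in sync-timer.c below, and "last_loop" here only stands in for the final i == NR_LOOPS - 1 check):

	/* slave side (sketch only) */
	atomic_inc(&count_count_start);			/* announce arrival */
	while (atomic_read(&count_count_start) != 2)
		mb();					/* wait for the master */
	if (last_loop)
		openrisc_timer_set(initcount);		/* both CPUs load TTCR together */

	/* master side (sketch only) */
	while (atomic_read(&count_count_start) != 1)
		mb();					/* wait for the slave */
	atomic_inc(&count_count_start);			/* release the slave */
	if (last_loop)
		openrisc_timer_set(initcount);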
Signed-off-by: Stafford Horne --- arch/openrisc/include/asm/time.h | 8 ++ arch/openrisc/kernel/Makefile | 2 +- arch/openrisc/kernel/smp.c | 3 + arch/openrisc/kernel/sync-timer.c | 120 ++++++++++++++++++++++++++++++ arch/openrisc/kernel/time.c | 15 +++- 5 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 arch/openrisc/kernel/sync-timer.c diff --git a/arch/openrisc/include/asm/time.h b/arch/openrisc/include/asm/time.h index fe83a34a7d68..313ee975774b 100644 --- a/arch/openrisc/include/asm/time.h +++ b/arch/openrisc/include/asm/time.h @@ -12,4 +12,12 @@ extern void openrisc_clockevent_init(void); +extern void openrisc_timer_set(unsigned long count); +extern void openrisc_timer_set_next(unsigned long delta); + +#ifdef CONFIG_SMP +extern void synchronise_count_master(int cpu); +extern void synchronise_count_slave(int cpu); +#endif + #endif /* __ASM_OR1K_TIME_H */ diff --git a/arch/openrisc/kernel/Makefile b/arch/openrisc/kernel/Makefile index b4b51a07016a..9028e5a1fdd7 100644 --- a/arch/openrisc/kernel/Makefile +++ b/arch/openrisc/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := setup.o or32_ksyms.o process.o dma.o \ traps.o time.o irq.o entry.o ptrace.o signal.o \ sys_call_table.o unwinder.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o sync-timer.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_OF) += prom.o diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 4763b8b9161e..4d80ce6fa045 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -100,6 +100,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_crit("CPU%u: failed to start\n", cpu); return -EIO; } + synchronise_count_master(cpu); return 0; } @@ -129,6 +130,8 @@ asmlinkage __init void secondary_start_kernel(void) set_cpu_online(cpu, true); complete(&cpu_running); + synchronise_count_slave(cpu); + local_irq_enable(); preempt_disable(); diff --git a/arch/openrisc/kernel/sync-timer.c b/arch/openrisc/kernel/sync-timer.c new file mode 100644 index 000000000000..ed8d835caca1 --- /dev/null +++ b/arch/openrisc/kernel/sync-timer.c @@ -0,0 +1,120 @@ +/* + * OR1K timer synchronisation + * + * Based on work from MIPS implementation. + * + * All CPUs will have their count registers synchronised to the CPU0 next time + * value. This can cause a small timewarp for CPU0. All other CPU's should + * not have done anything significant (but they may have had interrupts + * enabled briefly - prom_smp_finish() should not be responsible for enabling + * interrupts...) + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +static unsigned int initcount; +static atomic_t count_count_start = ATOMIC_INIT(0); +static atomic_t count_count_stop = ATOMIC_INIT(0); + +#define COUNTON 100 +#define NR_LOOPS 3 + +void synchronise_count_master(int cpu) +{ + int i; + unsigned long flags; + + pr_info("Synchronize counters for CPU %u: ", cpu); + + local_irq_save(flags); + + /* + * We loop a few times to get a primed instruction cache, + * then the last pass is more or less synchronised and + * the master and slaves each set their cycle counters to a known + * value all at once. This reduces the chance of having random offsets + * between the processors, and guarantees that the maximum + * delay between the cycle counters is never bigger than + * the latency of information-passing (cachelines) between + * two CPUs. 
+ */ + + for (i = 0; i < NR_LOOPS; i++) { + /* slaves loop on '!= 2' */ + while (atomic_read(&count_count_start) != 1) + mb(); + atomic_set(&count_count_stop, 0); + smp_wmb(); + + /* Let the slave writes its count register */ + atomic_inc(&count_count_start); + + /* Count will be initialised to current timer */ + if (i == 1) + initcount = get_cycles(); + + /* + * Everyone initialises count in the last loop: + */ + if (i == NR_LOOPS-1) + openrisc_timer_set(initcount); + + /* + * Wait for slave to leave the synchronization point: + */ + while (atomic_read(&count_count_stop) != 1) + mb(); + atomic_set(&count_count_start, 0); + smp_wmb(); + atomic_inc(&count_count_stop); + } + /* Arrange for an interrupt in a short while */ + openrisc_timer_set_next(COUNTON); + + local_irq_restore(flags); + + /* + * i386 code reported the skew here, but the + * count registers were almost certainly out of sync + * so no point in alarming people + */ + pr_cont("done.\n"); +} + +void synchronise_count_slave(int cpu) +{ + int i; + + /* + * Not every cpu is online at the time this gets called, + * so we first wait for the master to say everyone is ready + */ + + for (i = 0; i < NR_LOOPS; i++) { + atomic_inc(&count_count_start); + while (atomic_read(&count_count_start) != 2) + mb(); + + /* + * Everyone initialises count in the last loop: + */ + if (i == NR_LOOPS-1) + openrisc_timer_set(initcount); + + atomic_inc(&count_count_stop); + while (atomic_read(&count_count_stop) != 2) + mb(); + } + /* Arrange for an interrupt in a short while */ + openrisc_timer_set_next(COUNTON); +} +#undef NR_LOOPS diff --git a/arch/openrisc/kernel/time.c b/arch/openrisc/kernel/time.c index ab04eaedbf8d..6baecea27080 100644 --- a/arch/openrisc/kernel/time.c +++ b/arch/openrisc/kernel/time.c @@ -27,8 +27,14 @@ #include -static int openrisc_timer_set_next_event(unsigned long delta, - struct clock_event_device *dev) +/* Test the timer ticks to count, used in sync routine */ +inline void openrisc_timer_set(unsigned long count) +{ + mtspr(SPR_TTCR, count); +} + +/* Set the timer to trigger in delta cycles */ +inline void openrisc_timer_set_next(unsigned long delta) { u32 c; @@ -44,7 +50,12 @@ static int openrisc_timer_set_next_event(unsigned long delta, * Keep timer in continuous mode always. */ mtspr(SPR_TTMR, SPR_TTMR_CR | SPR_TTMR_IE | c); +} +static int openrisc_timer_set_next_event(unsigned long delta, + struct clock_event_device *dev) +{ + openrisc_timer_set_next(delta); return 0; } From afa83808817357e10b0a4e6d5b89f7b899dabd49 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Tue, 31 Oct 2017 18:22:05 +0100 Subject: [PATCH 0907/1085] openrisc: pass endianness info to sparse openrisc is big-endian only but sparse assumes the same endianness as the building machine. This is problematic for code which expect __BYTE_ORDER__ being correctly predefined by the compiler which sparse can then pre-process differently from what gcc would, depending on the building machine endianness. Fix this by letting sparse know about the architecture endianness. 
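As an illustration (a hypothetical snippet, not code from the tree), a byte-order switch like the following would previously have been preprocessed by sparse using the build host's byte order rather than the target's:

	#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	#define wire32(x)	(x)			/* already in wire order */
	#else
	#define wire32(x)	__builtin_bswap32(x)	/* swap on little-endian hosts */
	#endif

With -mbig-endian added to CHECKFLAGS, sparse takes the same branch gcc takes when targeting openrisc.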
To: Jonas Bonn To: Stefan Kristiansson To: Stafford Horne Signed-off-by: Luc Van Oostenryck Signed-off-by: Stafford Horne --- arch/openrisc/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/openrisc/Makefile b/arch/openrisc/Makefile index 89076a66eee2..cf8802962864 100644 --- a/arch/openrisc/Makefile +++ b/arch/openrisc/Makefile @@ -25,6 +25,7 @@ LDFLAGS_vmlinux := LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) KBUILD_CFLAGS += -pipe -ffixed-r10 -D__linux__ +CHECKFLAGS += -mbig-endian ifeq ($(CONFIG_OPENRISC_HAVE_INST_MUL),y) KBUILD_CFLAGS += $(call cc-option,-mhard-mul) From 610f01b9a88a9ef8b506709a825c17395c56a62a Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Fri, 3 Nov 2017 12:22:27 +0900 Subject: [PATCH 0908/1085] openrisc: fix possible deadlock scenario during timer sync OpenRISC borrows its timer sync logic from MIPS, Matt helped to review the OpenRISC implementation and noted that we may suffer the same deadlock case that MIPS has faced. The case being: "the MIPS timer synchronization code contained the possibility of deadlock. If you mark a CPU online before it goes into the synchronize loop, then the boot CPU can schedule a different thread and send IPIs to all "online" CPUs. It gets stuck waiting for the secondary to ack it's IPI, since this secondary CPU has not enabled IRQs yet, and is stuck waiting for the master to synchronise with it. The system then deadlocks." Fix this by moving set_cpu_online() to after timer sync. Reported-by: Matt Redfearn Signed-off-by: Stafford Horne --- arch/openrisc/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 4d80ce6fa045..7d518ee8bddc 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -127,10 +127,10 @@ asmlinkage __init void secondary_start_kernel(void) /* * OK, now it's safe to let the boot CPU continue */ - set_cpu_online(cpu, true); complete(&cpu_running); synchronise_count_slave(cpu); + set_cpu_online(cpu, true); local_irq_enable(); From 648a6f4495b183d4e0d8983ff768facb9a2185bb Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 26 Oct 2017 16:36:45 +0200 Subject: [PATCH 0909/1085] s390/qdio: use atomic_cmpxchg qdio uses atomic_read to find an unused indicator and atomic_set to flag it as used. This could lead to multiple users getting the same indicator. Use atomic_cmpxchg instead. Signed-off-by: Sebastian Ott Acked-by: Ursula Braun Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_thinint.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index c61164f4528e..3e587bf426b0 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -56,10 +56,8 @@ static u32 *get_indicator(void) int i; for (i = 0; i < TIQDIO_NR_NONSHARED_IND; i++) - if (!atomic_read(&q_indicators[i].count)) { - atomic_set(&q_indicators[i].count, 1); + if (!atomic_cmpxchg(&q_indicators[i].count, 0, 1)) return &q_indicators[i].ind; - } /* use the shared indicator */ atomic_inc(&q_indicators[TIQDIO_SHARED_IND].count); From 30e8eb867122183076b7994f9b83920f9ec5451b Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 26 Oct 2017 16:47:25 +0200 Subject: [PATCH 0910/1085] s390/qdio: sanitize put_indicator qdio maintains an array of struct indicator_t. 
put_indicator takes a pointer to a member of a struct indicator_t within that array, calculates the index, and uses the array and the index to get the struct indicator_t. Simply use the pointer directly. Although the pointer happens to point to the first member of that struct, use the container_of macro. Signed-off-by: Sebastian Ott Acked-by: Ursula Braun Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_thinint.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 3e587bf426b0..f99d1c36168d 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -66,13 +66,11 @@ static u32 *get_indicator(void) static void put_indicator(u32 *addr) { - int i; + struct indicator_t *ind = container_of(addr, struct indicator_t, ind); if (!addr) return; - i = ((unsigned long)addr - (unsigned long)q_indicators) / - sizeof(struct indicator_t); - atomic_dec(&q_indicators[i].count); + atomic_dec(&ind->count); } void tiqdio_add_input_queues(struct qdio_irq *irq_ptr) From 42bdd7061a6e24d7b21d3d21973615fecef544ef Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 16 Oct 2017 12:27:58 +0200 Subject: [PATCH 0911/1085] spi: fix IDR collision on systems with both fixed and dynamic SPI bus numbers On systems where some controllers get a dynamic ID assigned and some have a fixed number from DT, the current implementation might run into an IDR collision if the dynamic controllers get probed first and take an IDR number which is later requested by the controller with the fixed numbering. When this happens, the fixed controller will fail to register with the SPI core. Fix this by skipping all known alias numbers when assigning the dynamic IDs. Fixes: 9b61e302210e (spi: Pick spi bus number from Linux idr or spi alias) Signed-off-by: Lucas Stach Signed-off-by: Mark Brown --- drivers/spi/spi.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 759c9725495a..3ff0ee88c467 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -45,7 +45,6 @@ #define CREATE_TRACE_POINTS #include -#define SPI_DYN_FIRST_BUS_NUM 0 static DEFINE_IDR(spi_master_idr); @@ -2086,7 +2085,7 @@ int spi_register_controller(struct spi_controller *ctlr) struct device *dev = ctlr->dev.parent; struct boardinfo *bi; int status = -ENODEV; - int id; + int id, first_dynamic; if (!dev) return -ENODEV; @@ -2116,9 +2115,15 @@ int spi_register_controller(struct spi_controller *ctlr) } } if (ctlr->bus_num < 0) { + first_dynamic = of_alias_get_highest_id("spi"); + if (first_dynamic < 0) + first_dynamic = 0; + else + first_dynamic++; + mutex_lock(&board_lock); - id = idr_alloc(&spi_master_idr, ctlr, SPI_DYN_FIRST_BUS_NUM, 0, - GFP_KERNEL); + id = idr_alloc(&spi_master_idr, ctlr, first_dynamic, + 0, GFP_KERNEL); mutex_unlock(&board_lock); if (WARN(id < 0, "couldn't get idr")) return id; From f25637a6b89e59eddf79f6df39b23e202753f555 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 3 Nov 2017 12:38:04 +0100 Subject: [PATCH 0912/1085] regmap: Add a config option for hwspinlock Unlike other lock types, hwspinlocks are optional and can be built modular, so we can't use them unconditionally in regmap. Add a config option that drivers wanting to use hwspinlocks with regmap can select, which will ensure that hwspinlock support is built in.
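A driver that wants hwspinlock-protected regmap access would then select REGMAP_HWSPINLOCK from its own Kconfig entry and name the lock in its regmap configuration, roughly as in this sketch (a hypothetical foo device, shown for illustration only):

	static const struct regmap_config foo_regmap_config = {
		.reg_bits = 32,
		.val_bits = 32,
		.hwlock_id = 1,	/* hwspinlock shared with e.g. a remote processor */
	};

Drivers that do not select the option keep the existing mutex/spinlock behaviour.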
Signed-off-by: Mark Brown --- drivers/base/regmap/Kconfig | 4 ++++ drivers/base/regmap/regmap.c | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig index 073c0b77e5b3..2d5e849f79c9 100644 --- a/drivers/base/regmap/Kconfig +++ b/drivers/base/regmap/Kconfig @@ -5,6 +5,7 @@ config REGMAP default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ) select IRQ_DOMAIN if REGMAP_IRQ + select HWSPINLOCK if REGMAP_HWSPINLOCK bool config REGCACHE_COMPRESSED @@ -36,3 +37,6 @@ config REGMAP_MMIO config REGMAP_IRQ bool + +config REGMAP_HWSPINLOCK + bool diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 999e981a174a..ff6ef6a579c6 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -414,6 +414,7 @@ static unsigned int regmap_parse_64_native(const void *buf) } #endif +#ifdef REGMAP_HWSPINLOCK static void regmap_lock_hwlock(void *__map) { struct regmap *map = __map; @@ -456,6 +457,7 @@ static void regmap_unlock_hwlock_irqrestore(void *__map) hwspin_unlock_irqrestore(map->hwlock, &map->spinlock_flags); } +#endif static void regmap_lock_mutex(void *__map) { @@ -672,6 +674,7 @@ struct regmap *__regmap_init(struct device *dev, map->unlock = config->unlock; map->lock_arg = config->lock_arg; } else if (config->hwlock_id) { +#ifdef REGMAP_HWSPINLOCK map->hwlock = hwspin_lock_request_specific(config->hwlock_id); if (!map->hwlock) { ret = -ENXIO; @@ -694,6 +697,10 @@ struct regmap *__regmap_init(struct device *dev, } map->lock_arg = map; +#else + ret = -EINVAL; + goto err; +#endif } else { if ((bus && bus->fast_io) || config->fast_io) { From ec7ed7708e009e046d1e16ed53ba4d6048748d07 Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Sat, 28 Oct 2017 00:23:01 +0200 Subject: [PATCH 0913/1085] spi: spi-fsl-dspi: enabling Coldfire mcf5441x dspi Signed-off-by: Angelo Dureghello Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 2 +- drivers/spi/spi-fsl-dspi.c | 66 ++++++++++++++++++++++---------- include/linux/spi/spi-fsl-dspi.h | 31 +++++++++++++++ 3 files changed, 77 insertions(+), 22 deletions(-) create mode 100644 include/linux/spi/spi-fsl-dspi.h diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index a75f2a2cf780..a8b761979673 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -379,7 +379,7 @@ config SPI_FSL_DSPI tristate "Freescale DSPI controller" select REGMAP_MMIO depends on HAS_DMA - depends on SOC_VF610 || SOC_LS1021A || ARCH_LAYERSCAPE || COMPILE_TEST + depends on SOC_VF610 || SOC_LS1021A || ARCH_LAYERSCAPE || M5441x || COMPILE_TEST help This enables support for the Freescale DSPI controller in master mode. VF610 platform uses the controller. 
diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index d89127f4a46d..f652f70cb8db 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -151,6 +152,11 @@ static const struct fsl_dspi_devtype_data ls2085a_data = { .max_clock_factor = 8, }; +static const struct fsl_dspi_devtype_data coldfire_data = { + .trans_mode = DSPI_EOQ_MODE, + .max_clock_factor = 8, +}; + struct fsl_dspi_dma { /* Length of transfer in words of DSPI_FIFO_SIZE */ u32 curr_xfer_len; @@ -741,6 +747,7 @@ static int dspi_setup(struct spi_device *spi) { struct chip_data *chip; struct fsl_dspi *dspi = spi_master_get_devdata(spi->master); + struct fsl_dspi_platform_data *pdata; u32 cs_sck_delay = 0, sck_cs_delay = 0; unsigned char br = 0, pbr = 0, pcssck = 0, cssck = 0; unsigned char pasc = 0, asc = 0, fmsz = 0; @@ -761,11 +768,18 @@ static int dspi_setup(struct spi_device *spi) return -ENOMEM; } - of_property_read_u32(spi->dev.of_node, "fsl,spi-cs-sck-delay", - &cs_sck_delay); + pdata = dev_get_platdata(&dspi->pdev->dev); - of_property_read_u32(spi->dev.of_node, "fsl,spi-sck-cs-delay", - &sck_cs_delay); + if (!pdata) { + of_property_read_u32(spi->dev.of_node, "fsl,spi-cs-sck-delay", + &cs_sck_delay); + + of_property_read_u32(spi->dev.of_node, "fsl,spi-sck-cs-delay", + &sck_cs_delay); + } else { + cs_sck_delay = pdata->cs_sck_delay; + sck_cs_delay = pdata->sck_cs_delay; + } chip->mcr_val = SPI_MCR_MASTER | SPI_MCR_PCSIS | SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF; @@ -949,6 +963,7 @@ static int dspi_probe(struct platform_device *pdev) struct fsl_dspi *dspi; struct resource *res; void __iomem *base; + struct fsl_dspi_platform_data *pdata; int ret = 0, cs_num, bus_num; master = spi_alloc_master(&pdev->dev, sizeof(struct fsl_dspi)); @@ -969,25 +984,34 @@ static int dspi_probe(struct platform_device *pdev) master->bits_per_word_mask = SPI_BPW_MASK(4) | SPI_BPW_MASK(8) | SPI_BPW_MASK(16); - ret = of_property_read_u32(np, "spi-num-chipselects", &cs_num); - if (ret < 0) { - dev_err(&pdev->dev, "can't get spi-num-chipselects\n"); - goto out_master_put; - } - master->num_chipselect = cs_num; + pdata = dev_get_platdata(&pdev->dev); + if (pdata) { + master->num_chipselect = pdata->cs_num; + master->bus_num = pdata->bus_num; - ret = of_property_read_u32(np, "bus-num", &bus_num); - if (ret < 0) { - dev_err(&pdev->dev, "can't get bus-num\n"); - goto out_master_put; - } - master->bus_num = bus_num; + dspi->devtype_data = &coldfire_data; + } else { - dspi->devtype_data = of_device_get_match_data(&pdev->dev); - if (!dspi->devtype_data) { - dev_err(&pdev->dev, "can't get devtype_data\n"); - ret = -EFAULT; - goto out_master_put; + ret = of_property_read_u32(np, "spi-num-chipselects", &cs_num); + if (ret < 0) { + dev_err(&pdev->dev, "can't get spi-num-chipselects\n"); + goto out_master_put; + } + master->num_chipselect = cs_num; + + ret = of_property_read_u32(np, "bus-num", &bus_num); + if (ret < 0) { + dev_err(&pdev->dev, "can't get bus-num\n"); + goto out_master_put; + } + master->bus_num = bus_num; + + dspi->devtype_data = of_device_get_match_data(&pdev->dev); + if (!dspi->devtype_data) { + dev_err(&pdev->dev, "can't get devtype_data\n"); + ret = -EFAULT; + goto out_master_put; + } } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); diff --git a/include/linux/spi/spi-fsl-dspi.h b/include/linux/spi/spi-fsl-dspi.h new file mode 100644 index 000000000000..74c9bae20bf2 --- /dev/null +++ b/include/linux/spi/spi-fsl-dspi.h @@ -0,0 +1,31 @@ 
+/* + * Freescale DSPI controller driver + * + * Copyright (c) 2017 Angelo Dureghello + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef SPI_FSL_DSPI_HEADER_H +#define SPI_FSL_DSPI_HEADER_H + +/** + * struct fsl_dspi_platform_data - platform data for the Freescale DSPI driver + * @bus_num: board specific identifier for this DSPI driver. + * @cs_num: number of chip selects supported by this DSPI driver. + */ +struct fsl_dspi_platform_data { + u32 cs_num; + u32 bus_num; + u32 sck_cs_delay; + u32 cs_sck_delay; +}; + +#endif /* SPI_FSL_DSPI_HEADER_H */ From 4132b8b9107badf75bf30c8b4b3ac2cebd03389d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 3 Nov 2017 13:35:25 +0000 Subject: [PATCH 0914/1085] spi: s3c64xx: remove redundant pointer sci The pointer sci is assigned but never read, hence it is redundant and can be removed. Cleans up clang warning: drivers/spi/spi-s3c64xx.c:791:2: warning: Value stored to 'sci' is never read Signed-off-by: Colin Ian King Signed-off-by: Mark Brown --- drivers/spi/spi-s3c64xx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index b392cca8fa4f..de7df20f8712 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -752,7 +752,6 @@ static int s3c64xx_spi_setup(struct spi_device *spi) { struct s3c64xx_spi_csinfo *cs = spi->controller_data; struct s3c64xx_spi_driver_data *sdd; - struct s3c64xx_spi_info *sci; int err; sdd = spi_master_get_devdata(spi->master); @@ -788,8 +787,6 @@ static int s3c64xx_spi_setup(struct spi_device *spi) spi_set_ctldata(spi, cs); } - sci = sdd->cntrlr_info; - pm_runtime_get_sync(&sdd->pdev->dev); /* Check if we can provide the requested rate */ From 6ae6678344af52cf1c05475ff3ec2f43b8b532ef Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 3 Nov 2017 13:58:29 +0000 Subject: [PATCH 0915/1085] spi: sh-msiof: remove redundant pointer dev The pointer dev is assigned but never read, hence it is redundant and can be removed. 
Cleans up clang warning: drivers/spi/spi-sh-msiof.c:1198:2: warning: Value stored to 'dev' is never read Signed-off-by: Colin Ian King Signed-off-by: Mark Brown --- drivers/spi/spi-sh-msiof.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index 2c5ea4c57508..66043611e86d 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -1190,12 +1190,10 @@ free_tx_chan: static void sh_msiof_release_dma(struct sh_msiof_spi_priv *p) { struct spi_master *master = p->master; - struct device *dev; if (!master->dma_tx) return; - dev = &p->pdev->dev; dma_unmap_single(master->dma_rx->device->dev, p->rx_dma_addr, PAGE_SIZE, DMA_FROM_DEVICE); dma_unmap_single(master->dma_tx->device->dev, p->tx_dma_addr, From 267f3e4f18f1ccf9e8cf72c3b4689df03025516d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 3 Nov 2017 19:50:20 +0000 Subject: [PATCH 0916/1085] regmap: Also protect hwspinlock in error handling path The previous patch to allow the hwspinlock code to be disabled missed handling the free in the error path; fix that using the better IS_ENABLED() pattern as suggested by Baolin. While we're at it, also check that we have a hardware spinlock before freeing it - the core code reports an error when freeing an invalid lock. Suggested-by: Baolin Wang Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index ff6ef6a579c6..5ff549fa880b 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1116,7 +1116,8 @@ err_range: regmap_range_exit(map); kfree(map->work_buf); err_hwlock: - hwspin_lock_free(map->hwlock); + if (IS_ENABLED(REGMAP_HWSPINLOCK) && map->hwlock) + hwspin_lock_free(map->hwlock); err_map: kfree(map); err: From e8419c40a5ad58eaa112173f554148accc6794f3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 3 Nov 2017 19:53:56 +0000 Subject: [PATCH 0917/1085] regmap: Clean up hwspinlock on regmap exit We should free any hwspinlocks when we destroy the regmap; do so. Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 5ff549fa880b..bfe2f250d011 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1305,6 +1305,8 @@ void regmap_exit(struct regmap *map) kfree(async->work_buf); kfree(async); } + if (IS_ENABLED(REGMAP_HWSPINLOCK) && map->hwlock) + hwspin_lock_free(map->hwlock); kfree(map); } EXPORT_SYMBOL_GPL(regmap_exit); From 54b943e69691329805349440b157b36e34051a57 Mon Sep 17 00:00:00 2001 From: Joel Date: Thu, 2 Nov 2017 14:53:47 +1100 Subject: [PATCH 0918/1085] hwmon: (aspeed-pwm-tacho) Sort headers Sort the headers in preparation for future changes.
Signed-off-by: Joel Stanley Signed-off-by: Guenter Roeck --- drivers/hwmon/aspeed-pwm-tacho.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/aspeed-pwm-tacho.c b/drivers/hwmon/aspeed-pwm-tacho.c index f914e5f41048..63a95e23ca81 100644 --- a/drivers/hwmon/aspeed-pwm-tacho.c +++ b/drivers/hwmon/aspeed-pwm-tacho.c @@ -7,19 +7,19 @@ */ #include +#include #include #include -#include #include #include #include #include #include -#include #include +#include #include -#include #include +#include #include /* ASPEED PWM & FAN Tach Register Definition */ From 4d420a6a9ddd72bd25baa6e667dd0581506eeacb Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Fri, 3 Nov 2017 15:53:02 +1100 Subject: [PATCH 0919/1085] pmbus: Add driver for Maxim MAX31785 Intelligent Fan Controller The Maxim MAX31785 is a PMBus device providing closed-loop, multi-channel fan management with temperature and remote voltage sensing. It supports various fan control features, including PWM frequency control, temperature hysteresis, dual tachometer measurements, and fan health monitoring. This patch presents a basic driver using only the existing features of the PMBus subsystem. Signed-off-by: Andrew Jeffery [groeck: Modified description to clarify that fan control is not yet provided] Signed-off-by: Guenter Roeck --- Documentation/hwmon/max31785 | 51 +++++++++++++++ drivers/hwmon/pmbus/Kconfig | 10 +++ drivers/hwmon/pmbus/Makefile | 1 + drivers/hwmon/pmbus/max31785.c | 116 +++++++++++++++++++++++++++++++++ 4 files changed, 178 insertions(+) create mode 100644 Documentation/hwmon/max31785 create mode 100644 drivers/hwmon/pmbus/max31785.c diff --git a/Documentation/hwmon/max31785 b/Documentation/hwmon/max31785 new file mode 100644 index 000000000000..45fb6093dec2 --- /dev/null +++ b/Documentation/hwmon/max31785 @@ -0,0 +1,51 @@ +Kernel driver max31785 +====================== + +Supported chips: + * Maxim MAX31785, MAX31785A + Prefix: 'max31785' or 'max31785a' + Addresses scanned: - + Datasheet: https://datasheets.maximintegrated.com/en/ds/MAX31785.pdf + +Author: Andrew Jeffery + +Description +----------- + +The Maxim MAX31785 is a PMBus device providing closed-loop, multi-channel fan +management with temperature and remote voltage sensing. Various fan control +features are provided, including PWM frequency control, temperature hysteresis, +dual tachometer measurements, and fan health monitoring. + +For dual rotor fan configuration, the MAX31785 exposes the slowest rotor of the +two in the fan[1-4]_input attributes. + +Usage Notes +----------- + +This driver does not probe for PMBus devices. You will have to instantiate +devices explicitly. + +Sysfs attributes +---------------- + +fan[1-4]_alarm Fan alarm. +fan[1-4]_fault Fan fault. +fan[1-4]_input Fan RPM. 
+ +in[1-6]_crit Critical maximum output voltage +in[1-6]_crit_alarm Output voltage critical high alarm +in[1-6]_input Measured output voltage +in[1-6]_label "vout[18-23]" +in[1-6]_lcrit Critical minimum output voltage +in[1-6]_lcrit_alarm Output voltage critical low alarm +in[1-6]_max Maximum output voltage +in[1-6]_max_alarm Output voltage high alarm +in[1-6]_min Minimum output voltage +in[1-6]_min_alarm Output voltage low alarm + +temp[1-11]_crit Critical high temperature +temp[1-11]_crit_alarm Chip temperature critical high alarm +temp[1-11]_input Measured temperature +temp[1-11]_max Maximum temperature +temp[1-11]_max_alarm Chip temperature high alarm diff --git a/drivers/hwmon/pmbus/Kconfig b/drivers/hwmon/pmbus/Kconfig index 40019325b517..08479006c7f9 100644 --- a/drivers/hwmon/pmbus/Kconfig +++ b/drivers/hwmon/pmbus/Kconfig @@ -114,6 +114,16 @@ config SENSORS_MAX20751 This driver can also be built as a module. If so, the module will be called max20751. +config SENSORS_MAX31785 + tristate "Maxim MAX31785 and compatibles" + default n + help + If you say yes here you get hardware monitoring support for Maxim + MAX31785. + + This driver can also be built as a module. If so, the module will + be called max31785. + config SENSORS_MAX34440 tristate "Maxim MAX34440 and compatibles" default n diff --git a/drivers/hwmon/pmbus/Makefile b/drivers/hwmon/pmbus/Makefile index 459a6be3390e..a8bf0e490db9 100644 --- a/drivers/hwmon/pmbus/Makefile +++ b/drivers/hwmon/pmbus/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_SENSORS_LTC2978) += ltc2978.o obj-$(CONFIG_SENSORS_LTC3815) += ltc3815.o obj-$(CONFIG_SENSORS_MAX16064) += max16064.o obj-$(CONFIG_SENSORS_MAX20751) += max20751.o +obj-$(CONFIG_SENSORS_MAX31785) += max31785.o obj-$(CONFIG_SENSORS_MAX34440) += max34440.o obj-$(CONFIG_SENSORS_MAX8688) += max8688.o obj-$(CONFIG_SENSORS_TPS40422) += tps40422.o diff --git a/drivers/hwmon/pmbus/max31785.c b/drivers/hwmon/pmbus/max31785.c new file mode 100644 index 000000000000..9313849d5160 --- /dev/null +++ b/drivers/hwmon/pmbus/max31785.c @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "pmbus.h" + +enum max31785_regs { + MFR_REVISION = 0x9b, +}; + +#define MAX31785_NR_PAGES 23 + +#define MAX31785_FAN_FUNCS \ + (PMBUS_HAVE_FAN12 | PMBUS_HAVE_STATUS_FAN12) + +#define MAX31785_TEMP_FUNCS \ + (PMBUS_HAVE_TEMP | PMBUS_HAVE_STATUS_TEMP) + +#define MAX31785_VOUT_FUNCS \ + (PMBUS_HAVE_VOUT | PMBUS_HAVE_STATUS_VOUT) + +static const struct pmbus_driver_info max31785_info = { + .pages = MAX31785_NR_PAGES, + + /* RPM */ + .format[PSC_FAN] = direct, + .m[PSC_FAN] = 1, + .b[PSC_FAN] = 0, + .R[PSC_FAN] = 0, + .func[0] = MAX31785_FAN_FUNCS, + .func[1] = MAX31785_FAN_FUNCS, + .func[2] = MAX31785_FAN_FUNCS, + .func[3] = MAX31785_FAN_FUNCS, + .func[4] = MAX31785_FAN_FUNCS, + .func[5] = MAX31785_FAN_FUNCS, + + .format[PSC_TEMPERATURE] = direct, + .m[PSC_TEMPERATURE] = 1, + .b[PSC_TEMPERATURE] = 0, + .R[PSC_TEMPERATURE] = 2, + .func[6] = MAX31785_TEMP_FUNCS, + .func[7] = MAX31785_TEMP_FUNCS, + .func[8] = MAX31785_TEMP_FUNCS, + .func[9] = MAX31785_TEMP_FUNCS, + .func[10] = MAX31785_TEMP_FUNCS, + .func[11] = MAX31785_TEMP_FUNCS, + .func[12] = MAX31785_TEMP_FUNCS, + .func[13] = MAX31785_TEMP_FUNCS, + .func[14] = MAX31785_TEMP_FUNCS, + .func[15] = MAX31785_TEMP_FUNCS, + .func[16] = MAX31785_TEMP_FUNCS, + + .format[PSC_VOLTAGE_OUT] = direct, + .m[PSC_VOLTAGE_OUT] = 1, + .b[PSC_VOLTAGE_OUT] = 0, + .R[PSC_VOLTAGE_OUT] = 0, + .func[17] = MAX31785_VOUT_FUNCS, + .func[18] = MAX31785_VOUT_FUNCS, + .func[19] = MAX31785_VOUT_FUNCS, + .func[20] = MAX31785_VOUT_FUNCS, + .func[21] = MAX31785_VOUT_FUNCS, + .func[22] = MAX31785_VOUT_FUNCS, +}; + +static int max31785_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct device *dev = &client->dev; + struct pmbus_driver_info *info; + s64 ret; + + info = devm_kzalloc(dev, sizeof(struct pmbus_driver_info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + *info = max31785_info; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 255); + if (ret < 0) + return ret; + + return pmbus_do_probe(client, id, info); +} + +static const struct i2c_device_id max31785_id[] = { + { "max31785", 0 }, + { "max31785a", 0 }, + { }, +}; + +MODULE_DEVICE_TABLE(i2c, max31785_id); + +static struct i2c_driver max31785_driver = { + .driver = { + .name = "max31785", + }, + .probe = max31785_probe, + .remove = pmbus_do_remove, + .id_table = max31785_id, +}; + +module_i2c_driver(max31785_driver); + +MODULE_AUTHOR("Andrew Jeffery "); +MODULE_DESCRIPTION("PMBus driver for the Maxim MAX31785"); +MODULE_LICENSE("GPL"); From 34d8751fd4ffa34e85ee7e85d34168b3f3f62b42 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 3 Nov 2017 11:08:36 -0400 Subject: [PATCH 0920/1085] MAINTAINERS: update the IMA, EVM, trusted-keys, encrypted-keys entries Update the mailing list information. 
Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- MAINTAINERS | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2c43cfabd438..1b41b99be98e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5213,8 +5213,7 @@ F: fs/ext4/ Extended Verification Module (EVM) M: Mimi Zohar -L: linux-ima-devel@lists.sourceforge.net -L: linux-security-module@vger.kernel.org +L: linux-integrity@vger.kernel.org S: Supported F: security/integrity/evm/ @@ -6840,9 +6839,7 @@ L: linux-crypto@vger.kernel.org INTEGRITY MEASUREMENT ARCHITECTURE (IMA) M: Mimi Zohar M: Dmitry Kasatkin -L: linux-ima-devel@lists.sourceforge.net -L: linux-ima-user@lists.sourceforge.net -L: linux-security-module@vger.kernel.org +L: linux-integrity@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git S: Supported F: security/integrity/ima/ @@ -7625,7 +7622,7 @@ F: kernel/kexec* KEYS-ENCRYPTED M: Mimi Zohar -L: linux-security-module@vger.kernel.org +L: linux-integrity@vger.kernel.org L: keyrings@vger.kernel.org S: Supported F: Documentation/security/keys/trusted-encrypted.rst @@ -7634,7 +7631,7 @@ F: security/keys/encrypted-keys/ KEYS-TRUSTED M: Mimi Zohar -L: linux-security-module@vger.kernel.org +L: linux-integrity@vger.kernel.org L: keyrings@vger.kernel.org S: Supported F: Documentation/security/keys/trusted-encrypted.rst From b4d9421098f863bc7f92a7be8fb21e32fa5df99e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 31 Oct 2017 10:07:05 -0400 Subject: [PATCH 0921/1085] ftrace/docs: Add documentation on how to use ftrace from within the kernel With the coming removal of jprobes, using ftrace callbacks is one of the utilities that replace the jprobes functionality. Having a document that explains how to use ftrace as such will help in the transition from jprobes to ftrace. This document is for kernel developers who need to attach a callback to a function within the kernel. Link: http://lkml.kernel.org/r/150724519527.5014.10207042218696587159.stgit@devbox Signed-off-by: Steven Rostedt (VMware) [jc: fixed one formatting issue that broke the docs build] Signed-off-by: Jonathan Corbet --- Documentation/trace/ftrace-uses.rst | 293 ++++++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 Documentation/trace/ftrace-uses.rst diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst new file mode 100644 index 000000000000..8494a801d341 --- /dev/null +++ b/Documentation/trace/ftrace-uses.rst @@ -0,0 +1,293 @@ +================================= +Using ftrace to hook to functions +================================= + +.. Copyright 2017 VMware Inc. +.. Author: Steven Rostedt +.. License: The GNU Free Documentation License, Version 1.2 +.. (dual licensed under the GPL v2) + +Written for: 4.14 + +Introduction +============ + +The ftrace infrastructure was originally created to attach callbacks to the +beginning of functions in order to record and trace the flow of the kernel. +But callbacks to the start of a function can have other use cases, such as +live kernel patching or security monitoring. This document describes +how to use ftrace to implement your own function callbacks. + + +The ftrace context +================== + +WARNING: The ability to add a callback to almost any function within the +kernel comes with risks. A callback can be called from any context +(normal, softirq, irq, and NMI).
Callbacks can also be called just before +going to idle, during CPU bring up and takedown, or going to user space. +This requires extra care about what can be done inside a callback. A callback +can be called outside the protective scope of RCU. + +The ftrace infrastructure has some protections against recursion and RCU +but one must still be very careful how the callbacks are used. + + +The ftrace_ops structure +======================== + +To register a function callback, a ftrace_ops is required. This structure +is used to tell ftrace what function should be called as the callback +as well as what protections the callback performs itself and does not require +ftrace to handle. + +There is only one field that needs to be set when registering +an ftrace_ops with ftrace:: + +.. code-block: c + + struct ftrace_ops ops = { + .func = my_callback_func, + .flags = MY_FTRACE_FLAGS, + .private = any_private_data_structure, + }; + +Both .flags and .private are optional. Only .func is required. + +To enable tracing, call:: + +.. c:function:: register_ftrace_function(&ops); + +To disable tracing, call:: + +.. c:function:: unregister_ftrace_function(&ops); + +The above is defined by including the header:: + +.. c:function:: #include <linux/ftrace.h> + +The registered callback will start being called some time after the +register_ftrace_function() is called and before it returns. The exact time +that callbacks start being called is dependent upon architecture and scheduling +of services. The callback itself will have to handle any synchronization if it +must begin at an exact moment. + +The unregister_ftrace_function() will guarantee that the callback is +no longer being called by functions after the unregister_ftrace_function() +returns. Note that to provide this guarantee, the unregister_ftrace_function() +may take some time to finish. + + +The callback function +===================== + +The prototype of the callback function is as follows (as of v4.14):: + +.. code-block: c + + void callback_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); + +@ip + This is the instruction pointer of the function that is being traced. + (where the fentry or mcount is within the function) + +@parent_ip + This is the instruction pointer of the function that called the + function being traced (where the call of the function occurred). + +@op + This is a pointer to ftrace_ops that was used to register the callback. + This can be used to pass data to the callback via the private pointer. + +@regs + If the FTRACE_OPS_FL_SAVE_REGS or FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED + flags are set in the ftrace_ops structure, then this will be pointing + to the pt_regs structure like it would be if a breakpoint was placed + at the start of the function where ftrace was tracing. Otherwise it + either contains garbage or is NULL. + + +The ftrace FLAGS +================ + +The ftrace_ops flags are all defined and documented in include/linux/ftrace.h. +Some of the flags are used for internal infrastructure of ftrace, but the +ones that users should be aware of are the following: + +FTRACE_OPS_FL_SAVE_REGS + If the callback requires reading or modifying the pt_regs + passed to the callback, then it must set this flag. Registering + a ftrace_ops with this flag set on an architecture that does not + support passing of pt_regs to the callback will fail.
+ +FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED + Similar to SAVE_REGS but the registering of a + ftrace_ops on an architecture that does not support passing of regs + will not fail with this flag set. But the callback must check if + regs is NULL or not to determine if the architecture supports it. + +FTRACE_OPS_FL_RECURSION_SAFE + By default, a wrapper is added around the callback to + make sure that recursion of the function does not occur. That is, + if a function that is called as a result of the callback's execution + is also traced, ftrace will prevent the callback from being called + again. But this wrapper adds some overhead, and if the callback is + safe from recursion, it can set this flag to disable the ftrace + protection. + + Note, if this flag is set, and recursion does occur, it could cause + the system to crash, and possibly reboot via a triple fault. + + It is OK if another callback traces a function that is called by a + callback that is marked recursion safe. Recursion safe callbacks + must never trace any function that are called by the callback + itself or any nested functions that those functions call. + + If this flag is set, it is possible that the callback will also + be called with preemption enabled (when CONFIG_PREEMPT is set), + but this is not guaranteed. + +FTRACE_OPS_FL_IPMODIFY + Requires FTRACE_OPS_FL_SAVE_REGS set. If the callback is to "hijack" + the traced function (have another function called instead of the + traced function), it requires setting this flag. This is what live + kernel patches uses. Without this flag the pt_regs->ip can not be + modified. + + Note, only one ftrace_ops with FTRACE_OPS_FL_IPMODIFY set may be + registered to any given function at a time. + +FTRACE_OPS_FL_RCU + If this is set, then the callback will only be called by functions + where RCU is "watching". This is required if the callback function + performs any rcu_read_lock() operation. + + RCU stops watching when the system goes idle, the time when a CPU + is taken down and comes back online, and when entering from kernel + to user space and back to kernel space. During these transitions, + a callback may be executed and RCU synchronization will not protect + it. + + +Filtering which functions to trace +================================== + +If a callback is only to be called from specific functions, a filter must be +set up. The filters are added by name, or ip if it is known. + +.. code-block: c + + int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); + +@ops + The ops to set the filter with + +@buf + The string that holds the function filter text. +@len + The length of the string. + +@reset + Non-zero to reset all filters before applying this filter. + +Filters denote which functions should be enabled when tracing is enabled. +If @buf is NULL and reset is set, all functions will be enabled for tracing. + +The @buf can also be a glob expression to enable all functions that +match a specific pattern. + +See Filter Commands in :file:`Documentation/trace/ftrace.txt`. + +To just trace the schedule function:: + +.. code-block: c + + ret = ftrace_set_filter(&ops, "schedule", strlen("schedule"), 0); + +To add more functions, call the ftrace_set_filter() more than once with the +@reset parameter set to zero. To remove the current filter set and replace it +with new functions defined by @buf, have @reset be non-zero. + +To remove all the filtered functions and trace all functions:: + +.. 
From 8a0698c19e37019562d2504d4d724811d7fd411c Mon Sep 17 00:00:00 2001
From: Vinod Koul
Date: Fri, 3 Nov 2017 10:19:37 +0530
Subject: [PATCH 0922/1085] dmaengine: doc: Add ReST style dmaengine document

This removes the index file, adds index.rst as a placeholder, and
updates the driver-api index to add dmaengine.
As a consequence dmaengine documentation will be in driver-api/ Signed-off-by: Vinod Koul Signed-off-by: Jonathan Corbet --- Documentation/dmaengine/00-INDEX | 8 -------- Documentation/driver-api/dmaengine/index.rst | 13 +++++++++++++ Documentation/driver-api/index.rst | 1 + 3 files changed, 14 insertions(+), 8 deletions(-) delete mode 100644 Documentation/dmaengine/00-INDEX create mode 100644 Documentation/driver-api/dmaengine/index.rst diff --git a/Documentation/dmaengine/00-INDEX b/Documentation/dmaengine/00-INDEX deleted file mode 100644 index 07de6573d22b..000000000000 --- a/Documentation/dmaengine/00-INDEX +++ /dev/null @@ -1,8 +0,0 @@ -00-INDEX - - this file. -client.txt - -the DMA Engine API Guide. -dmatest.txt - - how to compile, configure and use the dmatest system. -provider.txt - - the DMA controller API. \ No newline at end of file diff --git a/Documentation/driver-api/dmaengine/index.rst b/Documentation/driver-api/dmaengine/index.rst new file mode 100644 index 000000000000..8c90a6443810 --- /dev/null +++ b/Documentation/driver-api/dmaengine/index.rst @@ -0,0 +1,13 @@ +======================= +DMAEngine documentation +======================= + +DMAEngine documentation provides documents for various aspects of DMAEngine +framework. + +.. only:: subproject + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 9c20624842b7..d17a9876b473 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -46,6 +46,7 @@ available subsections can be seen below. pinctl gpio misc_devices + dmaengine/index .. only:: subproject and html From 77fe661214d784878d666a226116057191de097b Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 3 Nov 2017 10:19:38 +0530 Subject: [PATCH 0923/1085] dmaengine: doc: ReSTize provider doc This moves and converts provider file with some format changes for RST style Signed-off-by: Vinod Koul Signed-off-by: Jonathan Corbet --- Documentation/dmaengine/provider.txt | 424 --------------- Documentation/driver-api/dmaengine/index.rst | 11 + .../driver-api/dmaengine/provider.rst | 508 ++++++++++++++++++ 3 files changed, 519 insertions(+), 424 deletions(-) delete mode 100644 Documentation/dmaengine/provider.txt create mode 100644 Documentation/driver-api/dmaengine/provider.rst diff --git a/Documentation/dmaengine/provider.txt b/Documentation/dmaengine/provider.txt deleted file mode 100644 index 5dbe054a40ad..000000000000 --- a/Documentation/dmaengine/provider.txt +++ /dev/null @@ -1,424 +0,0 @@ -DMAengine controller documentation -================================== - -Hardware Introduction -+++++++++++++++++++++ - -Most of the Slave DMA controllers have the same general principles of -operations. - -They have a given number of channels to use for the DMA transfers, and -a given number of requests lines. - -Requests and channels are pretty much orthogonal. Channels can be used -to serve several to any requests. To simplify, channels are the -entities that will be doing the copy, and requests what endpoints are -involved. - -The request lines actually correspond to physical lines going from the -DMA-eligible devices to the controller itself. Whenever the device -will want to start a transfer, it will assert a DMA request (DRQ) by -asserting that request line. - -A very simple DMA controller would only take into account a single -parameter: the transfer size. 
At each clock cycle, it would transfer a -byte of data from one buffer to another, until the transfer size has -been reached. - -That wouldn't work well in the real world, since slave devices might -require a specific number of bits to be transferred in a single -cycle. For example, we may want to transfer as much data as the -physical bus allows to maximize performances when doing a simple -memory copy operation, but our audio device could have a narrower FIFO -that requires data to be written exactly 16 or 24 bits at a time. This -is why most if not all of the DMA controllers can adjust this, using a -parameter called the transfer width. - -Moreover, some DMA controllers, whenever the RAM is used as a source -or destination, can group the reads or writes in memory into a buffer, -so instead of having a lot of small memory accesses, which is not -really efficient, you'll get several bigger transfers. This is done -using a parameter called the burst size, that defines how many single -reads/writes it's allowed to do without the controller splitting the -transfer into smaller sub-transfers. - -Our theoretical DMA controller would then only be able to do transfers -that involve a single contiguous block of data. However, some of the -transfers we usually have are not, and want to copy data from -non-contiguous buffers to a contiguous buffer, which is called -scatter-gather. - -DMAEngine, at least for mem2dev transfers, require support for -scatter-gather. So we're left with two cases here: either we have a -quite simple DMA controller that doesn't support it, and we'll have to -implement it in software, or we have a more advanced DMA controller, -that implements in hardware scatter-gather. - -The latter are usually programmed using a collection of chunks to -transfer, and whenever the transfer is started, the controller will go -over that collection, doing whatever we programmed there. - -This collection is usually either a table or a linked list. You will -then push either the address of the table and its number of elements, -or the first item of the list to one channel of the DMA controller, -and whenever a DRQ will be asserted, it will go through the collection -to know where to fetch the data from. - -Either way, the format of this collection is completely dependent on -your hardware. Each DMA controller will require a different structure, -but all of them will require, for every chunk, at least the source and -destination addresses, whether it should increment these addresses or -not and the three parameters we saw earlier: the burst size, the -transfer width and the transfer size. - -The one last thing is that usually, slave devices won't issue DRQ by -default, and you have to enable this in your slave device driver first -whenever you're willing to use DMA. - -These were just the general memory-to-memory (also called mem2mem) or -memory-to-device (mem2dev) kind of transfers. Most devices often -support other kind of transfers or memory operations that dmaengine -support and will be detailed later in this document. - -DMA Support in Linux -++++++++++++++++++++ - -Historically, DMA controller drivers have been implemented using the -async TX API, to offload operations such as memory copy, XOR, -cryptography, etc., basically any memory to memory operation. - -Over time, the need for memory to device transfers arose, and -dmaengine was extended. Nowadays, the async TX API is written as a -layer on top of dmaengine, and acts as a client. 
Still, dmaengine -accommodates that API in some cases, and made some design choices to -ensure that it stayed compatible. - -For more information on the Async TX API, please look the relevant -documentation file in Documentation/crypto/async-tx-api.txt. - -DMAEngine Registration -++++++++++++++++++++++ - -struct dma_device Initialization --------------------------------- - -Just like any other kernel framework, the whole DMAEngine registration -relies on the driver filling a structure and registering against the -framework. In our case, that structure is dma_device. - -The first thing you need to do in your driver is to allocate this -structure. Any of the usual memory allocators will do, but you'll also -need to initialize a few fields in there: - - * channels: should be initialized as a list using the - INIT_LIST_HEAD macro for example - - * src_addr_widths: - - should contain a bitmask of the supported source transfer width - - * dst_addr_widths: - - should contain a bitmask of the supported destination transfer - width - - * directions: - - should contain a bitmask of the supported slave directions - (i.e. excluding mem2mem transfers) - - * residue_granularity: - - Granularity of the transfer residue reported to dma_set_residue. - - This can be either: - + Descriptor - -> Your device doesn't support any kind of residue - reporting. The framework will only know that a particular - transaction descriptor is done. - + Segment - -> Your device is able to report which chunks have been - transferred - + Burst - -> Your device is able to report which burst have been - transferred - - * dev: should hold the pointer to the struct device associated - to your current driver instance. - -Supported transaction types ---------------------------- - -The next thing you need is to set which transaction types your device -(and driver) supports. - -Our dma_device structure has a field called cap_mask that holds the -various types of transaction supported, and you need to modify this -mask using the dma_cap_set function, with various flags depending on -transaction types you support as an argument. - -All those capabilities are defined in the dma_transaction_type enum, -in include/linux/dmaengine.h - -Currently, the types available are: - * DMA_MEMCPY - - The device is able to do memory to memory copies - - * DMA_XOR - - The device is able to perform XOR operations on memory areas - - Used to accelerate XOR intensive tasks, such as RAID5 - - * DMA_XOR_VAL - - The device is able to perform parity check using the XOR - algorithm against a memory buffer. - - * DMA_PQ - - The device is able to perform RAID6 P+Q computations, P being a - simple XOR, and Q being a Reed-Solomon algorithm. - - * DMA_PQ_VAL - - The device is able to perform parity check using RAID6 P+Q - algorithm against a memory buffer. - - * DMA_INTERRUPT - - The device is able to trigger a dummy transfer that will - generate periodic interrupts - - Used by the client drivers to register a callback that will be - called on a regular basis through the DMA controller interrupt - - * DMA_PRIVATE - - The devices only supports slave transfers, and as such isn't - available for async transfers. - - * DMA_ASYNC_TX - - Must not be set by the device, and will be set by the framework - if needed - - /* TODO: What is it about? */ - - * DMA_SLAVE - - The device can handle device to memory transfers, including - scatter-gather transfers. 
- - While in the mem2mem case we were having two distinct types to - deal with a single chunk to copy or a collection of them, here, - we just have a single transaction type that is supposed to - handle both. - - If you want to transfer a single contiguous memory buffer, - simply build a scatter list with only one item. - - * DMA_CYCLIC - - The device can handle cyclic transfers. - - A cyclic transfer is a transfer where the chunk collection will - loop over itself, with the last item pointing to the first. - - It's usually used for audio transfers, where you want to operate - on a single ring buffer that you will fill with your audio data. - - * DMA_INTERLEAVE - - The device supports interleaved transfer. - - These transfers can transfer data from a non-contiguous buffer - to a non-contiguous buffer, opposed to DMA_SLAVE that can - transfer data from a non-contiguous data set to a continuous - destination buffer. - - It's usually used for 2d content transfers, in which case you - want to transfer a portion of uncompressed data directly to the - display to print it - -These various types will also affect how the source and destination -addresses change over time. - -Addresses pointing to RAM are typically incremented (or decremented) -after each transfer. In case of a ring buffer, they may loop -(DMA_CYCLIC). Addresses pointing to a device's register (e.g. a FIFO) -are typically fixed. - -Device operations ------------------ - -Our dma_device structure also requires a few function pointers in -order to implement the actual logic, now that we described what -operations we were able to perform. - -The functions that we have to fill in there, and hence have to -implement, obviously depend on the transaction types you reported as -supported. - - * device_alloc_chan_resources - * device_free_chan_resources - - These functions will be called whenever a driver will call - dma_request_channel or dma_release_channel for the first/last - time on the channel associated to that driver. - - They are in charge of allocating/freeing all the needed - resources in order for that channel to be useful for your - driver. - - These functions can sleep. - - * device_prep_dma_* - - These functions are matching the capabilities you registered - previously. - - These functions all take the buffer or the scatterlist relevant - for the transfer being prepared, and should create a hardware - descriptor or a list of hardware descriptors from it - - These functions can be called from an interrupt context - - Any allocation you might do should be using the GFP_NOWAIT - flag, in order not to potentially sleep, but without depleting - the emergency pool either. - - Drivers should try to pre-allocate any memory they might need - during the transfer setup at probe time to avoid putting to - much pressure on the nowait allocator. - - - It should return a unique instance of the - dma_async_tx_descriptor structure, that further represents this - particular transfer. - - - This structure can be initialized using the function - dma_async_tx_descriptor_init. - - You'll also need to set two fields in this structure: - + flags: - TODO: Can it be modified by the driver itself, or - should it be always the flags passed in the arguments - - + tx_submit: A pointer to a function you have to implement, - that is supposed to push the current - transaction descriptor to a pending queue, waiting - for issue_pending to be called. 
- - In this structure the function pointer callback_result can be - initialized in order for the submitter to be notified that a - transaction has completed. In the earlier code the function pointer - callback has been used. However it does not provide any status to the - transaction and will be deprecated. The result structure defined as - dmaengine_result that is passed in to callback_result has two fields: - + result: This provides the transfer result defined by - dmaengine_tx_result. Either success or some error - condition. - + residue: Provides the residue bytes of the transfer for those that - support residue. - - * device_issue_pending - - Takes the first transaction descriptor in the pending queue, - and starts the transfer. Whenever that transfer is done, it - should move to the next transaction in the list. - - This function can be called in an interrupt context - - * device_tx_status - - Should report the bytes left to go over on the given channel - - Should only care about the transaction descriptor passed as - argument, not the currently active one on a given channel - - The tx_state argument might be NULL - - Should use dma_set_residue to report it - - In the case of a cyclic transfer, it should only take into - account the current period. - - This function can be called in an interrupt context. - - * device_config - - Reconfigures the channel with the configuration given as - argument - - This command should NOT perform synchronously, or on any - currently queued transfers, but only on subsequent ones - - In this case, the function will receive a dma_slave_config - structure pointer as an argument, that will detail which - configuration to use. - - Even though that structure contains a direction field, this - field is deprecated in favor of the direction argument given to - the prep_* functions - - This call is mandatory for slave operations only. This should NOT be - set or expected to be set for memcpy operations. - If a driver support both, it should use this call for slave - operations only and not for memcpy ones. - - * device_pause - - Pauses a transfer on the channel - - This command should operate synchronously on the channel, - pausing right away the work of the given channel - - * device_resume - - Resumes a transfer on the channel - - This command should operate synchronously on the channel, - resuming right away the work of the given channel - - * device_terminate_all - - Aborts all the pending and ongoing transfers on the channel - - For aborted transfers the complete callback should not be called - - Can be called from atomic context or from within a complete - callback of a descriptor. Must not sleep. Drivers must be able - to handle this correctly. - - Termination may be asynchronous. The driver does not have to - wait until the currently active transfer has completely stopped. - See device_synchronize. - - * device_synchronize - - Must synchronize the termination of a channel to the current - context. - - Must make sure that memory for previously submitted - descriptors is no longer accessed by the DMA controller. - - Must make sure that all complete callbacks for previously - submitted descriptors have finished running and none are - scheduled to run. - - May sleep. - - -Misc notes (stuff that should be documented, but don't really know -where to put them) ------------------------------------------------------------------- - * dma_run_dependencies - - Should be called at the end of an async TX transfer, and can be - ignored in the slave transfers case. 
- - Makes sure that dependent operations are run before marking it - as complete. - - * dma_cookie_t - - it's a DMA transaction ID that will increment over time. - - Not really relevant any more since the introduction of virt-dma - that abstracts it away. - - * DMA_CTRL_ACK - - If clear, the descriptor cannot be reused by provider until the - client acknowledges receipt, i.e. has has a chance to establish any - dependency chains - - This can be acked by invoking async_tx_ack() - - If set, does not mean descriptor can be reused - - * DMA_CTRL_REUSE - - If set, the descriptor can be reused after being completed. It should - not be freed by provider if this flag is set. - - The descriptor should be prepared for reuse by invoking - dmaengine_desc_set_reuse() which will set DMA_CTRL_REUSE. - - dmaengine_desc_set_reuse() will succeed only when channel support - reusable descriptor as exhibited by capabilities - - As a consequence, if a device driver wants to skip the dma_map_sg() and - dma_unmap_sg() in between 2 transfers, because the DMA'd data wasn't used, - it can resubmit the transfer right after its completion. - - Descriptor can be freed in few ways - - Clearing DMA_CTRL_REUSE by invoking dmaengine_desc_clear_reuse() - and submitting for last txn - - Explicitly invoking dmaengine_desc_free(), this can succeed only - when DMA_CTRL_REUSE is already set - - Terminating the channel - - * DMA_PREP_CMD - - If set, the client driver tells DMA controller that passed data in DMA - API is command data. - - Interpretation of command data is DMA controller specific. It can be - used for issuing commands to other peripherals/register reads/register - writes for which the descriptor should be in different format from - normal data descriptors. - -General Design Notes --------------------- - -Most of the DMAEngine drivers you'll see are based on a similar design -that handles the end of transfer interrupts in the handler, but defer -most work to a tasklet, including the start of a new transfer whenever -the previous transfer ended. - -This is a rather inefficient design though, because the inter-transfer -latency will be not only the interrupt latency, but also the -scheduling latency of the tasklet, which will leave the channel idle -in between, which will slow down the global transfer rate. - -You should avoid this kind of practice, and instead of electing a new -transfer in your tasklet, move that part to the interrupt handler in -order to have a shorter idle window (that we can't really avoid -anyway). - -Glossary --------- - -Burst: A number of consecutive read or write operations - that can be queued to buffers before being flushed to - memory. -Chunk: A contiguous collection of bursts -Transfer: A collection of chunks (be it contiguous or not) diff --git a/Documentation/driver-api/dmaengine/index.rst b/Documentation/driver-api/dmaengine/index.rst index 8c90a6443810..eee0377e0d7b 100644 --- a/Documentation/driver-api/dmaengine/index.rst +++ b/Documentation/driver-api/dmaengine/index.rst @@ -5,6 +5,17 @@ DMAEngine documentation DMAEngine documentation provides documents for various aspects of DMAEngine framework. +DMAEngine documentation +----------------------- + +This book helps with DMAengine internal APIs and guide for DMAEngine device +driver writers. + +.. toctree:: + :maxdepth: 1 + + provider + .. 
only:: subproject

   Indices

diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst
new file mode 100644
index 000000000000..814acb4d2294
--- /dev/null
+++ b/Documentation/driver-api/dmaengine/provider.rst
@@ -0,0 +1,508 @@
+==================================
+DMAengine controller documentation
+==================================
+
+Hardware Introduction
+=====================
+
+Most of the Slave DMA controllers have the same general principles of
+operation.
+
+They have a given number of channels to use for the DMA transfers, and
+a given number of request lines.
+
+Requests and channels are pretty much orthogonal. Channels can be used
+to serve any number of requests. To simplify, channels are the
+entities that will be doing the copy, and requests what endpoints are
+involved.
+
+The request lines actually correspond to physical lines going from the
+DMA-eligible devices to the controller itself. Whenever the device
+wants to start a transfer, it will assert a DMA request (DRQ) by
+asserting that request line.
+
+A very simple DMA controller would only take into account a single
+parameter: the transfer size. At each clock cycle, it would transfer a
+byte of data from one buffer to another, until the transfer size has
+been reached.
+
+That wouldn't work well in the real world, since slave devices might
+require a specific number of bits to be transferred in a single
+cycle. For example, we may want to transfer as much data as the
+physical bus allows to maximize performance when doing a simple
+memory copy operation, but our audio device could have a narrower FIFO
+that requires data to be written exactly 16 or 24 bits at a time. This
+is why most if not all of the DMA controllers can adjust this, using a
+parameter called the transfer width.
+
+Moreover, some DMA controllers, whenever the RAM is used as a source
+or destination, can group the reads or writes in memory into a buffer,
+so instead of having a lot of small memory accesses, which is not
+really efficient, you'll get several bigger transfers. This is done
+using a parameter called the burst size, that defines how many single
+reads/writes it's allowed to do without the controller splitting the
+transfer into smaller sub-transfers.
+
+Our theoretical DMA controller would then only be able to do transfers
+that involve a single contiguous block of data. However, some of the
+transfers we usually have are not contiguous, and we want to copy data
+from non-contiguous buffers to a contiguous buffer, which is called
+scatter-gather.
+
+DMAEngine, at least for mem2dev transfers, requires support for
+scatter-gather. So we're left with two cases here: either we have a
+quite simple DMA controller that doesn't support it, and we'll have to
+implement it in software, or we have a more advanced DMA controller
+that implements scatter-gather in hardware.
+
+The latter are usually programmed using a collection of chunks to
+transfer, and whenever the transfer is started, the controller will go
+over that collection, doing whatever we programmed there.
+
+This collection is usually either a table or a linked list. You will
+then push either the address of the table and its number of elements,
+or the first item of the list to one channel of the DMA controller,
+and whenever a DRQ is asserted, it will go through the collection
+to know where to fetch the data from.
+
+Either way, the format of this collection is completely dependent on
+your hardware.
+Each DMA controller will require a different structure,
+but all of them will require, for every chunk, at least the source and
+destination addresses, whether it should increment these addresses or
+not, and the three parameters we saw earlier: the burst size, the
+transfer width and the transfer size.
+
+One last thing: usually, slave devices won't issue DRQs by default;
+you first have to enable this in your slave device driver whenever
+you want to use DMA.
+
+These were just the general memory-to-memory (also called mem2mem) or
+memory-to-device (mem2dev) kinds of transfers. Most devices also
+support other kinds of transfers or memory operations that dmaengine
+supports; these will be detailed later in this document.
+
+DMA Support in Linux
+====================
+
+Historically, DMA controller drivers have been implemented using the
+async TX API, to offload operations such as memory copy, XOR,
+cryptography, etc., basically any memory to memory operation.
+
+Over time, the need for memory to device transfers arose, and
+dmaengine was extended. Nowadays, the async TX API is written as a
+layer on top of dmaengine, and acts as a client. Still, dmaengine
+accommodates that API in some cases, and made some design choices to
+ensure that it stayed compatible.
+
+For more information on the Async TX API, please look at the relevant
+documentation file in Documentation/crypto/async-tx-api.txt.
+
+DMAEngine APIs
+==============
+
+``struct dma_device`` Initialization
+------------------------------------
+
+Just like any other kernel framework, the whole DMAEngine registration
+relies on the driver filling a structure and registering against the
+framework. In our case, that structure is dma_device.
+
+The first thing you need to do in your driver is to allocate this
+structure. Any of the usual memory allocators will do, but you'll also
+need to initialize a few fields in there:
+
+- channels: should be initialized as a list using the
+  INIT_LIST_HEAD macro for example
+
+- src_addr_widths:
+  should contain a bitmask of the supported source transfer widths
+
+- dst_addr_widths:
+  should contain a bitmask of the supported destination transfer widths
+
+- directions:
+  should contain a bitmask of the supported slave directions
+  (i.e. excluding mem2mem transfers)
+
+- residue_granularity:
+
+  - Granularity of the transfer residue reported to dma_set_residue.
+    This can be either:
+
+    - Descriptor
+
+      - Your device doesn't support any kind of residue
+        reporting. The framework will only know that a particular
+        transaction descriptor is done.
+
+    - Segment
+
+      - Your device is able to report which chunks have been transferred
+
+    - Burst
+
+      - Your device is able to report which bursts have been transferred
+
+- dev: should hold the pointer to the ``struct device`` associated
+  with your current driver instance.
+
+Supported transaction types
+---------------------------
+
+The next thing you need is to set which transaction types your device
+(and driver) supports.
+
+Our ``dma_device`` structure has a field called cap_mask that holds the
+various types of transaction supported, and you need to modify this
+mask using the dma_cap_set function, with the various flags for the
+transaction types you support as arguments.
+
+All those capabilities are defined in the ``dma_transaction_type`` enum,
+in ``include/linux/dmaengine.h``.
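+As a rough illustration, this initialization might look like the
+following sketch in a driver probe routine (``foo_dev``, ``pdev`` and the
+chosen widths are hypothetical assumptions, not framework requirements):
+
+.. code-block:: c
+
+   struct dma_device *dd = &foo_dev->ddev;
+
+   INIT_LIST_HEAD(&dd->channels);
+   dd->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
+   dd->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
+   dd->directions = BIT(DMA_MEM_TO_DEV) | BIT(DMA_DEV_TO_MEM);
+   dd->residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
+   dd->dev = &pdev->dev;
+
+   /* Advertise slave transfer capability. */
+   dma_cap_set(DMA_SLAVE, dd->cap_mask);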
+Currently, the types available are:
+
+- DMA_MEMCPY
+
+  - The device is able to do memory to memory copies
+
+- DMA_XOR
+
+  - The device is able to perform XOR operations on memory areas
+
+  - Used to accelerate XOR intensive tasks, such as RAID5
+
+- DMA_XOR_VAL
+
+  - The device is able to perform parity check using the XOR
+    algorithm against a memory buffer.
+
+- DMA_PQ
+
+  - The device is able to perform RAID6 P+Q computations, P being a
+    simple XOR, and Q being a Reed-Solomon algorithm.
+
+- DMA_PQ_VAL
+
+  - The device is able to perform parity check using the RAID6 P+Q
+    algorithm against a memory buffer.
+
+- DMA_INTERRUPT
+
+  - The device is able to trigger a dummy transfer that will
+    generate periodic interrupts
+
+  - Used by the client drivers to register a callback that will be
+    called on a regular basis through the DMA controller interrupt
+
+- DMA_PRIVATE
+
+  - The device only supports slave transfers, and as such isn't
+    available for async transfers.
+
+- DMA_ASYNC_TX
+
+  - Must not be set by the device, and will be set by the framework
+    if needed
+
+  - TODO: What is it about?
+
+- DMA_SLAVE
+
+  - The device can handle device to memory transfers, including
+    scatter-gather transfers.
+
+  - While in the mem2mem case we had two distinct types to
+    deal with a single chunk to copy or a collection of them, here
+    we just have a single transaction type that is supposed to
+    handle both.
+
+  - If you want to transfer a single contiguous memory buffer,
+    simply build a scatter list with only one item.
+
+- DMA_CYCLIC
+
+  - The device can handle cyclic transfers.
+
+  - A cyclic transfer is a transfer where the chunk collection will
+    loop over itself, with the last item pointing to the first.
+
+  - It's usually used for audio transfers, where you want to operate
+    on a single ring buffer that you will fill with your audio data.
+
+- DMA_INTERLEAVE
+
+  - The device supports interleaved transfers.
+
+  - These transfers can transfer data from a non-contiguous buffer
+    to a non-contiguous buffer, as opposed to DMA_SLAVE, which can
+    transfer data from a non-contiguous data set to a contiguous
+    destination buffer.
+
+  - It's usually used for 2d content transfers, in which case you
+    want to transfer a portion of uncompressed data directly to the
+    display.
+
+These various types will also affect how the source and destination
+addresses change over time.
+
+Addresses pointing to RAM are typically incremented (or decremented)
+after each transfer. In case of a ring buffer, they may loop
+(DMA_CYCLIC). Addresses pointing to a device's register (e.g. a FIFO)
+are typically fixed.
+Device operations
+-----------------
+
+Our dma_device structure also requires a few function pointers in
+order to implement the actual logic, now that we have described the
+operations we can perform.
+
+The functions that we have to fill in there, and hence have to
+implement, obviously depend on the transaction types you reported as
+supported.
+
+- ``device_alloc_chan_resources``
+
+- ``device_free_chan_resources``
+
+  - These functions will be called whenever a driver calls
+    ``dma_request_channel`` or ``dma_release_channel`` for the first/last
+    time on the channel associated with that driver.
+
+  - They are in charge of allocating/freeing all the needed
+    resources in order for that channel to be useful for your driver.
+
+  - These functions can sleep.
+
+- ``device_prep_dma_*``
+
+  - These functions match the capabilities you registered
+    previously.
+
+  - These functions all take the buffer or the scatterlist relevant
+    for the transfer being prepared, and should create a hardware
+    descriptor or a list of hardware descriptors from it
+
+  - These functions can be called from an interrupt context
+
+  - Any allocation you might do should be using the GFP_NOWAIT
+    flag, in order not to potentially sleep, but without depleting
+    the emergency pool either.
+
+  - Drivers should try to pre-allocate any memory they might need
+    during the transfer setup at probe time to avoid putting too
+    much pressure on the nowait allocator.
+
+  - It should return a unique instance of the
+    ``dma_async_tx_descriptor`` structure, that further represents this
+    particular transfer.
+
+  - This structure can be initialized using the function
+    ``dma_async_tx_descriptor_init``.
+
+  - You'll also need to set two fields in this structure:
+
+    - flags:
+      TODO: Can it be modified by the driver itself, or
+      should it be always the flags passed in the arguments
+
+    - tx_submit: A pointer to a function you have to implement,
+      that is supposed to push the current transaction descriptor to a
+      pending queue, waiting for issue_pending to be called.
+
+  - In this structure the function pointer callback_result can be
+    initialized in order for the submitter to be notified that a
+    transaction has completed. Earlier code used the function pointer
+    callback; however, it does not provide any status for the
+    transaction and will be deprecated. The result structure defined as
+    ``dmaengine_result`` that is passed in to callback_result
+    has two fields:
+
+    - result: This provides the transfer result defined by
+      ``dmaengine_tx_result``. Either success or some error condition.
+
+    - residue: Provides the residue bytes of the transfer for those that
+      support residue.
+
+- ``device_issue_pending``
+
+  - Takes the first transaction descriptor in the pending queue,
+    and starts the transfer. Whenever that transfer is done, it
+    should move to the next transaction in the list.
+
+  - This function can be called in an interrupt context
+
+- ``device_tx_status``
+
+  - Should report the bytes left to go over on the given channel
+
+  - Should only care about the transaction descriptor passed as
+    argument, not the currently active one on a given channel
+
+  - The tx_state argument might be NULL
+
+  - Should use dma_set_residue to report it
+
+  - In the case of a cyclic transfer, it should only take into
+    account the current period.
+
+  - This function can be called in an interrupt context.
+
+- ``device_config``
+
+  - Reconfigures the channel with the configuration given as argument
+
+  - This command should NOT be performed synchronously, nor on any
+    currently queued transfers, but only on subsequent ones
+
+  - In this case, the function will receive a ``dma_slave_config``
+    structure pointer as an argument, that will detail which
+    configuration to use.
+
+  - Even though that structure contains a direction field, this
+    field is deprecated in favor of the direction argument given to
+    the prep_* functions
+
+  - This call is mandatory for slave operations only. This should NOT be
+    set or expected to be set for memcpy operations.
+    If a driver supports both, it should use this call for slave
+    operations only and not for memcpy ones.
+
+- ``device_pause``
+
+  - Pauses a transfer on the channel
+
+  - This command should operate synchronously on the channel,
+    pausing right away the work of the given channel
+
+- ``device_resume``
+
+  - Resumes a transfer on the channel
+
+  - This command should operate synchronously on the channel,
+    resuming right away the work of the given channel
+
+- ``device_terminate_all``
+
+  - Aborts all the pending and ongoing transfers on the channel
+
+  - For aborted transfers the complete callback should not be called
+
+  - Can be called from atomic context or from within a complete
+    callback of a descriptor. Must not sleep. Drivers must be able
+    to handle this correctly.
+
+  - Termination may be asynchronous. The driver does not have to
+    wait until the currently active transfer has completely stopped.
+    See device_synchronize.
+
+- ``device_synchronize``
+
+  - Must synchronize the termination of a channel to the current
+    context.
+
+  - Must make sure that memory for previously submitted
+    descriptors is no longer accessed by the DMA controller.
+
+  - Must make sure that all complete callbacks for previously
+    submitted descriptors have finished running and none are
+    scheduled to run.
+
+  - May sleep.
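+Tying these together, a provider might wire its implementations into the
+``dma_device`` as in this sketch (the ``foo_*`` functions are hypothetical
+placeholders for the driver's own implementations):
+
+.. code-block:: c
+
+   dd->device_alloc_chan_resources = foo_alloc_chan_resources;
+   dd->device_free_chan_resources = foo_free_chan_resources;
+   dd->device_prep_slave_sg = foo_prep_slave_sg;
+   dd->device_issue_pending = foo_issue_pending;
+   dd->device_tx_status = foo_tx_status;
+   dd->device_config = foo_config;
+   dd->device_pause = foo_pause;
+   dd->device_resume = foo_resume;
+   dd->device_terminate_all = foo_terminate_all;
+   dd->device_synchronize = foo_synchronize;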
+
+Misc notes
+==========
+
+(stuff that should be documented, but we don't really know
+where to put it)
+
+``dma_run_dependencies``
+
+- Should be called at the end of an async TX transfer, and can be
+  ignored in the slave transfers case.
+
+- Makes sure that dependent operations are run before marking it
+  as complete.
+
+dma_cookie_t
+
+- It's a DMA transaction ID that will increment over time.
+
+- Not really relevant any more since the introduction of ``virt-dma``,
+  which abstracts it away.
+
+DMA_CTRL_ACK
+
+- If clear, the descriptor cannot be reused by the provider until the
+  client acknowledges receipt, i.e. has had a chance to establish any
+  dependency chains
+
+- This can be acked by invoking async_tx_ack()
+
+- If set, it does not mean the descriptor can be reused
+
+DMA_CTRL_REUSE
+
+- If set, the descriptor can be reused after being completed. It should
+  not be freed by the provider if this flag is set.
+
+- The descriptor should be prepared for reuse by invoking
+  ``dmaengine_desc_set_reuse()`` which will set DMA_CTRL_REUSE.
+
+- ``dmaengine_desc_set_reuse()`` will succeed only when the channel
+  supports reusable descriptors, as exhibited by its capabilities
+
+- As a consequence, if a device driver wants to skip the
+  ``dma_map_sg()`` and ``dma_unmap_sg()`` in between 2 transfers,
+  because the DMA'd data wasn't used, it can resubmit the transfer right after
+  its completion.
+
+- A descriptor can be freed in a few ways
+
+  - Clearing DMA_CTRL_REUSE by invoking
+    ``dmaengine_desc_clear_reuse()`` and submitting it for the last
+    transaction
+
+  - Explicitly invoking ``dmaengine_desc_free()``; this can succeed only
+    when DMA_CTRL_REUSE is already set
+
+  - Terminating the channel
+
+DMA_PREP_CMD
+
+- If set, the client driver tells the DMA controller that the data
+  passed to the DMA API is command data.
+
+- Interpretation of command data is DMA controller specific. It can be
+  used for issuing commands to other peripherals/register reads/register
+  writes for which the descriptor should be in a different format from
+  normal data descriptors.
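+As an illustrative sketch of the reuse flow described under DMA_CTRL_REUSE
+above (``desc`` is assumed to be a completed descriptor on a channel that
+advertises descriptor reuse; error handling is elided):
+
+.. code-block:: c
+
+   ret = dmaengine_desc_set_reuse(desc);    /* mark it reusable */
+   if (!ret)
+           cookie = dmaengine_submit(desc); /* resubmit after completion */
+
+   /* When done with it, free it explicitly (DMA_CTRL_REUSE still set): */
+   ret = dmaengine_desc_free(desc);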
+General Design Notes
+====================
+
+Most of the DMAEngine drivers you'll see are based on a similar design
+that handles the end of transfer interrupts in the handler, but defers
+most work to a tasklet, including the start of a new transfer whenever
+the previous transfer ended.
+
+This is a rather inefficient design though, because the inter-transfer
+latency will be not only the interrupt latency, but also the
+scheduling latency of the tasklet, which will leave the channel idle
+in between, slowing down the global transfer rate.
+
+You should avoid this kind of practice, and instead of selecting a new
+transfer in your tasklet, move that part to the interrupt handler in
+order to have a shorter idle window (that we can't really avoid
+anyway).
+
+Glossary
+========
+
+- Burst: A number of consecutive read or write operations that
+  can be queued to buffers before being flushed to memory.
+
+- Chunk: A contiguous collection of bursts
+
+- Transfer: A collection of chunks (be it contiguous or not)

From eeb1c6435293e9d3c30b21579bd5154c013f2843 Mon Sep 17 00:00:00 2001
From: Vinod Koul
Date: Fri, 3 Nov 2017 10:19:39 +0530
Subject: [PATCH 0924/1085] dmaengine: doc: ReSTize client API doc

This converts and moves the client API file with some format changes
for RST style.

Signed-off-by: Vinod Koul
Signed-off-by: Jonathan Corbet
---
 Documentation/dmaengine/client.txt | 222 --------------
 Documentation/driver-api/dmaengine/client.rst | 275 ++++++++++++++++++
 Documentation/driver-api/dmaengine/index.rst | 11 +
 3 files changed, 286 insertions(+), 222 deletions(-)
 delete mode 100644 Documentation/dmaengine/client.txt
 create mode 100644 Documentation/driver-api/dmaengine/client.rst
diff --git a/Documentation/dmaengine/client.txt b/Documentation/dmaengine/client.txt
deleted file mode 100644
index c72b4563de10..000000000000
--- a/Documentation/dmaengine/client.txt
+++ /dev/null
@@ -1,222 +0,0 @@
- DMA Engine API Guide
- ====================
-
- Vinod Koul
-
-NOTE: For DMA Engine usage in async_tx please see:
- Documentation/crypto/async-tx-api.txt
-
-
-Below is a guide to device driver writers on how to use the Slave-DMA API of the
-DMA Engine. This is applicable only for slave DMA usage only.
-
-The slave DMA usage consists of following steps:
-1. Allocate a DMA slave channel
-2. Set slave and controller specific parameters
-3. Get a descriptor for transaction
-4. Submit the transaction
-5. Issue pending requests and wait for callback notification
-
-
- - Interface: - int dmaengine_slave_config(struct dma_chan *chan, - struct dma_slave_config *config) - - Please see the dma_slave_config structure definition in dmaengine.h - for a detailed explanation of the struct members. Please note - that the 'direction' member will be going away as it duplicates the - direction given in the prepare call. - -3. Get a descriptor for transaction - - For slave usage the various modes of slave transfers supported by the - DMA-engine are: - - slave_sg - DMA a list of scatter gather buffers from/to a peripheral - dma_cyclic - Perform a cyclic DMA operation from/to a peripheral till the - operation is explicitly stopped. - interleaved_dma - This is common to Slave as well as M2M clients. For slave - address of devices' fifo could be already known to the driver. - Various types of operations could be expressed by setting - appropriate values to the 'dma_interleaved_template' members. - - A non-NULL return of this transfer API represents a "descriptor" for - the given transaction. - - Interface: - struct dma_async_tx_descriptor *dmaengine_prep_slave_sg( - struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, - unsigned long flags); - - struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic( - struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, - size_t period_len, enum dma_data_direction direction); - - struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma( - struct dma_chan *chan, struct dma_interleaved_template *xt, - unsigned long flags); - - The peripheral driver is expected to have mapped the scatterlist for - the DMA operation prior to calling dmaengine_prep_slave_sg(), and must - keep the scatterlist mapped until the DMA operation has completed. - The scatterlist must be mapped using the DMA struct device. - If a mapping needs to be synchronized later, dma_sync_*_for_*() must be - called using the DMA struct device, too. - So, normal setup should look like this: - - nr_sg = dma_map_sg(chan->device->dev, sgl, sg_len); - if (nr_sg == 0) - /* error */ - - desc = dmaengine_prep_slave_sg(chan, sgl, nr_sg, direction, flags); - - Once a descriptor has been obtained, the callback information can be - added and the descriptor must then be submitted. Some DMA engine - drivers may hold a spinlock between a successful preparation and - submission so it is important that these two operations are closely - paired. - - Note: - Although the async_tx API specifies that completion callback - routines cannot submit any new operations, this is not the - case for slave/cyclic DMA. - - For slave DMA, the subsequent transaction may not be available - for submission prior to callback function being invoked, so - slave DMA callbacks are permitted to prepare and submit a new - transaction. - - For cyclic DMA, a callback function may wish to terminate the - DMA via dmaengine_terminate_async(). - - Therefore, it is important that DMA engine drivers drop any - locks before calling the callback function which may cause a - deadlock. - - Note that callbacks will always be invoked from the DMA - engines tasklet, never from interrupt context. - -4. Submit the transaction - - Once the descriptor has been prepared and the callback information - added, it must be placed on the DMA engine drivers pending queue. 
- - Interface: - dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc) - - This returns a cookie can be used to check the progress of DMA engine - activity via other DMA engine calls not covered in this document. - - dmaengine_submit() will not start the DMA operation, it merely adds - it to the pending queue. For this, see step 5, dma_async_issue_pending. - -5. Issue pending DMA requests and wait for callback notification - - The transactions in the pending queue can be activated by calling the - issue_pending API. If channel is idle then the first transaction in - queue is started and subsequent ones queued up. - - On completion of each DMA operation, the next in queue is started and - a tasklet triggered. The tasklet will then call the client driver - completion callback routine for notification, if set. - - Interface: - void dma_async_issue_pending(struct dma_chan *chan); - -Further APIs: - -1. int dmaengine_terminate_sync(struct dma_chan *chan) - int dmaengine_terminate_async(struct dma_chan *chan) - int dmaengine_terminate_all(struct dma_chan *chan) /* DEPRECATED */ - - This causes all activity for the DMA channel to be stopped, and may - discard data in the DMA FIFO which hasn't been fully transferred. - No callback functions will be called for any incomplete transfers. - - Two variants of this function are available. - - dmaengine_terminate_async() might not wait until the DMA has been fully - stopped or until any running complete callbacks have finished. But it is - possible to call dmaengine_terminate_async() from atomic context or from - within a complete callback. dmaengine_synchronize() must be called before it - is safe to free the memory accessed by the DMA transfer or free resources - accessed from within the complete callback. - - dmaengine_terminate_sync() will wait for the transfer and any running - complete callbacks to finish before it returns. But the function must not be - called from atomic context or from within a complete callback. - - dmaengine_terminate_all() is deprecated and should not be used in new code. - -2. int dmaengine_pause(struct dma_chan *chan) - - This pauses activity on the DMA channel without data loss. - -3. int dmaengine_resume(struct dma_chan *chan) - - Resume a previously paused DMA channel. It is invalid to resume a - channel which is not currently paused. - -4. enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, - dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) - - This can be used to check the status of the channel. Please see - the documentation in include/linux/dmaengine.h for a more complete - description of this API. - - This can be used in conjunction with dma_async_is_complete() and - the cookie returned from dmaengine_submit() to check for - completion of a specific DMA transaction. - - Note: - Not all DMA engine drivers can return reliable information for - a running DMA channel. It is recommended that DMA engine users - pause or stop (via dmaengine_terminate_all()) the channel before - using this API. - -5. void dmaengine_synchronize(struct dma_chan *chan) - - Synchronize the termination of the DMA channel to the current context. - - This function should be used after dmaengine_terminate_async() to synchronize - the termination of the DMA channel to the current context. The function will - wait for the transfer and any running complete callbacks to finish before it - returns. 
-
- If dmaengine_terminate_async() is used to stop the DMA channel this function
- must be called before it is safe to free memory accessed by previously
- submitted descriptors or to free any resources accessed within the complete
- callback of previously submitted descriptors.
-
- The behavior of this function is undefined if dma_async_issue_pending() has
- been called between dmaengine_terminate_async() and this function.
diff --git a/Documentation/driver-api/dmaengine/client.rst b/Documentation/driver-api/dmaengine/client.rst
new file mode 100644
index 000000000000..6245c99af8c1
--- /dev/null
+++ b/Documentation/driver-api/dmaengine/client.rst
@@ -0,0 +1,275 @@
+====================
+DMA Engine API Guide
+====================
+
+Vinod Koul
+
+.. note:: For DMA Engine usage in async_tx please see:
+          ``Documentation/crypto/async-tx-api.txt``
+
+
+Below is a guide for device driver writers on how to use the Slave-DMA API of
+the DMA Engine. This is applicable only for slave DMA usage.
+
+DMA usage
+=========
+
+The slave DMA usage consists of the following steps:
+
+- Allocate a DMA slave channel
+
+- Set slave and controller specific parameters
+
+- Get a descriptor for transaction
+
+- Submit the transaction
+
+- Issue pending requests and wait for callback notification
+
+The details of these operations are:
+
+1. Allocate a DMA slave channel
+
+   Channel allocation is slightly different in the slave DMA context;
+   client drivers typically need a channel from a particular DMA
+   controller only, and in some cases even a specific channel is desired.
+   To request a channel, the dma_request_chan() API is used.
+
+   Interface:
+
+   .. code-block:: c
+
+      struct dma_chan *dma_request_chan(struct device *dev, const char *name);
+
+   This will find and return the ``name`` DMA channel associated with the
+   ``dev`` device. The association is done via DT, ACPI or a board file based
+   dma_slave_map matching table.
+
+   A channel allocated via this interface is exclusive to the caller,
+   until dma_release_channel() is called.
+
+2. Set slave and controller specific parameters
+
+   The next step is always to pass some specific information to the DMA
+   driver. Most of the generic information which a slave DMA can use
+   is in struct dma_slave_config. This allows the clients to specify
+   DMA direction, DMA addresses, bus widths, DMA burst lengths etc.
+   for the peripheral.
+
+   If some DMA controllers have more parameters to be sent, then they
+   should try to embed struct dma_slave_config in their controller
+   specific structure. That gives the client flexibility to pass more
+   parameters, if required.
+
+   Interface:
+
+   .. code-block:: c
+
+      int dmaengine_slave_config(struct dma_chan *chan,
+                                 struct dma_slave_config *config)
+
+   Please see the dma_slave_config structure definition in dmaengine.h
+   for a detailed explanation of the struct members. Please note
+   that the 'direction' member will be going away as it duplicates the
+   direction given in the prepare call.
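+   For example, a configuration for a memory-to-device transfer might look
+   like the following sketch (the FIFO address, bus width and burst length
+   are illustrative assumptions, not requirements):
+
+   .. code-block:: c
+
+      struct dma_slave_config cfg = {
+              .dst_addr = fifo_phys_addr,  /* physical address of device FIFO */
+              .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
+              .dst_maxburst = 8,           /* words per burst */
+      };
+
+      ret = dmaengine_slave_config(chan, &cfg);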
+3. Get a descriptor for transaction
+
+   For slave usage the various modes of slave transfers supported by the
+   DMA-engine are:
+
+   - slave_sg: DMA a list of scatter gather buffers from/to a peripheral
+
+   - dma_cyclic: Perform a cyclic DMA operation from/to a peripheral till the
+     operation is explicitly stopped.
+
+   - interleaved_dma: This is common to Slave as well as M2M clients. For
+     slave usage, the address of the device's FIFO may already be known to
+     the driver. Various types of operations could be expressed by setting
+     appropriate values to the 'dma_interleaved_template' members.
+
+   A non-NULL return of this transfer API represents a "descriptor" for
+   the given transaction.
+
+   Interface:
+
+   .. code-block:: c
+
+      struct dma_async_tx_descriptor *dmaengine_prep_slave_sg(
+              struct dma_chan *chan, struct scatterlist *sgl,
+              unsigned int sg_len, enum dma_data_direction direction,
+              unsigned long flags);
+
+      struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
+              struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
+              size_t period_len, enum dma_data_direction direction);
+
+      struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
+              struct dma_chan *chan, struct dma_interleaved_template *xt,
+              unsigned long flags);
+
+   The peripheral driver is expected to have mapped the scatterlist for
+   the DMA operation prior to calling dmaengine_prep_slave_sg(), and must
+   keep the scatterlist mapped until the DMA operation has completed.
+   The scatterlist must be mapped using the DMA struct device.
+   If a mapping needs to be synchronized later, dma_sync_*_for_*() must be
+   called using the DMA struct device, too.
+   So, a normal setup should look like this:
+
+   .. code-block:: c
+
+      nr_sg = dma_map_sg(chan->device->dev, sgl, sg_len);
+      if (nr_sg == 0)
+              /* error */
+
+      desc = dmaengine_prep_slave_sg(chan, sgl, nr_sg, direction, flags);
+
+   Once a descriptor has been obtained, the callback information can be
+   added and the descriptor must then be submitted. Some DMA engine
+   drivers may hold a spinlock between a successful preparation and
+   submission, so it is important that these two operations are closely
+   paired.
+
+   .. note::
+
+      Although the async_tx API specifies that completion callback
+      routines cannot submit any new operations, this is not the
+      case for slave/cyclic DMA.
+
+      For slave DMA, the subsequent transaction may not be available
+      for submission prior to the callback function being invoked, so
+      slave DMA callbacks are permitted to prepare and submit a new
+      transaction.
+
+      For cyclic DMA, a callback function may wish to terminate the
+      DMA via dmaengine_terminate_async().
+
+      Therefore, it is important that DMA engine drivers drop any
+      locks before calling the callback function which may cause a
+      deadlock.
+
+      Note that callbacks will always be invoked from the DMA
+      engine's tasklet, never from interrupt context.
+
+4. Submit the transaction
+
+   Once the descriptor has been prepared and the callback information
+   added, it must be placed on the DMA engine driver's pending queue.
+
+   Interface:
+
+   .. code-block:: c
+
+      dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc)
+
+   This returns a cookie that can be used to check the progress of DMA engine
+   activity via other DMA engine calls not covered in this document.
+
+   dmaengine_submit() will not start the DMA operation, it merely adds
+   it to the pending queue. For this, see step 5, dma_async_issue_pending.
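+   For example, a hedged sketch of submitting and checking the result
+   (``desc`` comes from the prepare step above):
+
+   .. code-block:: c
+
+      cookie = dmaengine_submit(desc);
+      if (dma_submit_error(cookie)) {
+              /* descriptor was not queued; handle the error */
+      }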
code-block:: c + + void dma_async_issue_pending(struct dma_chan *chan); + +Further APIs: +------------ + +1. Terminate APIs + + .. code-block:: c + + int dmaengine_terminate_sync(struct dma_chan *chan) + int dmaengine_terminate_async(struct dma_chan *chan) + int dmaengine_terminate_all(struct dma_chan *chan) /* DEPRECATED */ + + This causes all activity for the DMA channel to be stopped, and may + discard data in the DMA FIFO which hasn't been fully transferred. + No callback functions will be called for any incomplete transfers. + + Two variants of this function are available. + + dmaengine_terminate_async() might not wait until the DMA has been fully + stopped or until any running complete callbacks have finished. But it is + possible to call dmaengine_terminate_async() from atomic context or from + within a complete callback. dmaengine_synchronize() must be called before it + is safe to free the memory accessed by the DMA transfer or free resources + accessed from within the complete callback. + + dmaengine_terminate_sync() will wait for the transfer and any running + complete callbacks to finish before it returns. But the function must not be + called from atomic context or from within a complete callback. + + dmaengine_terminate_all() is deprecated and should not be used in new code. + +2. Pause API + + .. code-block:: c + + int dmaengine_pause(struct dma_chan *chan) + + This pauses activity on the DMA channel without data loss. + +3. Resume API + + .. code-block:: c + + int dmaengine_resume(struct dma_chan *chan) + + Resume a previously paused DMA channel. It is invalid to resume a + channel which is not currently paused. + +4. Check Txn complete + + .. code-block:: c + + enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, + dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) + + This can be used to check the status of the channel. Please see + the documentation in include/linux/dmaengine.h for a more complete + description of this API. + + This can be used in conjunction with dma_async_is_complete() and + the cookie returned from dmaengine_submit() to check for + completion of a specific DMA transaction. + + .. note:: + + Not all DMA engine drivers can return reliable information for + a running DMA channel. It is recommended that DMA engine users + pause or stop (via dmaengine_terminate_all()) the channel before + using this API. + +5. Synchronize termination API + + .. code-block:: c + + void dmaengine_synchronize(struct dma_chan *chan) + + Synchronize the termination of the DMA channel to the current context. + + This function should be used after dmaengine_terminate_async() to synchronize + the termination of the DMA channel to the current context. The function will + wait for the transfer and any running complete callbacks to finish before it + returns. + + If dmaengine_terminate_async() is used to stop the DMA channel this function + must be called before it is safe to free memory accessed by previously + submitted descriptors or to free any resources accessed within the complete + callback of previously submitted descriptors. + + The behavior of this function is undefined if dma_async_issue_pending() has + been called between dmaengine_terminate_async() and this function. diff --git a/Documentation/driver-api/dmaengine/index.rst b/Documentation/driver-api/dmaengine/index.rst index eee0377e0d7b..ffc2b797938c 100644 --- a/Documentation/driver-api/dmaengine/index.rst +++ b/Documentation/driver-api/dmaengine/index.rst @@ -16,6 +16,17 @@ driver writers. 
 provider
 
+DMAEngine client documentation
+------------------------------
+
+This book is a guide to device driver writers on how to use the Slave-DMA
+API of the DMAEngine. This is applicable only for slave DMA usage.
+
+.. toctree::
+   :maxdepth: 1
+
+   client
+
 .. only:: subproject

    Indices
From 179a214e9e985ef9b14d7e5d9fff20acddb9315b Mon Sep 17 00:00:00 2001
From: Vinod Koul
Date: Fri, 3 Nov 2017 10:19:40 +0530
Subject: [PATCH 0925/1085] dmaengine: doc: ReSTize dmatest doc

This converts and moves the dmatest file with some format changes for RST
style.

Signed-off-by: Vinod Koul
Signed-off-by: Jonathan Corbet
---
 .../dmaengine/dmatest.rst}                    | 96 +++++++++++--------
 Documentation/driver-api/dmaengine/index.rst  | 10 ++
 2 files changed, 67 insertions(+), 39 deletions(-)
 rename Documentation/{dmaengine/dmatest.txt => driver-api/dmaengine/dmatest.rst} (50%)

diff --git a/Documentation/dmaengine/dmatest.txt b/Documentation/driver-api/dmaengine/dmatest.rst
similarity index 50%
rename from Documentation/dmaengine/dmatest.txt
rename to Documentation/driver-api/dmaengine/dmatest.rst
index fb683c72dea8..3922c0a3f0c0 100644
--- a/Documentation/dmaengine/dmatest.txt
+++ b/Documentation/driver-api/dmaengine/dmatest.rst
@@ -1,11 +1,13 @@
-	DMA Test Guide
-	==============
+==============
+DMA Test Guide
+==============
 
-		Andy Shevchenko
+Andy Shevchenko
 
 This small document introduces how to test DMA drivers using dmatest module.
 
-	Part 1 - How to build the test module
+Part 1 - How to build the test module
+=====================================
 
 The menuconfig contains an option that could be found by following path:
 	Device Drivers -> DMA Engine support -> DMA Test client
@@ -13,25 +15,31 @@ The menuconfig contains an option that could be found by following path:
 In the configuration file the option called CONFIG_DMATEST. The dmatest could
 be built as module or inside kernel. Let's consider those cases.
 
-	Part 2 - When dmatest is built as a module...
+Part 2 - When dmatest is built as a module
+==========================================
 
-Example of usage:
-	% modprobe dmatest channel=dma0chan0 timeout=2000 iterations=1 run=1
+Example of usage: ::
 
-...or:
-	% modprobe dmatest
-	% echo dma0chan0 > /sys/module/dmatest/parameters/channel
-	% echo 2000 > /sys/module/dmatest/parameters/timeout
-	% echo 1 > /sys/module/dmatest/parameters/iterations
-	% echo 1 > /sys/module/dmatest/parameters/run
+    % modprobe dmatest channel=dma0chan0 timeout=2000 iterations=1 run=1
 
-...or on the kernel command line:
+...or: ::
+
+    % modprobe dmatest
+    % echo dma0chan0 > /sys/module/dmatest/parameters/channel
+    % echo 2000 > /sys/module/dmatest/parameters/timeout
+    % echo 1 > /sys/module/dmatest/parameters/iterations
+    % echo 1 > /sys/module/dmatest/parameters/run
 
-	dmatest.channel=dma0chan0 dmatest.timeout=2000 dmatest.iterations=1 dmatest.run=1
+...or on the kernel command line: ::
+
+    dmatest.channel=dma0chan0 dmatest.timeout=2000 dmatest.iterations=1 dmatest.run=1
 
-Hint: available channel list could be extracted by running the following
-command:
-	% ls -1 /sys/class/dma/
+.. hint:: The available channel list can be extracted by running the following
+   command:
+
+::
+
+    % ls -1 /sys/class/dma/
 
 Once started a message like "dmatest: Started 1 threads using dma0chan0"
 is emitted. After that only test failure messages are reported until the test
@@ -39,8 +47,9 @@ stops.
 
 Note that running a new test will not stop any in progress test.
 
-The following command returns the state of the test.
-	% cat /sys/module/dmatest/parameters/run
+The following command returns the state of the test. ::
+
+    % cat /sys/module/dmatest/parameters/run
 
 To wait for test completion userspace can poll 'run' until it is false, or use
 the wait parameter. Specifying 'wait=1' when loading the module causes module
@@ -50,15 +59,19 @@ before returning.
 For example, the following scripts wait for 42 tests to complete before
 exiting. Note that if 'iterations' is set to 'infinite' then waiting is
 disabled.
-Example:
-	% modprobe dmatest run=1 iterations=42 wait=1
-	% modprobe -r dmatest
-...or:
-	% modprobe dmatest run=1 iterations=42
-	% cat /sys/module/dmatest/parameters/wait
-	% modprobe -r dmatest
+Example: ::
 
-	Part 3 - When built-in in the kernel...
+    % modprobe dmatest run=1 iterations=42 wait=1
+    % modprobe -r dmatest
+
+...or: ::
+
+    % modprobe dmatest run=1 iterations=42
+    % cat /sys/module/dmatest/parameters/wait
+    % modprobe -r dmatest
+
+Part 3 - When dmatest is built into the kernel
+==============================================
 
 The module parameters that are supplied to the kernel command line will be
 used for the first performed test. After the user gets control, the test
 could be
@@ -66,27 +79,32 @@ re-run with the same or different parameters. For the details see the above
 section "Part 2 - When dmatest is built as a module..."
 
 In both cases the module parameters are used as the actual values for the test
-case. You always could check them at run-time by running
-	% grep -H . /sys/module/dmatest/parameters/*
+case. You can always check them at run-time by running ::
 
-	Part 4 - Gathering the test results
+    % grep -H . /sys/module/dmatest/parameters/*
 
-Test results are printed to the kernel log buffer with the format:
+Part 4 - Gathering the test results
+===================================
 
-"dmatest: result <channel>: <test id>: '<error msg>' with src_off=<val> dst_off=<val> len=<val> (<err code>)"
+Test results are printed to the kernel log buffer with the format: ::
 
-Example of output:
-	% dmesg | tail -n 1
-	dmatest: result dma0chan0-copy0: #1: No errors with src_off=0x7bf dst_off=0x8ad len=0x3fea (0)
+    "dmatest: result <channel>: <test id>: '<error msg>' with src_off=<val> dst_off=<val> len=<val> (<err code>)"
 
+Example of output: ::
+
+    % dmesg | tail -n 1
+    dmatest: result dma0chan0-copy0: #1: No errors with src_off=0x7bf dst_off=0x8ad len=0x3fea (0)
 
 The message format is unified across the different types of errors. A number
 in the parens represents additional information, e.g. error code, error
 counter, or status. A test thread also emits a summary line at completion
 listing the number of tests executed, number that failed, and a result code.
-Example:
-	% dmesg | tail -n 1
-	dmatest: dma0chan0-copy0: summary 1 test, 0 failures 1000 iops 100000 KB/s (0)
+Example: ::
+
+    % dmesg | tail -n 1
+    dmatest: dma0chan0-copy0: summary 1 test, 0 failures 1000 iops 100000 KB/s (0)
 
 The details of a data miscompare error are also emitted, but do not follow the
 above format.
diff --git a/Documentation/driver-api/dmaengine/index.rst b/Documentation/driver-api/dmaengine/index.rst
index ffc2b797938c..fae852922a49 100644
--- a/Documentation/driver-api/dmaengine/index.rst
+++ b/Documentation/driver-api/dmaengine/index.rst
@@ -27,6 +27,16 @@ API of the DMAEngine. This is applicable only for slave DMA usage.
 
    client
 
+DMA Test documentation
+----------------------
+
+This book introduces how to test DMA drivers using the dmatest module.
+
+.. toctree::
+   :maxdepth: 1
+
+   dmatest
+
 ..
only:: subproject Indices From fbbe0bff9d2aa6f7b2d34059fca2ee808c04a651 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 3 Nov 2017 10:19:41 +0530 Subject: [PATCH 0926/1085] dmaengine: doc: ReSTize pxa_dma doc This converts and moves pxa_dma file with some format changes for RST style Signed-off-by: Vinod Koul Signed-off-by: Jonathan Corbet --- Documentation/dmaengine/pxa_dma.txt | 153 -------------- Documentation/driver-api/dmaengine/index.rst | 10 + .../driver-api/dmaengine/pxa_dma.rst | 190 ++++++++++++++++++ 3 files changed, 200 insertions(+), 153 deletions(-) delete mode 100644 Documentation/dmaengine/pxa_dma.txt create mode 100644 Documentation/driver-api/dmaengine/pxa_dma.rst diff --git a/Documentation/dmaengine/pxa_dma.txt b/Documentation/dmaengine/pxa_dma.txt deleted file mode 100644 index 0736d44b5438..000000000000 --- a/Documentation/dmaengine/pxa_dma.txt +++ /dev/null @@ -1,153 +0,0 @@ -PXA/MMP - DMA Slave controller -============================== - -Constraints ------------ - a) Transfers hot queuing - A driver submitting a transfer and issuing it should be granted the transfer - is queued even on a running DMA channel. - This implies that the queuing doesn't wait for the previous transfer end, - and that the descriptor chaining is not only done in the irq/tasklet code - triggered by the end of the transfer. - A transfer which is submitted and issued on a phy doesn't wait for a phy to - stop and restart, but is submitted on a "running channel". The other - drivers, especially mmp_pdma waited for the phy to stop before relaunching - a new transfer. - - b) All transfers having asked for confirmation should be signaled - Any issued transfer with DMA_PREP_INTERRUPT should trigger a callback call. - This implies that even if an irq/tasklet is triggered by end of tx1, but - at the time of irq/dma tx2 is already finished, tx1->complete() and - tx2->complete() should be called. - - c) Channel running state - A driver should be able to query if a channel is running or not. For the - multimedia case, such as video capture, if a transfer is submitted and then - a check of the DMA channel reports a "stopped channel", the transfer should - not be issued until the next "start of frame interrupt", hence the need to - know if a channel is in running or stopped state. - - d) Bandwidth guarantee - The PXA architecture has 4 levels of DMAs priorities : high, normal, low. - The high priorities get twice as much bandwidth as the normal, which get twice - as much as the low priorities. - A driver should be able to request a priority, especially the real-time - ones such as pxa_camera with (big) throughputs. - -Design ------- - a) Virtual channels - Same concept as in sa11x0 driver, ie. a driver was assigned a "virtual - channel" linked to the requestor line, and the physical DMA channel is - assigned on the fly when the transfer is issued. - - b) Transfer anatomy for a scatter-gather transfer - +------------+-----+---------------+----------------+-----------------+ - | desc-sg[0] | ... | desc-sg[last] | status updater | finisher/linker | - +------------+-----+---------------+----------------+-----------------+ - - This structure is pointed by dma->sg_cpu. - The descriptors are used as follows : - - desc-sg[i]: i-th descriptor, transferring the i-th sg - element to the video buffer scatter gather - - status updater - Transfers a single u32 to a well known dma coherent memory to leave - a trace that this transfer is done. 
The "well known" is unique per - physical channel, meaning that a read of this value will tell which - is the last finished transfer at that point in time. - - finisher: has ddadr=DADDR_STOP, dcmd=ENDIRQEN - - linker: has ddadr= desc-sg[0] of next transfer, dcmd=0 - - c) Transfers hot-chaining - Suppose the running chain is : - Buffer 1 Buffer 2 - +---------+----+---+ +----+----+----+---+ - | d0 | .. | dN | l | | d0 | .. | dN | f | - +---------+----+-|-+ ^----+----+----+---+ - | | - +----+ - - After a call to dmaengine_submit(b3), the chain will look like : - Buffer 1 Buffer 2 Buffer 3 - +---------+----+---+ +----+----+----+---+ +----+----+----+---+ - | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f | - +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+ - | | | | - +----+ +----+ - new_link - - If while new_link was created the DMA channel stopped, it is _not_ - restarted. Hot-chaining doesn't break the assumption that - dma_async_issue_pending() is to be used to ensure the transfer is actually started. - - One exception to this rule : - - if Buffer1 and Buffer2 had all their addresses 8 bytes aligned - - and if Buffer3 has at least one address not 4 bytes aligned - - then hot-chaining cannot happen, as the channel must be stopped, the - "align bit" must be set, and the channel restarted As a consequence, - such a transfer tx_submit() will be queued on the submitted queue, and - this specific case if the DMA is already running in aligned mode. - - d) Transfers completion updater - Each time a transfer is completed on a channel, an interrupt might be - generated or not, up to the client's request. But in each case, the last - descriptor of a transfer, the "status updater", will write the latest - transfer being completed into the physical channel's completion mark. - - This will speed up residue calculation, for large transfers such as video - buffers which hold around 6k descriptors or more. This also allows without - any lock to find out what is the latest completed transfer in a running - DMA chain. - - e) Transfers completion, irq and tasklet - When a transfer flagged as "DMA_PREP_INTERRUPT" is finished, the dma irq - is raised. Upon this interrupt, a tasklet is scheduled for the physical - channel. - The tasklet is responsible for : - - reading the physical channel last updater mark - - calling all the transfer callbacks of finished transfers, based on - that mark, and each transfer flags. - If a transfer is completed while this handling is done, a dma irq will - be raised, and the tasklet will be scheduled once again, having a new - updater mark. - - f) Residue - Residue granularity will be descriptor based. The issued but not completed - transfers will be scanned for all of their descriptors against the - currently running descriptor. 
- - g) Most complicated case of driver's tx queues - The most tricky situation is when : - - there are not "acked" transfers (tx0) - - a driver submitted an aligned tx1, not chained - - a driver submitted an aligned tx2 => tx2 is cold chained to tx1 - - a driver issued tx1+tx2 => channel is running in aligned mode - - a driver submitted an aligned tx3 => tx3 is hot-chained - - a driver submitted an unaligned tx4 => tx4 is put in submitted queue, - not chained - - a driver issued tx4 => tx4 is put in issued queue, not chained - - a driver submitted an aligned tx5 => tx5 is put in submitted queue, not - chained - - a driver submitted an aligned tx6 => tx6 is put in submitted queue, - cold chained to tx5 - - This translates into (after tx4 is issued) : - - issued queue - +-----+ +-----+ +-----+ +-----+ - | tx1 | | tx2 | | tx3 | | tx4 | - +---|-+ ^---|-+ ^-----+ +-----+ - | | | | - +---+ +---+ - - submitted queue - +-----+ +-----+ - | tx5 | | tx6 | - +---|-+ ^-----+ - | | - +---+ - - completed queue : empty - - allocated queue : tx0 - - It should be noted that after tx3 is completed, the channel is stopped, and - restarted in "unaligned mode" to handle tx4. - -Author: Robert Jarzmik diff --git a/Documentation/driver-api/dmaengine/index.rst b/Documentation/driver-api/dmaengine/index.rst index fae852922a49..3026fa975937 100644 --- a/Documentation/driver-api/dmaengine/index.rst +++ b/Documentation/driver-api/dmaengine/index.rst @@ -37,6 +37,16 @@ This book introduces how to test DMA drivers using dmatest module. dmatest +PXA DMA documentation +---------------------- + +This book adds some notes about PXA DMA + +.. toctree:: + :maxdepth: 1 + + pxa_dma + .. only:: subproject Indices diff --git a/Documentation/driver-api/dmaengine/pxa_dma.rst b/Documentation/driver-api/dmaengine/pxa_dma.rst new file mode 100644 index 000000000000..442ee691a190 --- /dev/null +++ b/Documentation/driver-api/dmaengine/pxa_dma.rst @@ -0,0 +1,190 @@ +============================== +PXA/MMP - DMA Slave controller +============================== + +Constraints +=========== + +a) Transfers hot queuing +A driver submitting a transfer and issuing it should be granted the transfer +is queued even on a running DMA channel. +This implies that the queuing doesn't wait for the previous transfer end, +and that the descriptor chaining is not only done in the irq/tasklet code +triggered by the end of the transfer. +A transfer which is submitted and issued on a phy doesn't wait for a phy to +stop and restart, but is submitted on a "running channel". The other +drivers, especially mmp_pdma waited for the phy to stop before relaunching +a new transfer. + +b) All transfers having asked for confirmation should be signaled +Any issued transfer with DMA_PREP_INTERRUPT should trigger a callback call. +This implies that even if an irq/tasklet is triggered by end of tx1, but +at the time of irq/dma tx2 is already finished, tx1->complete() and +tx2->complete() should be called. + +c) Channel running state +A driver should be able to query if a channel is running or not. For the +multimedia case, such as video capture, if a transfer is submitted and then +a check of the DMA channel reports a "stopped channel", the transfer should +not be issued until the next "start of frame interrupt", hence the need to +know if a channel is in running or stopped state. + +d) Bandwidth guarantee +The PXA architecture has 4 levels of DMAs priorities : high, normal, low. 
+The high priorities get twice as much bandwidth as the normal, which get twice +as much as the low priorities. +A driver should be able to request a priority, especially the real-time +ones such as pxa_camera with (big) throughputs. + +Design +====== +a) Virtual channels +Same concept as in sa11x0 driver, ie. a driver was assigned a "virtual +channel" linked to the requestor line, and the physical DMA channel is +assigned on the fly when the transfer is issued. + +b) Transfer anatomy for a scatter-gather transfer + +:: + + +------------+-----+---------------+----------------+-----------------+ + | desc-sg[0] | ... | desc-sg[last] | status updater | finisher/linker | + +------------+-----+---------------+----------------+-----------------+ + +This structure is pointed by dma->sg_cpu. +The descriptors are used as follows : + + - desc-sg[i]: i-th descriptor, transferring the i-th sg + element to the video buffer scatter gather + + - status updater + Transfers a single u32 to a well known dma coherent memory to leave + a trace that this transfer is done. The "well known" is unique per + physical channel, meaning that a read of this value will tell which + is the last finished transfer at that point in time. + + - finisher: has ddadr=DADDR_STOP, dcmd=ENDIRQEN + + - linker: has ddadr= desc-sg[0] of next transfer, dcmd=0 + +c) Transfers hot-chaining +Suppose the running chain is: + +:: + + Buffer 1 Buffer 2 + +---------+----+---+ +----+----+----+---+ + | d0 | .. | dN | l | | d0 | .. | dN | f | + +---------+----+-|-+ ^----+----+----+---+ + | | + +----+ + +After a call to dmaengine_submit(b3), the chain will look like: + +:: + + Buffer 1 Buffer 2 Buffer 3 + +---------+----+---+ +----+----+----+---+ +----+----+----+---+ + | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f | + +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+ + | | | | + +----+ +----+ + new_link + +If while new_link was created the DMA channel stopped, it is _not_ +restarted. Hot-chaining doesn't break the assumption that +dma_async_issue_pending() is to be used to ensure the transfer is actually started. + +One exception to this rule : + +- if Buffer1 and Buffer2 had all their addresses 8 bytes aligned + +- and if Buffer3 has at least one address not 4 bytes aligned + +- then hot-chaining cannot happen, as the channel must be stopped, the + "align bit" must be set, and the channel restarted As a consequence, + such a transfer tx_submit() will be queued on the submitted queue, and + this specific case if the DMA is already running in aligned mode. + +d) Transfers completion updater +Each time a transfer is completed on a channel, an interrupt might be +generated or not, up to the client's request. But in each case, the last +descriptor of a transfer, the "status updater", will write the latest +transfer being completed into the physical channel's completion mark. + +This will speed up residue calculation, for large transfers such as video +buffers which hold around 6k descriptors or more. This also allows without +any lock to find out what is the latest completed transfer in a running +DMA chain. + +e) Transfers completion, irq and tasklet +When a transfer flagged as "DMA_PREP_INTERRUPT" is finished, the dma irq +is raised. Upon this interrupt, a tasklet is scheduled for the physical +channel. + +The tasklet is responsible for : + +- reading the physical channel last updater mark + +- calling all the transfer callbacks of finished transfers, based on + that mark, and each transfer flags. 
+
+If a transfer is completed while this handling is done, a dma irq will
+be raised, and the tasklet will be scheduled once again, having a new
+updater mark.
+
+f) Residue
+Residue granularity will be descriptor based. The issued but not completed
+transfers will be scanned for all of their descriptors against the
+currently running descriptor.
+
+g) Most complicated case of driver's tx queues
+The most tricky situation is when:
+
+  - there are not "acked" transfers (tx0)
+
+  - a driver submitted an aligned tx1, not chained
+
+  - a driver submitted an aligned tx2 => tx2 is cold chained to tx1
+
+  - a driver issued tx1+tx2 => channel is running in aligned mode
+
+  - a driver submitted an aligned tx3 => tx3 is hot-chained
+
+  - a driver submitted an unaligned tx4 => tx4 is put in submitted queue,
+    not chained
+
+  - a driver issued tx4 => tx4 is put in issued queue, not chained
+
+  - a driver submitted an aligned tx5 => tx5 is put in submitted queue, not
+    chained
+
+  - a driver submitted an aligned tx6 => tx6 is put in submitted queue,
+    cold chained to tx5
+
+  This translates into (after tx4 is issued):
+
+  - issued queue
+
+    ::
+
+      +-----+ +-----+ +-----+ +-----+
+      | tx1 | | tx2 | | tx3 | | tx4 |
+      +---|-+ ^---|-+ ^-----+ +-----+
+          |   |   |   |
+          +---+   +---+
+
+  - submitted queue
+
+    ::
+
+      +-----+ +-----+
+      | tx5 | | tx6 |
+      +---|-+ ^-----+
+          |   |
+          +---+
+
+- completed queue : empty
+
+- allocated queue : tx0
+
+It should be noted that after tx3 is completed, the channel is stopped, and
+restarted in "unaligned mode" to handle tx4.
+
+Author: Robert Jarzmik
From e78707f2eee7bcfaf400e925581b09425aab1b41 Mon Sep 17 00:00:00 2001
From: Vinod Koul
Date: Fri, 3 Nov 2017 10:19:42 +0530
Subject: [PATCH 0927/1085] MAINTAINERS: update DMAengine documentation
 location

With ReST style documentation, we moved it to driver-api/dmaengine, so
update this in the MAINTAINERS entry.

Signed-off-by: Vinod Koul
Signed-off-by: Jonathan Corbet
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6671f375f7fc..eb04329ff97d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4234,7 +4234,7 @@ S:	Maintained
 F:	drivers/dma/
 F:	include/linux/dmaengine.h
 F:	Documentation/devicetree/bindings/dma/
-F:	Documentation/dmaengine/
+F:	Documentation/driver-api/dmaengine/
 T:	git git://git.infradead.org/users/vkoul/slave-dma.git
 
 DMA MAPPING HELPERS
From 34fa9b2177a2abec05ae5976eaadbcce648a0982 Mon Sep 17 00:00:00 2001
From: Greg Ungerer
Date: Tue, 5 Sep 2017 22:48:42 +1000
Subject: [PATCH 0928/1085] m68k: move coldfire MMU initialization code

The M54[78]x ColdFire parts are not the only members of the ColdFire
family that have an MMU. But currently some of the early MMU
initialization code is inside the startup code specific to only the
ColdFire M54[78]x parts.

Move that early ColdFire MMU init code so that it is run for other
ColdFire parts running with MMU enabled. Specifically this means that
the MMU initialization code will now also be run for the ColdFire
M5441x parts when running with MMU enabled.

The code move meant that the extern definition for the
mmu_context_init() function had to be moved as well. To make it clear
that it is ColdFire specific I have renamed it with a "cf_" prefix and
put its extern definition in mcfmmu.h (which is already included by the
setup code).
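For reference, the relocated bring-up then runs from the common setup code;
abridged from the setup_mm.c hunk below, the MMU-enabled ColdFire path in
setup_arch() becomes:

    #ifdef CONFIG_COLDFIRE
    	case MACH_M54XX:
    	case MACH_M5441X:
    		/* early MMU init now runs for every MMU-capable ColdFire */
    		cf_bootmem_alloc();
    		cf_mmu_context_init();
    		config_BSP(NULL, 0);
    		break;
    #endif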
Reported-by: Angelo Dureghello Tested-by: Angelo Dureghello Signed-off-by: Greg Ungerer --- arch/m68k/coldfire/m54xx.c | 4 ---- arch/m68k/include/asm/mcfmmu.h | 1 + arch/m68k/include/asm/mmu_context.h | 1 - arch/m68k/kernel/setup_mm.c | 2 ++ arch/m68k/mm/mcfmmu.c | 2 +- 5 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/m68k/coldfire/m54xx.c b/arch/m68k/coldfire/m54xx.c index e53ffed13ba8..adad03ca6e11 100644 --- a/arch/m68k/coldfire/m54xx.c +++ b/arch/m68k/coldfire/m54xx.c @@ -96,10 +96,6 @@ static void mcf54xx_reset(void) void __init config_BSP(char *commandp, int size) { -#ifdef CONFIG_MMU - cf_bootmem_alloc(); - mmu_context_init(); -#endif mach_reset = mcf54xx_reset; mach_sched_init = hw_timer_init; m54xx_uarts_init(); diff --git a/arch/m68k/include/asm/mcfmmu.h b/arch/m68k/include/asm/mcfmmu.h index 10f9930ec49a..283352ab0d5d 100644 --- a/arch/m68k/include/asm/mcfmmu.h +++ b/arch/m68k/include/asm/mcfmmu.h @@ -106,6 +106,7 @@ static inline void mmu_write(u32 a, u32 v) } void cf_bootmem_alloc(void); +void cf_mmu_context_init(void); int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word); #endif diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h index 836acea8f758..f5b1852b4663 100644 --- a/arch/m68k/include/asm/mmu_context.h +++ b/arch/m68k/include/asm/mmu_context.h @@ -92,7 +92,6 @@ static inline void activate_mm(struct mm_struct *active_mm, #define deactivate_mm(tsk, mm) do { } while (0) -extern void mmu_context_init(void); #define prepare_arch_switch(next) load_ksp_mmu(next) static inline void load_ksp_mmu(struct task_struct *task) diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c index 657a9843ebfa..93ded524971c 100644 --- a/arch/m68k/kernel/setup_mm.c +++ b/arch/m68k/kernel/setup_mm.c @@ -344,6 +344,8 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_COLDFIRE case MACH_M54XX: case MACH_M5441X: + cf_bootmem_alloc(); + cf_mmu_context_init(); config_BSP(NULL, 0); break; #endif diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 8d1408583cf4..11eb333e699b 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -184,7 +184,7 @@ void __init cf_bootmem_alloc(void) * Initialize the context management stuff. * The following was taken from arch/ppc/mmu_context.c */ -void __init mmu_context_init(void) +void __init cf_mmu_context_init(void) { /* * Some processors have too few contexts to reserve one for From f55ab8f27548ff3431a6567d400c6757c49fd520 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 5 Sep 2017 22:57:06 +1000 Subject: [PATCH 0929/1085] m68k: fix ColdFire node shift size calculation The m68k pg_data_table is a fix size array defined in arch/m68k/mm/init.c. Index numbers within it are defined based on memory size. But for Coldfire these don't take into account a non-zero physical RAM base address, and this causes us to access past the end of this array at system start time. Change the node shift calculation so that we keep the index inside its range. 
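A worked example may help; the numbers below are purely illustrative,
assuming the 64-entry table sizing implied by the "- 6", with
_rambase = 0x40000000 and _ramend = 0x48000000 (128MB of RAM):

    /* Old: shift derived from the RAM size alone */
    shift = fls(0x48000000 - 0x40000000 - 1) - 6;  /* fls(0x07ffffff) - 6 = 21 */
    /* index of the last page: 0x47ffffff >> 21 = 575, far past entry 63 */

    /* New: shift derived from the end address, which covers the base offset */
    shift = fls(0x48000000 - 1) - 6;               /* fls(0x47ffffff) - 6 = 25 */
    /* index of the last page: 0x47ffffff >> 25 = 35, safely inside the table */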
Reported-by: Angelo Dureghello Tested-by: Angelo Dureghello Signed-off-by: Greg Ungerer --- arch/m68k/mm/mcfmmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 11eb333e699b..2925d795d71a 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -170,7 +170,7 @@ void __init cf_bootmem_alloc(void) max_pfn = max_low_pfn = PFN_DOWN(_ramend); high_memory = (void *)_ramend; - m68k_virt_to_node_shift = fls(_ramend - _rambase - 1) - 6; + m68k_virt_to_node_shift = fls(_ramend - 1) - 6; module_fixup(NULL, __start_fixup, __stop_fixup); /* setup bootmem data */ From b47c7b6f9f97f3e5596c348bea433ed09cad131d Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 10 Jan 2017 22:52:32 +1000 Subject: [PATCH 0930/1085] m68k: allow ColdFire m5441x parts to run with MMU enabled The Freescale ColdFire M5441x system-on-chip parts have full paged MMU hardware support. So far though we have only allowed them to be configured for use in non-MMU mode. All required kernel changes to support operation of the M5441x parts with MMU enabled have been pushed into the kernel, so now we can allow it to be configured and used with the MMU enabled. Tested-by: Angelo Dureghello Signed-off-by: Greg Ungerer --- arch/m68k/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index ff5f0896318b..21f00349af52 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -284,7 +284,7 @@ config M548x config M5441x bool "MCF5441x" - depends on !MMU + select MMU_COLDFIRE if MMU select GENERIC_CLOCKEVENTS select HAVE_CACHE_CB help From 375bc91e634195b094aa4acb30e1d19807122eca Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 28 Sep 2017 11:44:52 +0200 Subject: [PATCH 0931/1085] m68k: pull mach_beep in setup.c It is possible to select INPUT_M68K_BEEP in a nommu configuration. This results in the following link error: drivers/input/misc/m68kspkr.o: In function `m68kspkr_event': m68kspkr.c:(.text+0x3a): undefined reference to `mach_beep' m68kspkr.c:(.text+0x5e): undefined reference to `mach_beep' m68kspkr.c:(.text+0x78): undefined reference to `mach_beep' drivers/input/misc/m68kspkr.o: In function `m68kspkr_init': m68kspkr.c:(.init.text+0x4): undefined reference to `mach_beep' Pull the mach_beep definition in setup.c to avoid it. 
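For context, the unresolved references come from the speaker driver calling
through the hook; a sketch of the kind of reference involved (not the exact
m68kspkr.c source):

    extern void (*mach_beep)(unsigned int, unsigned int);

    /* even a NULL-checked call still needs the symbol at link time */
    if (mach_beep)
    	mach_beep(count, -1);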
Signed-off-by: Alexandre Belloni Acked-by: Geert Uytterhoeven Signed-off-by: Greg Ungerer --- arch/m68k/kernel/setup.c | 5 +++++ arch/m68k/kernel/setup_mm.c | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/m68k/kernel/setup.c b/arch/m68k/kernel/setup.c index 854e09f403e7..19a92982629a 100644 --- a/arch/m68k/kernel/setup.c +++ b/arch/m68k/kernel/setup.c @@ -4,3 +4,8 @@ #else #include "setup_no.c" #endif + +#if IS_ENABLED(CONFIG_INPUT_M68K_BEEP) +void (*mach_beep)(unsigned int, unsigned int); +EXPORT_SYMBOL(mach_beep); +#endif diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c index 93ded524971c..dd25bfc22fb4 100644 --- a/arch/m68k/kernel/setup_mm.c +++ b/arch/m68k/kernel/setup_mm.c @@ -106,10 +106,6 @@ EXPORT_SYMBOL(mach_heartbeat); #ifdef CONFIG_M68K_L2_CACHE void (*mach_l2_flush) (int); #endif -#if IS_ENABLED(CONFIG_INPUT_M68K_BEEP) -void (*mach_beep)(unsigned int, unsigned int); -EXPORT_SYMBOL(mach_beep); -#endif #if defined(CONFIG_ISA) && defined(MULTI_ISA) int isa_type; int isa_sex; From c7da092a1f243bfd1bfb4124f538e69e941882da Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Nov 2017 11:20:28 +0100 Subject: [PATCH 0932/1085] x86/mm: Define _PAGE_TABLE using _KERNPG_TABLE ... so that the difference is obvious. No functionality change. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171103102028.20284-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f1492473f10e..c33f80da8a79 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -199,10 +199,9 @@ enum page_cache_mode { #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ _PAGE_DIRTY | _PAGE_ENC) +#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER) #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) From ca0db18da26a47f824fe79e6870ad19470a45525 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 6 Nov 2017 11:18:37 +0000 Subject: [PATCH 0933/1085] regmap: Try to work around Kconfig exploding on HWSPINLOCK Trying to work with hwspinlock from built in code is painful as it can be built modular. Invert the test for REGMAP_HWSPINLOCK for now so we end up requiring users to depend on HWSPINLOCK=y in order to turn on the hwspinlock code. Signed-off-by: Mark Brown --- drivers/base/regmap/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig index 2d5e849f79c9..e7fa7b4bf9af 100644 --- a/drivers/base/regmap/Kconfig +++ b/drivers/base/regmap/Kconfig @@ -5,7 +5,7 @@ config REGMAP default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ) select IRQ_DOMAIN if REGMAP_IRQ - select HWSPINLOCK if REGMAP_HWSPINLOCK + select REGMAP_HWSPINLOCK if HWSPINLOCK=y bool config REGCACHE_COMPRESSED From c077fadf4d1e05bcdd7ddb07758cbd8acf0033f6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 4 Nov 2017 14:08:23 +0800 Subject: [PATCH 0934/1085] regmap: Fix unused warning This patch fixes the warning of label 'err_map' defined but not used. 
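The warning appears when the hwspinlock branch is compiled out and nothing
jumps to err_map; routing the -EINVAL path through err_map both silences it
and frees the just-allocated map. A minimal sketch of the goto-unwind idiom
involved (illustrative only, not the regmap source):

    map = kzalloc(sizeof(*map), GFP_KERNEL);
    if (map == NULL) {
    	ret = -ENOMEM;
    	goto err;		/* nothing to unwind yet */
    }

    if (bad_lock_config) {	/* hypothetical condition */
    	ret = -EINVAL;
    	goto err_map;		/* 'map' must be freed on this path */
    }
    /* ... more init ... */

    err_map:
    	kfree(map);
    err:
    	return ERR_PTR(ret);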
Signed-off-by: Baolin Wang Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index bfe2f250d011..8d516a9bfc01 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -699,7 +699,7 @@ struct regmap *__regmap_init(struct device *dev, map->lock_arg = map; #else ret = -EINVAL; - goto err; + goto err_map; #endif } else { if ((bus && bus->fast_io) || From 14c8276d3bdfbadb033e3aaca6ac39781f20e44d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 3 Nov 2017 13:33:23 -0700 Subject: [PATCH 0935/1085] ARM: footbridge: Fix typo in timer conversion This fixes a missing semi-colon. It went unnoticed initially since it is only built under certain defconfigs. Reported-by: kbuild test robot Signed-off-by: Kees Cook --- arch/arm/mach-footbridge/dc21285.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-footbridge/dc21285.c b/arch/arm/mach-footbridge/dc21285.c index 8407e4a07c77..e7b350f18f5f 100644 --- a/arch/arm/mach-footbridge/dc21285.c +++ b/arch/arm/mach-footbridge/dc21285.c @@ -141,7 +141,7 @@ static void dc21285_enable_error(struct timer_list *timer) del_timer(timer); if (timer == &serr_timer) - enable_irq(IRQ_PCI_SERR) + enable_irq(IRQ_PCI_SERR); else if (timer == &perr_timer) enable_irq(IRQ_PCI_PERR); } From 439dc05fbfdbcf01995f4941ac597b8668e00ec3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 4 Nov 2017 19:32:31 -0700 Subject: [PATCH 0936/1085] drivers/pcmcia: omap1: Fix error in automated timer conversion One part of automated timer conversion tools did not take into account void * variables when searching out prior direct timer callback usage, which resulted in an attempt to dereference the timer field without a proper type. Reported-by: kbuild test robot Signed-off-by: Kees Cook --- drivers/pcmcia/omap_cf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c index 8216ceb51b18..c2a17a79f0b2 100644 --- a/drivers/pcmcia/omap_cf.c +++ b/drivers/pcmcia/omap_cf.c @@ -102,7 +102,9 @@ static void omap_cf_timer(struct timer_list *t) */ static irqreturn_t omap_cf_irq(int irq, void *_cf) { - omap_cf_timer(&_cf->timer); + struct omap_cf_socket *cf = (struct omap_cf_socket *)_cf; + + omap_cf_timer(&cf->timer); return IRQ_HANDLED; } From f34d8d506eeff1375f0b3533839ce0b77b748437 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 16:43:19 -0700 Subject: [PATCH 0937/1085] crypto: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Jesper Nilsson Cc: Lars Persson Cc: Niklas Cassel Cc: Herbert Xu Cc: "David S. 
Miller" Cc: Jamie Iles Cc: linux-arm-kernel@axis.com Cc: linux-crypto@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Kees Cook Acked-by: Jamie Iles Acked-by: Lars Persson # for axis --- drivers/crypto/axis/artpec6_crypto.c | 6 +++--- drivers/crypto/mv_cesa.c | 4 ++-- drivers/crypto/picoxcell_crypto.c | 7 +++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index d9fbbf01062b..6b515c2f2c14 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -2074,9 +2074,9 @@ static void artpec6_crypto_process_queue(struct artpec6_crypto *ac) del_timer(&ac->timer); } -static void artpec6_crypto_timeout(unsigned long data) +static void artpec6_crypto_timeout(struct timer_list *t) { - struct artpec6_crypto *ac = (struct artpec6_crypto *) data; + struct artpec6_crypto *ac = from_timer(ac, t, timer); dev_info_ratelimited(artpec6_crypto_dev, "timeout\n"); @@ -3063,7 +3063,7 @@ static int artpec6_crypto_probe(struct platform_device *pdev) spin_lock_init(&ac->queue_lock); INIT_LIST_HEAD(&ac->queue); INIT_LIST_HEAD(&ac->pending); - setup_timer(&ac->timer, artpec6_crypto_timeout, (unsigned long) ac); + timer_setup(&ac->timer, artpec6_crypto_timeout, 0); ac->base = base; diff --git a/drivers/crypto/mv_cesa.c b/drivers/crypto/mv_cesa.c index bf25f415eea6..0eb2706f23c8 100644 --- a/drivers/crypto/mv_cesa.c +++ b/drivers/crypto/mv_cesa.c @@ -149,7 +149,7 @@ struct mv_req_hash_ctx { int count_add; }; -static void mv_completion_timer_callback(unsigned long unused) +static void mv_completion_timer_callback(struct timer_list *unused) { int active = readl(cpg->reg + SEC_ACCEL_CMD) & SEC_CMD_EN_SEC_ACCL0; @@ -167,7 +167,7 @@ static void mv_completion_timer_callback(unsigned long unused) static void mv_setup_timer(void) { - setup_timer(&cpg->completion_timer, &mv_completion_timer_callback, 0); + timer_setup(&cpg->completion_timer, mv_completion_timer_callback, 0); mod_timer(&cpg->completion_timer, jiffies + msecs_to_jiffies(MV_CESA_EXPIRE)); } diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c index b6f14844702e..5a6dc53b2b9d 100644 --- a/drivers/crypto/picoxcell_crypto.c +++ b/drivers/crypto/picoxcell_crypto.c @@ -1125,9 +1125,9 @@ static irqreturn_t spacc_spacc_irq(int irq, void *dev) return IRQ_HANDLED; } -static void spacc_packet_timeout(unsigned long data) +static void spacc_packet_timeout(struct timer_list *t) { - struct spacc_engine *engine = (struct spacc_engine *)data; + struct spacc_engine *engine = from_timer(engine, t, packet_timeout); spacc_process_done(engine); } @@ -1714,8 +1714,7 @@ static int spacc_probe(struct platform_device *pdev) writel(SPA_IRQ_EN_STAT_EN | SPA_IRQ_EN_GLBL_EN, engine->regs + SPA_IRQ_EN_REG_OFFSET); - setup_timer(&engine->packet_timeout, spacc_packet_timeout, - (unsigned long)engine); + timer_setup(&engine->packet_timeout, spacc_packet_timeout, 0); INIT_LIST_HEAD(&engine->pending); INIT_LIST_HEAD(&engine->completed); From c6f15047dd8681f1a6387b044188ef028306c4bf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 16:08:43 -0700 Subject: [PATCH 0938/1085] mailbox: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
Cc: Ley Foon Tan Cc: Jassi Brar Cc: nios2-dev@lists.rocketboards.org Signed-off-by: Kees Cook --- drivers/mailbox/mailbox-altera.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mailbox/mailbox-altera.c b/drivers/mailbox/mailbox-altera.c index bb682c926b0a..bcb29df9549e 100644 --- a/drivers/mailbox/mailbox-altera.c +++ b/drivers/mailbox/mailbox-altera.c @@ -57,6 +57,7 @@ struct altera_mbox { /* If the controller supports only RX polling mode */ struct timer_list rxpoll_timer; + struct mbox_chan *chan; }; static struct altera_mbox *mbox_chan_to_altera_mbox(struct mbox_chan *chan) @@ -138,12 +139,11 @@ static void altera_mbox_rx_data(struct mbox_chan *chan) } } -static void altera_mbox_poll_rx(unsigned long data) +static void altera_mbox_poll_rx(struct timer_list *t) { - struct mbox_chan *chan = (struct mbox_chan *)data; - struct altera_mbox *mbox = mbox_chan_to_altera_mbox(chan); + struct altera_mbox *mbox = from_timer(mbox, t, rxpoll_timer); - altera_mbox_rx_data(chan); + altera_mbox_rx_data(mbox->chan); mod_timer(&mbox->rxpoll_timer, jiffies + msecs_to_jiffies(MBOX_POLLING_MS)); @@ -206,8 +206,8 @@ static int altera_mbox_startup_receiver(struct mbox_chan *chan) polling: /* Setup polling timer */ - setup_timer(&mbox->rxpoll_timer, altera_mbox_poll_rx, - (unsigned long)chan); + mbox->chan = chan; + timer_setup(&mbox->rxpoll_timer, altera_mbox_poll_rx, 0); mod_timer(&mbox->rxpoll_timer, jiffies + msecs_to_jiffies(MBOX_POLLING_MS)); From 2bccef39c0d94b9ee428ae777c59cef1fced786c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 20:33:01 -0700 Subject: [PATCH 0939/1085] drbd: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
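This works even for a device with several embedded timers, as drbd has,
because each callback names the specific field it was registered on;
from_timer() is essentially just container_of():

    /* include/linux/timer.h, in essence: */
    #define from_timer(var, callback_timer, timer_fieldname) \
    	container_of(callback_timer, typeof(*var), timer_fieldname)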
Cc: Philipp Reisner Cc: Lars Ellenberg Cc: drbd-dev@lists.linbit.com Signed-off-by: Kees Cook --- drivers/block/drbd/drbd_int.h | 4 ++-- drivers/block/drbd/drbd_main.c | 18 +++++++----------- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_req.c | 4 ++-- drivers/block/drbd/drbd_req.h | 2 +- drivers/block/drbd/drbd_worker.c | 8 ++++---- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 7e8589ce631c..06ecee1b528e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1551,8 +1551,8 @@ extern int w_restart_disk_io(struct drbd_work *, int); extern int w_send_out_of_sync(struct drbd_work *, int); extern int w_start_resync(struct drbd_work *, int); -extern void resync_timer_fn(unsigned long data); -extern void start_resync_timer_fn(unsigned long data); +extern void resync_timer_fn(struct timer_list *t); +extern void start_resync_timer_fn(struct timer_list *t); extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8cb3791898ae..4b4697a1f963 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -64,7 +64,7 @@ static DEFINE_MUTEX(drbd_main_mutex); static int drbd_open(struct block_device *bdev, fmode_t mode); static void drbd_release(struct gendisk *gd, fmode_t mode); -static void md_sync_timer_fn(unsigned long data); +static void md_sync_timer_fn(struct timer_list *t); static int w_bitmap_io(struct drbd_work *w, int unused); MODULE_AUTHOR("Philipp Reisner , " @@ -2023,14 +2023,10 @@ void drbd_init_set_defaults(struct drbd_device *device) device->unplug_work.cb = w_send_write_hint; device->bm_io_work.w.cb = w_bitmap_io; - setup_timer(&device->resync_timer, resync_timer_fn, - (unsigned long)device); - setup_timer(&device->md_sync_timer, md_sync_timer_fn, - (unsigned long)device); - setup_timer(&device->start_resync_timer, start_resync_timer_fn, - (unsigned long)device); - setup_timer(&device->request_timer, request_timer_fn, - (unsigned long)device); + timer_setup(&device->resync_timer, resync_timer_fn, 0); + timer_setup(&device->md_sync_timer, md_sync_timer_fn, 0); + timer_setup(&device->start_resync_timer, start_resync_timer_fn, 0); + timer_setup(&device->request_timer, request_timer_fn, 0); init_waitqueue_head(&device->misc_wait); init_waitqueue_head(&device->state_wait); @@ -3721,9 +3717,9 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) return (bdev->md.flags & flag) != 0; } -static void md_sync_timer_fn(unsigned long data) +static void md_sync_timer_fn(struct timer_list *t) { - struct drbd_device *device = (struct drbd_device *) data; + struct drbd_device *device = from_timer(device, t, md_sync_timer); drbd_device_post_work(device, MD_SYNC); } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 796eaf347dc0..cb2fa63f6bc0 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -5056,7 +5056,7 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) wake_up(&device->misc_wait); del_timer_sync(&device->resync_timer); - resync_timer_fn((unsigned long)device); + resync_timer_fn(&device->resync_timer); /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, * w_make_resync_request etc. 
which may still be on the worker queue diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index de8566e55334..a500e738d929 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1714,9 +1714,9 @@ static bool net_timeout_reached(struct drbd_request *net_req, * to expire twice (worst case) to become effective. Good enough. */ -void request_timer_fn(unsigned long data) +void request_timer_fn(struct timer_list *t) { - struct drbd_device *device = (struct drbd_device *) data; + struct drbd_device *device = from_timer(device, t, request_timer); struct drbd_connection *connection = first_peer_device(device)->connection; struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */ struct net_conf *nc; diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index a2254f825601..cb97b3b30962 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -294,7 +294,7 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m); extern void complete_master_bio(struct drbd_device *device, struct bio_and_error *m); -extern void request_timer_fn(unsigned long data); +extern void request_timer_fn(struct timer_list *t); extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what); extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what); extern void tl_abort_disk_io(struct drbd_device *device); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 03471b3fce86..1476cb3439f4 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -457,9 +457,9 @@ int w_resync_timer(struct drbd_work *w, int cancel) return 0; } -void resync_timer_fn(unsigned long data) +void resync_timer_fn(struct timer_list *t) { - struct drbd_device *device = (struct drbd_device *) data; + struct drbd_device *device = from_timer(device, t, resync_timer); drbd_queue_work_if_unqueued( &first_peer_device(device)->connection->sender_work, @@ -1705,9 +1705,9 @@ void drbd_rs_controller_reset(struct drbd_device *device) rcu_read_unlock(); } -void start_resync_timer_fn(unsigned long data) +void start_resync_timer_fn(struct timer_list *t) { - struct drbd_device *device = (struct drbd_device *) data; + struct drbd_device *device = from_timer(device, t, start_resync_timer); drbd_device_post_work(device, RS_START); } From 10738ba8e02bc8cb1c4e832611621aa9666f4a24 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 21:06:32 -0700 Subject: [PATCH 0940/1085] ide: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: linux-ide@vger.kernel.org Signed-off-by: Kees Cook --- drivers/ide/ide-io.c | 4 ++-- drivers/ide/ide-probe.c | 2 +- include/linux/ide.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 3a234701d92c..6f25da56a169 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -611,9 +611,9 @@ static int drive_is_ready(ide_drive_t *drive) * logic that wants cleaning up. 
*/ -void ide_timer_expiry (unsigned long data) +void ide_timer_expiry (struct timer_list *t) { - ide_hwif_t *hwif = (ide_hwif_t *)data; + ide_hwif_t *hwif = from_timer(hwif, t, timer); ide_drive_t *uninitialized_var(drive); ide_handler_t *handler; unsigned long flags; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 01b2adfd8226..27a2488c7468 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1184,7 +1184,7 @@ static void ide_init_port_data(ide_hwif_t *hwif, unsigned int index) spin_lock_init(&hwif->lock); - setup_timer(&hwif->timer, &ide_timer_expiry, (unsigned long)hwif); + timer_setup(&hwif->timer, ide_timer_expiry, 0); init_completion(&hwif->gendev_rel_comp); diff --git a/include/linux/ide.h b/include/linux/ide.h index dc152e4b7f73..cc412175d036 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1211,7 +1211,7 @@ extern int ide_wait_not_busy(ide_hwif_t *hwif, unsigned long timeout); extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout); -extern void ide_timer_expiry(unsigned long); +extern void ide_timer_expiry(struct timer_list *t); extern irqreturn_t ide_intr(int irq, void *dev_id); extern void do_ide_request(struct request_queue *); extern void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq); From 5ea22086ed42bef8dde93f45a42a3ace486eb788 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 30 Aug 2017 14:53:24 -0700 Subject: [PATCH 0941/1085] block/aoe: discover_timer: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This refactors the discover_timer to remove the needless locking and state machine used for synchronizing timer death. Using del_timer_sync() will already do the right thing. Cc: Jens Axboe Cc: "Ed L. 
Cashin" Cc: Thomas Gleixner Signed-off-by: Kees Cook --- drivers/block/aoe/aoemain.c | 44 +++++++------------------------------ 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 4b987c2fefbe..251482066977 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -15,49 +15,19 @@ MODULE_AUTHOR("Sam Hopkins "); MODULE_DESCRIPTION("AoE block/char driver for 2.6.2 and newer 2.6 kernels"); MODULE_VERSION(VERSION); -enum { TINIT, TRUN, TKILL }; +static struct timer_list timer; -static void -discover_timer(ulong vp) +static void discover_timer(struct timer_list *t) { - static struct timer_list t; - static volatile ulong die; - static spinlock_t lock; - ulong flags; - enum { DTIMERTICK = HZ * 60 }; /* one minute */ + mod_timer(t, jiffies + HZ * 60); /* one minute */ - switch (vp) { - case TINIT: - init_timer(&t); - spin_lock_init(&lock); - t.data = TRUN; - t.function = discover_timer; - die = 0; - case TRUN: - spin_lock_irqsave(&lock, flags); - if (!die) { - t.expires = jiffies + DTIMERTICK; - add_timer(&t); - } - spin_unlock_irqrestore(&lock, flags); - - aoecmd_cfg(0xffff, 0xff); - return; - case TKILL: - spin_lock_irqsave(&lock, flags); - die = 1; - spin_unlock_irqrestore(&lock, flags); - - del_timer_sync(&t); - default: - return; - } + aoecmd_cfg(0xffff, 0xff); } static void aoe_exit(void) { - discover_timer(TKILL); + del_timer_sync(&timer); aoenet_exit(); unregister_blkdev(AOE_MAJOR, DEVICE_NAME); @@ -93,7 +63,9 @@ aoe_init(void) goto blkreg_fail; } printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION); - discover_timer(TINIT); + + timer_setup(&timer, discover_timer, 0); + discover_timer(&timer); return 0; blkreg_fail: aoecmd_exit(); From ded0eb83449e8fcba22fd2736826336e101ffbcb Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Fri, 3 Nov 2017 15:53:01 +1100 Subject: [PATCH 0942/1085] dt-bindings: pmbus: Add Maxim MAX31785 documentation Signed-off-by: Andrew Jeffery Acked-by: Rob Herring Signed-off-by: Guenter Roeck --- .../devicetree/bindings/hwmon/max31785.txt | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 Documentation/devicetree/bindings/hwmon/max31785.txt diff --git a/Documentation/devicetree/bindings/hwmon/max31785.txt b/Documentation/devicetree/bindings/hwmon/max31785.txt new file mode 100644 index 000000000000..106e08c56aaa --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/max31785.txt @@ -0,0 +1,22 @@ +Bindings for the Maxim MAX31785 Intelligent Fan Controller +========================================================== + +Reference: + +https://datasheets.maximintegrated.com/en/ds/MAX31785.pdf + +The Maxim MAX31785 is a PMBus device providing closed-loop, multi-channel fan +management with temperature and remote voltage sensing. Various fan control +features are provided, including PWM frequency control, temperature hysteresis, +dual tachometer measurements, and fan health monitoring. + +Required properties: +- compatible : One of "maxim,max31785" or "maxim,max31785a" +- reg : I2C address, one of 0x52, 0x53, 0x54, 0x55. + +Example: + + fans@52 { + compatible = "maxim,max31785"; + reg = <0x52>; + }; From 08fe92e2052c79e79d0570902f64bf991d6dd563 Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Fri, 13 Oct 2017 01:09:32 +0200 Subject: [PATCH 0943/1085] m68k: coldfire: add dspi0 module support This patch adds initial module base address and irq for dspi0. It also defines the dspi0 clock to be used by the Freescale driver. 
Signed-off-by: Angelo Dureghello Signed-off-by: Greg Ungerer --- arch/m68k/coldfire/m5441x.c | 3 ++- arch/m68k/include/asm/m5441xsim.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/m68k/coldfire/m5441x.c b/arch/m68k/coldfire/m5441x.c index 315d14b0dca0..55392af845fb 100644 --- a/arch/m68k/coldfire/m5441x.c +++ b/arch/m68k/coldfire/m5441x.c @@ -27,7 +27,7 @@ DEFINE_CLK(0, "intc.0", 18, MCF_CLK); DEFINE_CLK(0, "intc.1", 19, MCF_CLK); DEFINE_CLK(0, "intc.2", 20, MCF_CLK); DEFINE_CLK(0, "imx1-i2c.0", 22, MCF_CLK); -DEFINE_CLK(0, "mcfdspi.0", 23, MCF_CLK); +DEFINE_CLK(0, "fsl-dspi.0", 23, MCF_CLK); DEFINE_CLK(0, "mcfuart.0", 24, MCF_BUSCLK); DEFINE_CLK(0, "mcfuart.1", 25, MCF_BUSCLK); DEFINE_CLK(0, "mcfuart.2", 26, MCF_BUSCLK); @@ -140,6 +140,7 @@ static struct clk * const enable_clks[] __initconst = { &__clk_0_18, /* intc0 */ &__clk_0_19, /* intc0 */ &__clk_0_20, /* intc0 */ + &__clk_0_23, /* dspi.0 */ &__clk_0_24, /* uart0 */ &__clk_0_25, /* uart1 */ &__clk_0_26, /* uart2 */ diff --git a/arch/m68k/include/asm/m5441xsim.h b/arch/m68k/include/asm/m5441xsim.h index 4e9095b9480a..c87556d5581c 100644 --- a/arch/m68k/include/asm/m5441xsim.h +++ b/arch/m68k/include/asm/m5441xsim.h @@ -278,4 +278,10 @@ #define MCFGPIO_IRQ_VECBASE (MCFINT_VECBASE - MCFGPIO_IRQ_MIN) #define MCFGPIO_PIN_MAX 87 +/* + * DSPI module. + */ +#define MCFDSPI_BASE0 0xfc05c000 +#define MCF_IRQ_DSPI0 (MCFINT0_VECBASE + MCFINT0_DSPI0) + #endif /* m5441xsim_h */ From c8b61d508986b7e77bab11f8252b67e6e2b9a7cc Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Fri, 13 Oct 2017 00:42:51 +0200 Subject: [PATCH 0944/1085] m68k: add Sysam stmark2 open board support Add support for Sysam stmark2 board, an open hardware embedded Linux board, see http://sysam.it/cff_stmark2.html for any info. Signed-off-by: Angelo Dureghello Signed-off-by: Greg Ungerer --- arch/m68k/Kconfig.machine | 6 ++ arch/m68k/coldfire/Makefile | 3 +- arch/m68k/coldfire/stmark2.c | 119 ++++++++++++++++++++++++++++ arch/m68k/configs/stmark2_defconfig | 92 +++++++++++++++++++++ 4 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 arch/m68k/coldfire/stmark2.c create mode 100644 arch/m68k/configs/stmark2_defconfig diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine index 5cd57b4d3615..64a641467736 100644 --- a/arch/m68k/Kconfig.machine +++ b/arch/m68k/Kconfig.machine @@ -266,6 +266,12 @@ config AMCORE help Support for the Sysam AMCORE open-hardware generic board. +config STMARK2 + bool "Sysam stmark2 board support" + depends on M5441x + help + Support for the Sysam stmark2 open-hardware generic board. + config FIREBEE bool "FireBee board support" depends on M547x diff --git a/arch/m68k/coldfire/Makefile b/arch/m68k/coldfire/Makefile index f8cef9681416..573eabca1a3a 100644 --- a/arch/m68k/coldfire/Makefile +++ b/arch/m68k/coldfire/Makefile @@ -35,7 +35,8 @@ obj-$(CONFIG_NETtel) += nettel.o obj-$(CONFIG_CLEOPATRA) += nettel.o obj-$(CONFIG_FIREBEE) += firebee.o obj-$(CONFIG_MCF8390) += mcf8390.o -obj-$(CONFIG_AMCORE) += amcore.o +obj-$(CONFIG_AMCORE) += amcore.o +obj-$(CONFIG_STMARK2) += stmark2.o obj-$(CONFIG_PCI) += pci.o diff --git a/arch/m68k/coldfire/stmark2.c b/arch/m68k/coldfire/stmark2.c new file mode 100644 index 000000000000..a8d2b3d172f9 --- /dev/null +++ b/arch/m68k/coldfire/stmark2.c @@ -0,0 +1,119 @@ +/* + * stmark2.c -- Support for Sysam AMCORE open board + * + * (C) Copyright 2017, Angelo Dureghello + * + * This file is subject to the terms and conditions of the GNU General Public + * License. 
See the file COPYING in the main directory of this archive + * for more details. + */ + +#include <linux/platform_device.h> +#include <linux/mtd/partitions.h> +#include <linux/spi/spi.h> +#include <linux/spi/spi-fsl-dspi.h> +#include <linux/spi/flash.h> +#include <asm/mcfsim.h> + +/* + * Partitioning of parallel NOR flash (39VF3201B) + */ +static struct mtd_partition stmark2_partitions[] = { + { + .name = "U-Boot (1024K)", + .size = 0x100000, + .offset = 0x0 + }, { + .name = "Kernel+initramfs (7168K)", + .size = 0x700000, + .offset = MTDPART_OFS_APPEND + }, { + .name = "Flash Free Space (8192K)", + .size = MTDPART_SIZ_FULL, + .offset = MTDPART_OFS_APPEND + } +}; + +static struct flash_platform_data stmark2_spi_flash_data = { + .name = "is25lp128", + .parts = stmark2_partitions, + .nr_parts = ARRAY_SIZE(stmark2_partitions), + .type = "is25lp128", +}; + +static struct spi_board_info stmark2_board_info[] __initdata = { + { + .modalias = "m25p80", + .max_speed_hz = 5000000, + .bus_num = 0, + .chip_select = 1, + .platform_data = &stmark2_spi_flash_data, + .mode = SPI_MODE_3, + } +}; + +/* SPI controller data, SPI (0) */ +static struct fsl_dspi_platform_data dspi_spi0_info = { + .cs_num = 4, + .bus_num = 0, + .sck_cs_delay = 100, + .cs_sck_delay = 100, +}; + +static struct resource dspi_spi0_resource[] = { + [0] = { + .start = MCFDSPI_BASE0, + .end = MCFDSPI_BASE0 + 0xFF, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 12, + .end = 13, + .flags = IORESOURCE_DMA, + }, + [2] = { + .start = MCF_IRQ_DSPI0, + .end = MCF_IRQ_DSPI0, + .flags = IORESOURCE_IRQ, + }, +}; + +/* SPI controller, id = bus number */ +static struct platform_device dspi_spi0_device = { + .name = "fsl-dspi", + .id = 0, + .num_resources = ARRAY_SIZE(dspi_spi0_resource), + .resource = dspi_spi0_resource, + .dev = { + .platform_data = &dspi_spi0_info, + }, +}; + +static struct platform_device *stmark2_devices[] __initdata = { + &dspi_spi0_device, +}; + +/* + * Note: proper pin-mux setup is mandatory for proper SPI functionality.
+ */ +static int __init init_stmark2(void) +{ + /* DSPI0, all pins as DSPI, and using CS1 */ + __raw_writeb(0x80, MCFGPIO_PAR_DSPIOWL); + __raw_writeb(0xfc, MCFGPIO_PAR_DSPIOWH); + + /* Board gpio setup */ + __raw_writeb(0x00, MCFGPIO_PAR_BE); + __raw_writeb(0x00, MCFGPIO_PAR_FBCTL); + __raw_writeb(0x00, MCFGPIO_PAR_CS); + __raw_writeb(0x00, MCFGPIO_PAR_CANI2C); + + platform_add_devices(stmark2_devices, ARRAY_SIZE(stmark2_devices)); + + spi_register_board_info(stmark2_board_info, + ARRAY_SIZE(stmark2_board_info)); + + return 0; +} + +late_initcall(init_stmark2); diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig new file mode 100644 index 000000000000..55e55dbc2fb6 --- /dev/null +++ b/arch/m68k/configs/stmark2_defconfig @@ -0,0 +1,92 @@ +CONFIG_LOCALVERSION="stmark2-001" +CONFIG_DEFAULT_HOSTNAME="stmark2" +CONFIG_SYSVIPC=y +# CONFIG_FHANDLE is not set +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_NAMESPACES=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="../uClinux-dist/romfs" +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_AIO is not set +# CONFIG_ADVISE_SYSCALLS is not set +# CONFIG_MEMBARRIER is not set +CONFIG_EMBEDDED=y +# CONFIG_VM_EVENT_COUNTERS is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_LBDAF is not set +# CONFIG_BLK_DEV_BSG is not set +CONFIG_BLK_CMDLINE_PARSER=y +# CONFIG_MMU is not set +CONFIG_M5441x=y +CONFIG_CLOCK_FREQ=240000000 +CONFIG_STMARK2=y +CONFIG_RAMBASE=0x40000000 +CONFIG_RAMSIZE=0x8000000 +CONFIG_VECTORBASE=0x40000000 +CONFIG_KERNELBASE=0x40001000 +CONFIG_BINFMT_FLAT=y +CONFIG_BINFMT_MISC=y +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y +# CONFIG_ALLOW_DEV_COREDUMP is not set +CONFIG_MTD=y +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_CFI=y +CONFIG_MTD_JEDECPROBE=y +CONFIG_MTD_CFI_ADV_OPTIONS=y +CONFIG_MTD_CFI_LE_BYTE_SWAP=y +CONFIG_MTD_CFI_GEOMETRY=y +# CONFIG_MTD_CFI_I2 is not set +CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_CFI_STAA=y +CONFIG_MTD_ROM=y +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PLATRAM=y +CONFIG_MTD_M25P80=y +CONFIG_MTD_SPI_NOR=y +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +CONFIG_SERIO_LIBPS2=y +# CONFIG_UNIX98_PTYS is not set +# CONFIG_DEVMEM is not set +CONFIG_SERIAL_MCF=y +CONFIG_SERIAL_MCF_BAUDRATE=115200 +CONFIG_SERIAL_MCF_CONSOLE=y +# CONFIG_HW_RANDOM is not set +CONFIG_SPI=y +CONFIG_SPI_DEBUG=y +CONFIG_SPI_FSL_DSPI=y +CONFIG_DEBUG_GPIO=y +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC_PLATFORM=y +# CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set +# CONFIG_HID is not set +# CONFIG_USB_SUPPORT is not set +# CONFIG_FILE_LOCKING is not set +# CONFIG_DNOTIFY is not set +# CONFIG_INOTIFY_USER is not set +CONFIG_FSCACHE=y +# CONFIG_PROC_SYSCTL is not set +CONFIG_PRINTK_TIME=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set +CONFIG_SLUB_DEBUG_ON=y +CONFIG_PANIC_ON_OOPS=y +# CONFIG_SCHED_DEBUG is not set +# CONFIG_DEBUG_BUGVERBOSE is not set +CONFIG_BOOTPARAM=y +CONFIG_BOOTPARAM_STRING="console=ttyS0,115200 root=/dev/ram0 rw rootfstype=ramfs rdinit=/bin/init devtmpfs.mount=1" +CONFIG_CRYPTO=y +# CONFIG_CRYPTO_ECHAINIV is not set +CONFIG_CRYPTO_ANSI_CPRNG=y +# CONFIG_CRYPTO_HW is not set +CONFIG_CRC16=y From acbc845ffefd9fb70466182cd8555a26189462b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 31 Oct 2017 13:17:22 +0100 
Subject: [PATCH 0945/1085] x86/cpufeatures: Re-tabulate the X86_FEATURE definitions Over the years asm/cpufeatures.h has become somewhat of a mess: the original tabulation style was too narrow, while x86 feature names also kept growing in length, creating frequent field width overflows. Re-tabulate it to make it wider and easier to read/modify. Also harmonize the tabulation of the other defines in this file to match it. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171031121723.28524-3-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 508 ++++++++++++++--------------- 1 file changed, 254 insertions(+), 254 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 74370734663c..ad1b835001cc 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,8 +13,8 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 18 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ +#define NCAPINTS 18 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ /* * Note: If the comment begins with a quoted string, that string is used @@ -28,163 +28,163 @@ */ /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 
0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ /* cpu types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +#define X86_FEATURE_ART ( 3*32+10) /* Platform has always 
running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ #define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 
5*32+13) /* PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define 
X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -192,152 +192,152 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ -#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ -#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ +#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* 
Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ 
+#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ -#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ -#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ -#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ -#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for 
VPOPCNT[B,W] and VPSHUF-BITQMB */ -#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ -#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ /* * BUG word(s) */ -#define X86_BUG(x) (NCAPINTS*32 + (x)) +#define X86_BUG(x) (NCAPINTS*32 + (x)) -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional * to avoid confusion. 
*/ -#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif -#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ -#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ -#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ +#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ #endif /* _ASM_X86_CPUFEATURES_H */ From f3a624e901c633593156f7b00ca743a6204a29bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 31 Oct 2017 13:17:23 +0100 Subject: [PATCH 0946/1085] x86/cpufeatures: Fix various details in the feature definitions Kept this commit separate from the re-tabulation changes, to make the changes easier to review: - add better explanation for entries with no explanation - fix/enhance the text of some of the entries - fix the vertical alignment of some of the feature number definitions - fix inconsistent capitalization - ... and lots of other small details i.e. make it all more of a coherent unit, instead of a patchwork of years of additions. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171031121723.28524-4-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 149 ++++++++++++++--------------- 1 file changed, 74 insertions(+), 75 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ad1b835001cc..cdf5be866863 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -20,14 +20,12 @@ * Note: If the comment begins with a quoted string, that string is used * in /proc/cpuinfo instead of the macro name. If the string is "", * this feature bit is not displayed in /proc/cpuinfo at all. - */ - -/* + * * When adding new features here that depend on other features, - * please update the table in kernel/cpu/cpuid-deps.c + * please update the table in kernel/cpu/cpuid-deps.c as well. 
*/ -/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ #define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ @@ -42,8 +40,7 @@ #define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ #define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ #define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ #define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ #define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ #define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ @@ -63,15 +60,15 @@ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ #define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ #define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ #define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ #define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ #define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ #define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ @@ -84,66 +81,67 @@ #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* cpu types for specific tunings: */ + +/* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ #define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ +#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ +#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +#define X86_FEATURE_SYSCALL32 ( 
3*32+14) /* "" syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ #define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ -/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ #define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ #define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ #define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ #define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ #define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ #define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_CID ( 4*32+10) /* Context ID */ #define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ #define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ #define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ #define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ #define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ #define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ #define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ #define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ #define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ #define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ @@ -158,10 +156,10 @@ #define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ #define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ -/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ #define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ #define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ #define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ #define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ #define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ @@ -175,16 +173,16 @@ #define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ #define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ #define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache 
extension */ +#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ #define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ +#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ #define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -192,7 +190,7 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ #define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ @@ -206,8 +204,8 @@ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ @@ -218,19 +216,19 @@ #define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation 
extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ @@ -238,8 +236,8 @@ #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ #define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ @@ -251,25 +249,25 @@ #define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ #define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ -/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ +/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ #define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ #define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ #define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ -/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ +/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ #define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ @@ -281,7 +279,7 @@ #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy 
Perf. Preference */ #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ #define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ @@ -296,24 +294,24 @@ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ -#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ -#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ +#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ +/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ /* * BUG word(s) @@ -340,4 +338,5 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ + #endif /* _ASM_X86_CPUFEATURES_H */ From d65dfc81bb3894fdb68cbc74bbf5fb48d2354071 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 6 Nov 2017 18:46:32 +0100 Subject: [PATCH 0947/1085] x86/MCE/AMD: Always give panic severity for UC errors in kernel context The AMD severity grading function was introduced in kernel 4.1. The current logic can possibly give MCE_AR_SEVERITY for uncorrectable errors in kernel context. The system may then get stuck in a loop as memory_failure() will try to handle the bad kernel memory and find it busy. Return MCE_PANIC_SEVERITY for all UC errors IN_KERNEL context on AMD systems. 
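Distilled, the new decision for AMD uncorrectable errors is simply the following (an illustrative sketch only; the complete function is in the diff further below):

	if (m->status & MCI_STATUS_UC) {
		if (ctx == IN_KERNEL)
			return MCE_PANIC_SEVERITY;	/* UC in kernel context: always panic */
		/* ... overflow_recov/SMCA handling for user context ... */
		return MCE_AR_SEVERITY;			/* UC in user context: kill the task */
	}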
After: b2f9d678e28c ("x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries") was accepted in v4.6, this issue was masked because of the tail-end attempt at kernel mode recovery in the #MC handler. However, uncorrectable errors IN_KERNEL context should always be considered unrecoverable and cause a panic. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: # 4.9.x Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Fixes: bf80bbd7dcf5 (x86/mce: Add an AMD severities-grading function) Link: http://lkml.kernel.org/r/20171106174633.13576-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 87cc9ab7a13c..4b8187639c2d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -245,6 +245,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc if (m->status & MCI_STATUS_UC) { + if (ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + /* * On older systems where overflow_recov flag is not present, we * should simply panic if an error overflow occurs. If @@ -255,10 +258,6 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc if (mce_flags.smca) return mce_severity_amd_smca(m, ctx); - /* software can try to contain */ - if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - /* kill current process */ return MCE_AR_SEVERITY; } else { From 783ca517bfd62ca516178712775e4b273292d5b1 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 6 Nov 2017 18:46:33 +0100 Subject: [PATCH 0948/1085] x86/MCE/AMD: Fix mce_severity_amd_smca() signature Change the err_ctx type to "enum context" to match the type passed in. No functionality change. Suggested-by: Borislav Petkov Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20171106174633.13576-2-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4b8187639c2d..4ca632a06e0b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -204,7 +204,7 @@ static int error_context(struct mce *m) return IN_KERNEL; } -static int mce_severity_amd_smca(struct mce *m, int err_ctx) +static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) { u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); u32 low, high; From 693cb5580fdb026922363aa103add64b3ecd572e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 4 Nov 2017 04:19:48 -0700 Subject: [PATCH 0949/1085] selftests/x86/protection_keys: Fix syscall NR redefinition warnings On new enough glibc, the pkey syscalls numbers are available. 
Check first before defining them to avoid warnings like: protection_keys.c:198:0: warning: "SYS_pkey_alloc" redefined Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1fbef53a9e6befb7165ff855fc1a7d4788a191d6.1509794321.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/protection_keys.c | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c index 555e43ca846b..7a1cc0e56d2d 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/x86/protection_keys.c @@ -189,17 +189,29 @@ void lots_o_noops_around_write(int *write_to_me) #define u64 uint64_t #ifdef __i386__ -#define SYS_mprotect_key 380 -#define SYS_pkey_alloc 381 -#define SYS_pkey_free 382 + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif #define REG_IP_IDX REG_EIP #define si_pkey_offset 0x14 + #else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif #define REG_IP_IDX REG_RIP #define si_pkey_offset 0x20 + #endif void dump_mem(void *dumpme, int len_bytes) From d60ad744c9741586010d4bea286f09a063a90fbd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 4 Nov 2017 04:19:49 -0700 Subject: [PATCH 0950/1085] selftests/x86/ldt_gdt: Robustify against set_thread_area() and LAR oddities Bits 19:16 of LAR's result are undefined, and some upcoming improvements to the test case seem to trigger this. Mask off those bits to avoid spurious failures. commit 5b781c7e317f ("x86/tls: Forcibly set the accessed bit in TLS segments") adds a valid case in which LAR's output doesn't quite agree with set_thread_area()'s input. This isn't triggered in the test as is, but it will be if we start calling set_thread_area() with the accessed bit clear. Work around this discrepancy. I've added a Fixes tag so that -stable can pick this up if necessary. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5b781c7e317f ("x86/tls: Forcibly set the accessed bit in TLS segments") Link: http://lkml.kernel.org/r/b82f3f89c034b53580970ac865139fd8863f44e2.1509794321.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/ldt_gdt.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 961e3ee26c27..b0334338a4b0 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -115,7 +115,15 @@ static void check_valid_segment(uint16_t index, int ldt, return; } - if (ar != expected_ar) { + /* The SDM says "bits 19:16 are undefined". Thanks. */ + ar &= ~0xF0000; + + /* + * NB: Different Linux versions do different things with the + * accessed bit in set_thread_area(). + */ + if (ar != expected_ar && + (ldt || ar != (expected_ar | AR_ACCESSED))) { printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", (ldt ?
"LDT" : "GDT"), index, ar, expected_ar); nerrs++; From d744dcad39094c9187075e274d1cdef79c57c8b5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 4 Nov 2017 04:19:50 -0700 Subject: [PATCH 0951/1085] selftests/x86/ldt_gdt: Add infrastructure to test set_thread_area() Much of the test design could apply to set_thread_area() (i.e. GDT), not just modify_ldt(). Add set_thread_area() to the install_valid_mode() helper. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/02c23f8fba5547007f741dc24c3926e5284ede02.1509794321.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/ldt_gdt.c | 57 ++++++++++++++++++--------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index b0334338a4b0..45f30249133f 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -137,30 +137,51 @@ static void check_valid_segment(uint16_t index, int ldt, } } -static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, - bool oldmode) +static bool install_valid_mode(const struct user_desc *d, uint32_t ar, + bool oldmode, bool ldt) { - int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, - desc, sizeof(*desc)); - if (ret < -1) - errno = -ret; - if (ret == 0) { - uint32_t limit = desc->limit; - if (desc->limit_in_pages) - limit = (limit << 12) + 4095; - check_valid_segment(desc->entry_number, 1, ar, limit, true); - return true; - } else if (errno == ENOSYS) { - printf("[OK]\tmodify_ldt returned -ENOSYS\n"); + struct user_desc desc = *d; + int ret; + + if (!ldt) { +#ifndef __i386__ + /* No point testing set_thread_area in a 64-bit build */ return false; +#endif + if (!gdt_entry_num) + return false; + desc.entry_number = gdt_entry_num; + + ret = syscall(SYS_set_thread_area, &desc); } else { - if (desc->seg_32bit) { - printf("[FAIL]\tUnexpected modify_ldt failure %d\n", + ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, + &desc, sizeof(desc)); + + if (ret < -1) + errno = -ret; + + if (ret != 0 && errno == ENOSYS) { + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); + return false; + } + } + + if (ret == 0) { + uint32_t limit = desc.limit; + if (desc.limit_in_pages) + limit = (limit << 12) + 4095; + check_valid_segment(desc.entry_number, ldt, ar, limit, true); + return true; + } else { + if (desc.seg_32bit) { + printf("[FAIL]\tUnexpected %s failure %d\n", + ldt ? "modify_ldt" : "set_thread_area", errno); nerrs++; return false; } else { - printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); + printf("[OK]\t%s rejected 16 bit segment\n", + ldt ? "modify_ldt" : "set_thread_area"); return false; } } @@ -168,7 +189,7 @@ static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, static bool install_valid(const struct user_desc *desc, uint32_t ar) { - return install_valid_mode(desc, ar, false); + return install_valid_mode(desc, ar, false, true); } static void install_invalid(const struct user_desc *desc, bool oldmode) From adedf2893c192dd09b1cc2f2dcfdd7cad99ec49d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 4 Nov 2017 04:19:51 -0700 Subject: [PATCH 0952/1085] selftests/x86/ldt_gdt: Run most existing LDT test cases against the GDT as well Now that the main test infrastructure supports the GDT, run tests that will pass the kernel's GDT permission tests against the GDT. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/686a1eda63414da38fcecc2412db8dba1ae40581.1509794321.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/ldt_gdt.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 45f30249133f..3bb42fff5d66 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -189,7 +189,15 @@ static bool install_valid_mode(const struct user_desc *d, uint32_t ar, static bool install_valid(const struct user_desc *desc, uint32_t ar) { - return install_valid_mode(desc, ar, false, true); + bool ret = install_valid_mode(desc, ar, false, true); + + if (desc->contents <= 1 && desc->seg_32bit && + !desc->seg_not_present) { + /* Should work in the GDT, too. */ + install_valid_mode(desc, ar, false, false); + } + + return ret; } static void install_invalid(const struct user_desc *desc, bool oldmode) From fec8f5ae1715a01c72ad52cb2ecd8aacaf142302 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 4 Nov 2017 04:19:52 -0700 Subject: [PATCH 0953/1085] selftests/x86/ldt_gdt: Add a few additional tests for limits We weren't testing the .limit and .limit_in_pages fields very well. Add more tests. This addition seems to trigger the "bits 16:19 are undefined" issue that was fixed in an earlier patch. I think that, at least on my CPU, the high nibble of the limit ends up in LAR bits 16:19. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5601c15ea9b3113d288953fd2838b18bedf6bc67.1509794321.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/ldt_gdt.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 3bb42fff5d66..66e5ce5b91f0 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -404,9 +404,24 @@ static void do_simple_tests(void) install_invalid(&desc, false); desc.seg_not_present = 0; - desc.read_exec_only = 0; desc.seg_32bit = 1; + desc.read_exec_only = 0; + desc.limit = 0xfffff; + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB); + + desc.limit_in_pages = 1; + + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB | AR_G); + desc.read_exec_only = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P | AR_DB | AR_G); + desc.contents = 1; + desc.read_exec_only = 0; + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G); + desc.read_exec_only = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G); + + desc.limit = 0; install_invalid(&desc, true); } From 629a359bdb0e0652a8227b4ff3125431995fec6e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 7 Nov 2017 11:33:37 +0300 Subject: [PATCH 0954/1085] mm/sparsemem: Fix ARM64 boot crash when CONFIG_SPARSEMEM_EXTREME=y Since commit: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") we allocate the mem_section array dynamically in sparse_memory_present_with_active_regions(), but some architectures, like arm64, don't call the routine to initialize sparsemem. Let's move the initialization into memory_present(); it should cover all architectures.
Reported-and-tested-by: Sudeep Holla Tested-by: Bjorn Andersson Signed-off-by: Kirill A. Shutemov Acked-by: Will Deacon Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") Link: http://lkml.kernel.org/r/20171107083337.89952-1-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 10 ---------- mm/sparse.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8dfd13f724d9..77e4d3c5c57b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5646,16 +5646,6 @@ void __init sparse_memory_present_with_active_regions(int nid) unsigned long start_pfn, end_pfn; int i, this_nid; -#ifdef CONFIG_SPARSEMEM_EXTREME - if (!mem_section) { - unsigned long size, align; - - size = sizeof(struct mem_section) * NR_SECTION_ROOTS; - align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_virt_alloc(size, align); - } -#endif - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) memory_present(this_nid, start_pfn, end_pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index b00a97398795..d294148ba395 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -206,6 +206,16 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) { unsigned long pfn; +#ifdef CONFIG_SPARSEMEM_EXTREME + if (unlikely(!mem_section)) { + unsigned long size, align; + + size = sizeof(struct mem_section) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_virt_alloc(size, align); + } +#endif + start &= PAGE_SECTION_MASK; mminit_validate_memmodel_limits(&start, &end); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { From 4366d57af19e89d23ef9154414b2539f1c3d18fa Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 6 Nov 2017 11:32:56 +0800 Subject: [PATCH 0955/1085] x86/build: Factor out fdimage/isoimage generation commands to standalone script The build messages for fdimage/isoimage generation are pretty unstructured, just the raw shell command blocks are printed. Emit shortened messages similar to existing kbuild messages, and move the Makefile commands into a separate shell script - which is much easier to handle. This patch factors out the commands used for fdimage/isoimage generation from arch/x86/boot/Makefile to a new script arch/x86/boot/genimage.sh. Then it adds the new kbuild command 'genimage' which invokes the new script. All fdimages/isoimage files are now generated by a call to 'genimage' with different parameters. Now 'make isoimage' becomes: ... 
Kernel: arch/x86/boot/bzImage is ready (#30) GENIMAGE arch/x86/boot/image.iso Size of boot image is 4 sectors -> No emulation 15.37% done, estimate finish Sun Nov 5 23:36:57 2017 30.68% done, estimate finish Sun Nov 5 23:36:57 2017 46.04% done, estimate finish Sun Nov 5 23:36:57 2017 61.35% done, estimate finish Sun Nov 5 23:36:57 2017 76.69% done, estimate finish Sun Nov 5 23:36:57 2017 92.00% done, estimate finish Sun Nov 5 23:36:57 2017 Total translation table size: 2048 Total rockridge attributes bytes: 659 Total directory bytes: 0 Path table size(bytes): 10 Max brk space used 0 32608 extents written (63 MB) Kernel: arch/x86/boot/image.iso is ready Before: Kernel: arch/x86/boot/bzImage is ready (#63) rm -rf arch/x86/boot/isoimage mkdir arch/x86/boot/isoimage for i in lib lib64 share end ; do \ if [ -f /usr/$i/syslinux/isolinux.bin ] ; then \ cp /usr/$i/syslinux/isolinux.bin arch/x86/boot/isoimage ; \ if [ -f /usr/$i/syslinux/ldlinux.c32 ]; then \ cp /usr/$i/syslinux/ldlinux.c32 arch/x86/boot/isoimage ; \ fi ; \ break ; \ fi ; \ if [ $i = end ] ; then exit 1 ; fi ; \ done ... Suggested-by: Ingo Molnar Signed-off-by: Changbin Du Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1509939179-7556-2-git-send-email-changbin.du@intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/Makefile | 59 ++++----------------- arch/x86/boot/genimage.sh | 105 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 48 deletions(-) create mode 100644 arch/x86/boot/genimage.sh diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index d88a2fddba8c..9b5adae9cc40 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -123,63 +123,26 @@ image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) $(obj)/mtools.conf: $(src)/mtools.conf.in sed -e 's|@OBJ@|$(obj)|g' < $< > $@ +quiet_cmd_genimage = GENIMAGE $3 +cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ + $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD) + # This requires write access to /dev/fd0 bzdisk: $(obj)/bzImage $(obj)/mtools.conf - MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync - syslinux /dev/fd0 ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync + $(call cmd,genimage,bzdisk,/dev/fd0) # These require being root or having syslinux 2.02 or higher installed fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf - dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 - MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync - syslinux $(obj)/fdimage ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync + $(call cmd,genimage,fdimage144,$(obj)/fdimage) + @$(kecho) 'Kernel: $(obj)/fdimage is ready' fdimage288: $(obj)/bzImage $(obj)/mtools.conf - dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 - MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync - syslinux $(obj)/fdimage ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux 
; sync + $(call cmd,genimage,fdimage288,$(obj)/fdimage) + @$(kecho) 'Kernel: $(obj)/fdimage is ready' isoimage: $(obj)/bzImage - -rm -rf $(obj)/isoimage - mkdir $(obj)/isoimage - for i in lib lib64 share end ; do \ - if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ - cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ - if [ -f /usr/$$i/syslinux/ldlinux.c32 ]; then \ - cp /usr/$$i/syslinux/ldlinux.c32 $(obj)/isoimage ; \ - fi ; \ - break ; \ - fi ; \ - if [ $$i = end ] ; then exit 1 ; fi ; \ - done - cp $(obj)/bzImage $(obj)/isoimage/linux - echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ - fi - mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ - -no-emul-boot -boot-load-size 4 -boot-info-table \ - $(obj)/isoimage - isohybrid $(obj)/image.iso 2>/dev/null || true - rm -rf $(obj)/isoimage + $(call cmd,genimage,isoimage,$(obj)/image.iso) + @$(kecho) 'Kernel: $(obj)/image.iso is ready' bzlilo: $(obj)/bzImage if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh new file mode 100644 index 000000000000..75a9de1d3f72 --- /dev/null +++ b/arch/x86/boot/genimage.sh @@ -0,0 +1,105 @@ +#!/bin/sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 2017 by Changbin Du +# +# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others +# +# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture +# +# Arguments: +# $1 - fdimage format +# $2 - target image file +# $3 - kernel bzImage file +# $4 - mtool configuration file +# $5 - kernel cmdline +# $6 - inird image file +# + +verify () { + if [ ! 
-f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo "" 1>&2 + exit 1 + fi +} + + +export MTOOLSRC=$4 +FIMAGE=$2 +FBZIMAGE=$3 +KCMDLINE=$5 +FDINITRD=$6 + +# Make sure the files actually exist +verify "$FBZIMAGE" +verify "$MTOOLSRC" + +genbzdisk() { + mformat a: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - a:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" a:initrd.img + fi + mcopy $FBZIMAGE a:linux +} + +genfdimage144() { + dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 + mformat v: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - v:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" v:initrd.img + fi + mcopy $FBZIMAGE v:linux +} + +genfdimage288() { + dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 + mformat w: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - W:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" w:initrd.img + fi + mcopy $FBZIMAGE w:linux +} + +genisoimage() { + tmp_dir=`dirname $FIMAGE`/isoimage + rm -rf $tmp_dir + mkdir $tmp_dir + for i in lib lib64 share end ; do + if [ -f /usr/$i/syslinux/isolinux.bin ] ; then + cp /usr/$i/syslinux/isolinux.bin $tmp_dir + if [ -f /usr/$i/syslinux/ldlinux.c32 ]; then + cp /usr/$i/syslinux/ldlinux.c32 $tmp_dir + fi + break + fi + if [ $i = end ] ; then exit 1 ; fi ; + done + cp $FBZIMAGE $tmp_dir/linux + echo "$KCMDLINE" > $tmp_dir/isolinux.cfg + if [ -f "$FDINITRD" ] ; then + cp "$FDINITRD" $tmp_dir/initrd.img + fi + mkisofs -J -r -o $FIMAGE -b isolinux.bin -c boot.cat \ + -no-emul-boot -boot-load-size 4 -boot-info-table $tmp_dir + isohybrid $FIMAGE 2>/dev/null || true + rm -rf $tmp_dir +} + +case $1 in + bzdisk) genbzdisk;; + fdimage144) genfdimage144;; + fdimage288) genfdimage288;; + isoimage) genisoimage;; + *) echo 'Unknown image format'; exit 1; +esac From 8a7546a04ecb7c9388a3e935c6cfcc7c2c4c6b67 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 6 Nov 2017 11:32:57 +0800 Subject: [PATCH 0956/1085] x86/build: Add new paths for isolinux.bin and ldlinux.c32 Recently I failed to build isoimage target, because the path of isolinux.bin changed to /usr/xxx/ISOLINUX/isolinux.bin, as well as ldlinux.c32 which changed to /usr/xxx/syslinux/modules/bios/ldlinux.c32. This patch improves the file search logic: - Show a error message instead of silent fail. - Add above new paths. 
Signed-off-by: Changbin Du Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yamada.masahiro@socionext.com Link: http://lkml.kernel.org/r/1509939179-7556-3-git-send-email-changbin.du@intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/genimage.sh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 75a9de1d3f72..6c8100b51ea9 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -76,14 +76,27 @@ genisoimage() { rm -rf $tmp_dir mkdir $tmp_dir for i in lib lib64 share end ; do - if [ -f /usr/$i/syslinux/isolinux.bin ] ; then - cp /usr/$i/syslinux/isolinux.bin $tmp_dir - if [ -f /usr/$i/syslinux/ldlinux.c32 ]; then - cp /usr/$i/syslinux/ldlinux.c32 $tmp_dir + for j in syslinux ISOLINUX ; do + if [ -f /usr/$i/$j/isolinux.bin ] ; then + isolinux=/usr/$i/$j/isolinux.bin + echo "Using $isolinux" + cp $isolinux $tmp_dir fi + done + for j in syslinux syslinux/modules/bios ; do + if [ -f /usr/$i/$j/ldlinux.c32 ]; then + ldlinux=/usr/$i/$j/ldlinux.c32 + echo "Using $ldlinux" + cp $ldlinux $tmp_dir + fi + done + if [ -n "$isolinux" -a -n "$ldlinux" ] ; then break fi - if [ $i = end ] ; then exit 1 ; fi ; + if [ $i = end -a -z "$isolinux" ] ; then + echo 'Need an isolinux.bin file, please install syslinux/isolinux.' + exit 1 + fi done cp $FBZIMAGE $tmp_dir/linux echo "$KCMDLINE" > $tmp_dir/isolinux.cfg From c306ba7b909408dbf935bb4a8dc62fae64cb337c Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 6 Nov 2017 11:32:58 +0800 Subject: [PATCH 0957/1085] x86/build: Specify -input-charset=utf-8 for mkisofs It avoids the following warning triggered by newer versions of mkisofs: -input-charset not specified, using utf-8 (detected in locale settings) Signed-off-by: Changbin Du Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yamada.masahiro@socionext.com Link: http://lkml.kernel.org/r/1509939179-7556-4-git-send-email-changbin.du@intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/genimage.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 6c8100b51ea9..628e93662093 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -103,8 +103,9 @@ genisoimage() { if [ -f "$FDINITRD" ] ; then cp "$FDINITRD" $tmp_dir/initrd.img fi - mkisofs -J -r -o $FIMAGE -b isolinux.bin -c boot.cat \ - -no-emul-boot -boot-load-size 4 -boot-info-table $tmp_dir + mkisofs -J -r -input-charset=utf-8 -o $FIMAGE -b isolinux.bin \ + -c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \ + $tmp_dir isohybrid $FIMAGE 2>/dev/null || true rm -rf $tmp_dir } From d786f05175fae09cc39dfa5769c62649341caeb4 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 6 Nov 2017 11:32:59 +0800 Subject: [PATCH 0958/1085] x86/build: Add more generated files to the .gitignore file Some of the files generated by the build process were not listed. 
Signed-off-by: Changbin Du Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yamada.masahiro@socionext.com Link: http://lkml.kernel.org/r/1509939179-7556-5-git-send-email-changbin.du@intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index e3cf9f682be5..09d25dd09307 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -7,3 +7,6 @@ zoffset.h setup setup.bin setup.elf +fdimage +mtools.conf +image.iso From 4650209b166789182657c8eb0612cecd5b54d591 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 4 Nov 2017 13:30:52 +0900 Subject: [PATCH 0959/1085] arm/kprobes: Fix kretprobe test to check correct counter test_kretprobe() uses jprobe_func_called at the last test, but it must check kretprobe_handler_called. Signed-off-by: Masami Hiramatsu Cc: Arnd Bergmann Cc: Jon Medhurst Cc: Linus Torvalds Cc: Mark Brown Cc: Peter Zijlstra Cc: Russell King Cc: Stephen Rothwell Cc: Thomas Gleixner Cc: Wang Nan Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/150976985182.2012.15495311380682779381.stgit@devbox Signed-off-by: Ingo Molnar --- arch/arm/probes/kprobes/test-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/probes/kprobes/test-core.c b/arch/arm/probes/kprobes/test-core.c index 1c98a87786ca..9c3ceba69015 100644 --- a/arch/arm/probes/kprobes/test-core.c +++ b/arch/arm/probes/kprobes/test-core.c @@ -451,7 +451,7 @@ static int test_kretprobe(long (*func)(long, long)) } if (!call_test_func(func, false)) return -EINVAL; - if (jprobe_func_called == test_func_instance) { + if (kretprobe_handler_called == test_func_instance) { pr_err("FAIL: kretprobe called after unregistering\n"); return -EINVAL; } From a443026a48ad7a8b1b966b00fb5d7111b81a219b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 4 Nov 2017 13:31:21 +0900 Subject: [PATCH 0960/1085] arm/kprobes: Remove jprobe test case Remove the jprobes test case because jprobes is a deprecated feature. 
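Former jprobe users get equivalent function-entry access from a kretprobe entry_handler, so nothing is lost by dropping the test. A hedged sketch of the replacement pattern on 32-bit ARM (not part of this patch; the probed symbol is a placeholder and module init/exit boilerplate is omitted):

#include <linux/kprobes.h>
#include <linux/ptrace.h>

/* Runs at function entry, where a jprobe handler used to run. */
static int sample_entry(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* On 32-bit ARM the first two arguments arrive in r0/r1. */
	pr_info("args: %lx %lx\n", regs->ARM_r0, regs->ARM_r1);
	return 0;
}

/* Runs at function return. */
static int sample_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	pr_info("returned: %lx\n", regs_return_value(regs));
	return 0;
}

static struct kretprobe sample_rp = {
	.kp.symbol_name	= "_do_fork",	/* placeholder target */
	.entry_handler	= sample_entry,
	.handler	= sample_ret,
};

/* register_kretprobe(&sample_rp) in module init, unregister_kretprobe() in exit. */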
Signed-off-by: Masami Hiramatsu Cc: Arnd Bergmann Cc: Jon Medhurst Cc: Linus Torvalds Cc: Mark Brown Cc: Peter Zijlstra Cc: Russell King Cc: Stephen Rothwell Cc: Thomas Gleixner Cc: Wang Nan Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/150976988105.2012.13618117383683725047.stgit@devbox Signed-off-by: Ingo Molnar --- arch/arm/probes/kprobes/test-core.c | 57 ----------------------------- 1 file changed, 57 deletions(-) diff --git a/arch/arm/probes/kprobes/test-core.c b/arch/arm/probes/kprobes/test-core.c index 9c3ceba69015..9ed0129bed3c 100644 --- a/arch/arm/probes/kprobes/test-core.c +++ b/arch/arm/probes/kprobes/test-core.c @@ -227,7 +227,6 @@ static bool test_regs_ok; static int test_func_instance; static int pre_handler_called; static int post_handler_called; -static int jprobe_func_called; static int kretprobe_handler_called; static int tests_failed; @@ -370,50 +369,6 @@ static int test_kprobe(long (*func)(long, long)) return 0; } -static void __kprobes jprobe_func(long r0, long r1) -{ - jprobe_func_called = test_func_instance; - if (r0 == FUNC_ARG1 && r1 == FUNC_ARG2) - test_regs_ok = true; - jprobe_return(); -} - -static struct jprobe the_jprobe = { - .entry = jprobe_func, -}; - -static int test_jprobe(long (*func)(long, long)) -{ - int ret; - - the_jprobe.kp.addr = (kprobe_opcode_t *)func; - ret = register_jprobe(&the_jprobe); - if (ret < 0) { - pr_err("FAIL: register_jprobe failed with %d\n", ret); - return ret; - } - - ret = call_test_func(func, true); - - unregister_jprobe(&the_jprobe); - the_jprobe.kp.flags = 0; /* Clear disable flag to allow reuse */ - - if (!ret) - return -EINVAL; - if (jprobe_func_called != test_func_instance) { - pr_err("FAIL: jprobe handler function not called\n"); - return -EINVAL; - } - if (!call_test_func(func, false)) - return -EINVAL; - if (jprobe_func_called == test_func_instance) { - pr_err("FAIL: probe called after unregistering\n"); - return -EINVAL; - } - - return 0; -} - static int __kprobes kretprobe_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -468,18 +423,6 @@ static int run_api_tests(long (*func)(long, long)) if (ret < 0) return ret; - pr_info(" jprobe\n"); - ret = test_jprobe(func); -#if defined(CONFIG_THUMB2_KERNEL) && !defined(MODULE) - if (ret == -EINVAL) { - pr_err("FAIL: Known longtime bug with jprobe on Thumb kernels\n"); - tests_failed = ret; - ret = 0; - } -#endif - if (ret < 0) - return ret; - pr_info(" kretprobe\n"); ret = test_kretprobe(func); if (ret < 0) From 6ef930f20c30f8a7dcffa50fa9f33a9211727a6e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 7 Nov 2017 10:04:38 +0000 Subject: [PATCH 0961/1085] irqchip/gic-v3-its: Fix VPE activate callback return value its_vpe_irq_domain_activate should always return 0. Really. There is not a single case why it wouldn't. So this "return true;" is really a copy/paste issue that got revealed now that we actually check the return value of the activate method. Brown paper bag day. Fixes: 2247e1bf7063 ("irqchip/gic-v3-its: Limit scope of VPE mapping to be per ITS") Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 29b2ff5c6841..be99d59bf636 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2802,7 +2802,7 @@ static int its_vpe_irq_domain_activate(struct irq_domain *domain, /* If we use the list map, we issue VMAPP on demand... 
*/ if (its_list_map) - return true; + return 0; /* Map the VPE to the first possible CPU */ vpe->col_idx = cpumask_first(cpu_online_mask); From c5e260890d5fd6687287f2b532c738a359a61a82 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 1 Nov 2017 11:54:26 -0500 Subject: [PATCH 0962/1085] x86/mm: Remove unnecessary TLB flush for SME in-place encryption A TLB flush is not required when doing in-place encryption or decryption since the area's pagetable attributes are not being altered. To avoid confusion between what the routine is doing and what is documented in the AMD APM, delete the local_flush_tlb() call. Suggested-by: Dave Hansen Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171101165426.1388.24866.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 16c5f37933a2..53680a9dfbed 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -63,7 +63,6 @@ static void __init __sme_early_enc_dec(resource_size_t paddr, if (!sme_me_mask) return; - local_flush_tlb(); wbinvd(); /* From 0ea04c7322b0dbbc4e7a862451855b10ef9922d3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 6 Nov 2017 18:34:36 +0000 Subject: [PATCH 0963/1085] dt-bindings: Add description of Socionext EXIU interrupt controller Add a description of the External Interrupt Unit (EXIU) interrupt controller as found on the Socionext SynQuacer SoC. Acked-by: Rob Herring Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- .../socionext,synquacer-exiu.txt | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/socionext,synquacer-exiu.txt diff --git a/Documentation/devicetree/bindings/interrupt-controller/socionext,synquacer-exiu.txt b/Documentation/devicetree/bindings/interrupt-controller/socionext,synquacer-exiu.txt new file mode 100644 index 000000000000..8b2faefe29ca --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/socionext,synquacer-exiu.txt @@ -0,0 +1,32 @@ +Socionext SynQuacer External Interrupt Unit (EXIU) + +The Socionext Synquacer SoC has an external interrupt unit (EXIU) +that forwards a block of 32 configurable input lines to 32 adjacent +level-high type GICv3 SPIs. + +Required properties: + +- compatible : Should be "socionext,synquacer-exiu". +- reg : Specifies base physical address and size of the + control registers. +- interrupt-controller : Identifies the node as an interrupt controller. +- #interrupt-cells : Specifies the number of cells needed to encode an + interrupt source. The value must be 3. +- interrupt-parent : phandle of the GIC these interrupts are routed to. +- socionext,spi-base : The SPI number of the first SPI of the 32 adjacent + ones the EXIU forwards its interrups to. + +Notes: + +- Only SPIs can use the EXIU as an interrupt parent. 
+ +Example: + + exiu: interrupt-controller@510c0000 { + compatible = "socionext,synquacer-exiu"; + reg = <0x0 0x510c0000 0x0 0x20>; + interrupt-controller; + interrupt-parent = <&gic>; + #interrupt-cells = <3>; + socionext,spi-base = <112>; + }; From 706cffc1b912342668e621526c860fb093dfc2d5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 6 Nov 2017 18:34:37 +0000 Subject: [PATCH 0964/1085] irqchip/exiu: Add support for Socionext Synquacer EXIU controller The Socionext Synquacer SoC has an external interrupt unit (EXIU) that forwards a block of 32 configurable input lines to 32 adjacent level-high type GICv3 SPIs. The EXIU has per-interrupt level/edge and polarity controls, and mask bits that keep the outgoing lines de-asserted, even though the controller may still latch interrupt conditions that occur while the line is masked. Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- arch/arm64/Kconfig.platforms | 3 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-sni-exiu.c | 227 +++++++++++++++++++++++++++++++++ 3 files changed, 231 insertions(+) create mode 100644 drivers/irqchip/irq-sni-exiu.c diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index 6b54ee8c1262..1d03ef54295a 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -161,6 +161,9 @@ config ARCH_SEATTLE config ARCH_SHMOBILE bool +config ARCH_SYNQUACER + bool "Socionext SynQuacer SoC Family" + config ARCH_RENESAS bool "Renesas SoC Platforms" select ARCH_SHMOBILE diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 065adf4102c9..dee3390390d5 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -79,4 +79,5 @@ obj-$(CONFIG_ARCH_ASPEED) += irq-aspeed-vic.o irq-aspeed-i2c-ic.o obj-$(CONFIG_STM32_EXTI) += irq-stm32-exti.o obj-$(CONFIG_QCOM_IRQ_COMBINER) += qcom-irq-combiner.o obj-$(CONFIG_IRQ_UNIPHIER_AIDET) += irq-uniphier-aidet.o +obj-$(CONFIG_ARCH_SYNQUACER) += irq-sni-exiu.o obj-$(CONFIG_MESON_IRQ_GPIO) += irq-meson-gpio.o diff --git a/drivers/irqchip/irq-sni-exiu.c b/drivers/irqchip/irq-sni-exiu.c new file mode 100644 index 000000000000..1b6e2f7c59af --- /dev/null +++ b/drivers/irqchip/irq-sni-exiu.c @@ -0,0 +1,227 @@ +/* + * Driver for Socionext External Interrupt Unit (EXIU) + * + * Copyright (c) 2017 Linaro, Ltd. + * + * Based on irq-tegra.c: + * Copyright (C) 2011 Google, Inc. + * Copyright (C) 2010,2013, NVIDIA Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NUM_IRQS 32 + +#define EIMASK 0x00 +#define EISRCSEL 0x04 +#define EIREQSTA 0x08 +#define EIRAWREQSTA 0x0C +#define EIREQCLR 0x10 +#define EILVL 0x14 +#define EIEDG 0x18 +#define EISIR 0x1C + +struct exiu_irq_data { + void __iomem *base; + u32 spi_base; +}; + +static void exiu_irq_eoi(struct irq_data *d) +{ + struct exiu_irq_data *data = irq_data_get_irq_chip_data(d); + + writel(BIT(d->hwirq), data->base + EIREQCLR); + irq_chip_eoi_parent(d); +} + +static void exiu_irq_mask(struct irq_data *d) +{ + struct exiu_irq_data *data = irq_data_get_irq_chip_data(d); + u32 val; + + val = readl_relaxed(data->base + EIMASK) | BIT(d->hwirq); + writel_relaxed(val, data->base + EIMASK); + irq_chip_mask_parent(d); +} + +static void exiu_irq_unmask(struct irq_data *d) +{ + struct exiu_irq_data *data = irq_data_get_irq_chip_data(d); + u32 val; + + val = readl_relaxed(data->base + EIMASK) & ~BIT(d->hwirq); + writel_relaxed(val, data->base + EIMASK); + irq_chip_unmask_parent(d); +} + +static void exiu_irq_enable(struct irq_data *d) +{ + struct exiu_irq_data *data = irq_data_get_irq_chip_data(d); + u32 val; + + /* clear interrupts that were latched while disabled */ + writel_relaxed(BIT(d->hwirq), data->base + EIREQCLR); + + val = readl_relaxed(data->base + EIMASK) & ~BIT(d->hwirq); + writel_relaxed(val, data->base + EIMASK); + irq_chip_enable_parent(d); +} + +static int exiu_irq_set_type(struct irq_data *d, unsigned int type) +{ + struct exiu_irq_data *data = irq_data_get_irq_chip_data(d); + u32 val; + + val = readl_relaxed(data->base + EILVL); + if (type == IRQ_TYPE_EDGE_RISING || type == IRQ_TYPE_LEVEL_HIGH) + val |= BIT(d->hwirq); + else + val &= ~BIT(d->hwirq); + writel_relaxed(val, data->base + EILVL); + + val = readl_relaxed(data->base + EIEDG); + if (type == IRQ_TYPE_LEVEL_LOW || type == IRQ_TYPE_LEVEL_HIGH) + val &= ~BIT(d->hwirq); + else + val |= BIT(d->hwirq); + writel_relaxed(val, data->base + EIEDG); + + writel_relaxed(BIT(d->hwirq), data->base + EIREQCLR); + + return irq_chip_set_type_parent(d, IRQ_TYPE_LEVEL_HIGH); +} + +static struct irq_chip exiu_irq_chip = { + .name = "EXIU", + .irq_eoi = exiu_irq_eoi, + .irq_enable = exiu_irq_enable, + .irq_mask = exiu_irq_mask, + .irq_unmask = exiu_irq_unmask, + .irq_set_type = exiu_irq_set_type, + .irq_set_affinity = irq_chip_set_affinity_parent, + .flags = IRQCHIP_SET_TYPE_MASKED | + IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_EOI_THREADED | + IRQCHIP_MASK_ON_SUSPEND, +}; + +static int exiu_domain_translate(struct irq_domain *domain, + struct irq_fwspec *fwspec, + unsigned long *hwirq, + unsigned int *type) +{ + struct exiu_irq_data *info = domain->host_data; + + if (is_of_node(fwspec->fwnode)) { + if (fwspec->param_count != 3) + return -EINVAL; + + if (fwspec->param[0] != GIC_SPI) + return -EINVAL; /* No PPI should point to this domain */ + + *hwirq = fwspec->param[1] - info->spi_base; + *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; + return 0; + } + return -EINVAL; +} + +static int exiu_domain_alloc(struct irq_domain *dom, unsigned int virq, + unsigned int nr_irqs, void *data) +{ + struct irq_fwspec *fwspec = data; + struct irq_fwspec parent_fwspec; + struct exiu_irq_data *info = dom->host_data; + irq_hw_number_t hwirq; + + if (fwspec->param_count != 3) + return -EINVAL; /* Not GIC compliant */ + if (fwspec->param[0] != GIC_SPI) + return -EINVAL; /* No PPI should point to this domain */ + + WARN_ON(nr_irqs != 1); + hwirq = fwspec->param[1] - 
info->spi_base; + irq_domain_set_hwirq_and_chip(dom, virq, hwirq, &exiu_irq_chip, info); + + parent_fwspec = *fwspec; + parent_fwspec.fwnode = dom->parent->fwnode; + return irq_domain_alloc_irqs_parent(dom, virq, nr_irqs, &parent_fwspec); +} + +static const struct irq_domain_ops exiu_domain_ops = { + .translate = exiu_domain_translate, + .alloc = exiu_domain_alloc, + .free = irq_domain_free_irqs_common, +}; + +static int __init exiu_init(struct device_node *node, + struct device_node *parent) +{ + struct irq_domain *parent_domain, *domain; + struct exiu_irq_data *data; + int err; + + if (!parent) { + pr_err("%pOF: no parent, giving up\n", node); + return -ENODEV; + } + + parent_domain = irq_find_host(parent); + if (!parent_domain) { + pr_err("%pOF: unable to obtain parent domain\n", node); + return -ENXIO; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (of_property_read_u32(node, "socionext,spi-base", &data->spi_base)) { + pr_err("%pOF: failed to parse 'spi-base' property\n", node); + err = -ENODEV; + goto out_free; + } + + data->base = of_iomap(node, 0); + if (IS_ERR(data->base)) { + err = PTR_ERR(data->base); + goto out_free; + } + + /* clear and mask all interrupts */ + writel_relaxed(0xFFFFFFFF, data->base + EIREQCLR); + writel_relaxed(0xFFFFFFFF, data->base + EIMASK); + + domain = irq_domain_add_hierarchy(parent_domain, 0, NUM_IRQS, node, + &exiu_domain_ops, data); + if (!domain) { + pr_err("%pOF: failed to allocate domain\n", node); + err = -ENOMEM; + goto out_unmap; + } + + pr_info("%pOF: %d interrupts forwarded to %pOF\n", node, NUM_IRQS, + parent); + + return 0; + +out_unmap: + iounmap(data->base); +out_free: + kfree(data); + return err; +} +IRQCHIP_DECLARE(exiu, "socionext,synquacer-exiu", exiu_init); From e846d13958066828a9483d862cc8370a72fadbb6 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Thu, 2 Nov 2017 09:18:21 +0800 Subject: [PATCH 0965/1085] kprobes, x86/alternatives: Use text_mutex to protect smp_alt_modules We use alternatives_text_reserved() to check if the address is in the fixed pieces of alternative reserved, but the problem is that we don't hold the smp_alt mutex when call this function. So the list traversal may encounter a deleted list_head if another path is doing alternatives_smp_module_del(). One solution is that we can hold smp_alt mutex before call this function, but the difficult point is that the callers of this functions, arch_prepare_kprobe() and arch_prepare_optimized_kprobe(), are called inside the text_mutex. So we must hold smp_alt mutex before we go into these arch dependent code. But we can't now, the smp_alt mutex is the arch dependent part, only x86 has it. Maybe we can export another arch dependent callback to solve this. But there is a simpler way to handle this problem. We can reuse the text_mutex to protect smp_alt_modules instead of using another mutex. And all the arch dependent checks of kprobes are inside the text_mutex, so it's safe now. Signed-off-by: Zhou Chengming Reviewed-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Fixes: 2cfa197 "ftrace/alternatives: Introducing *_text_reserved functions" Link: http://lkml.kernel.org/r/1509585501-79466-1-git-send-email-zhouchengming1@huawei.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 26 +++++++++++++------------- kernel/extable.c | 2 ++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3344d3382e91..dbaf14d69ebd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -442,7 +442,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, { const s32 *poff; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -452,7 +451,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); } - mutex_unlock(&text_mutex); } static void alternatives_smp_unlock(const s32 *start, const s32 *end, @@ -460,7 +458,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, { const s32 *poff; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -470,7 +467,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); } - mutex_unlock(&text_mutex); } struct smp_alt_module { @@ -489,8 +485,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_MUTEX(smp_alt); -static bool uniproc_patched = false; /* protected by smp_alt */ +static bool uniproc_patched = false; /* protected by text_mutex */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, @@ -499,7 +494,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, { struct smp_alt_module *smp; - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); if (!uniproc_patched) goto unlock; @@ -526,14 +521,14 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp_unlock: alternatives_smp_unlock(locks, locks_end, text, text_end); unlock: - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; @@ -541,7 +536,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) kfree(item); break; } - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } void alternatives_enable_smp(void) @@ -551,7 +546,7 @@ void alternatives_enable_smp(void) /* Why bother if there are no other CPUs? */ BUG_ON(num_possible_cpus() == 1); - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); if (uniproc_patched) { pr_info("switching to SMP code\n"); @@ -563,10 +558,13 @@ void alternatives_enable_smp(void) mod->text, mod->text_end); uniproc_patched = false; } - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } -/* Return 1 if the address range is reserved for smp-alternatives */ +/* + * Return 1 if the address range is reserved for SMP-alternatives. + * Must hold text_mutex. 
+ */ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; @@ -574,6 +572,8 @@ int alternatives_text_reserved(void *start, void *end) u8 *text_start = start; u8 *text_end = end; + lockdep_assert_held(&text_mutex); + list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; diff --git a/kernel/extable.c b/kernel/extable.c index 9aa1cc41ecf7..a17fdb63dc3e 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -31,6 +31,8 @@ * mutex protecting text section modification (dynamic code patching). * some users need to sleep (allocating memory...) while they hold this lock. * + * Note: Also protects SMP-alternatives modification on x86. + * * NOT exported to modules - patching kernel text is a really delicate matter. */ DEFINE_MUTEX(text_mutex); From f791dd2589d7e217625d7e411621a5bac71cbf69 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 3 Nov 2017 18:59:48 +0800 Subject: [PATCH 0966/1085] locking/rwlocks: Fix comments - fix the list of locking API headers in kernel/locking/spinlock.c - fix an #endif comment Signed-off-by: Cheng Jian Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: huawei.libin@huawei.com Cc: xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1509706788-152547-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar --- include/linux/rwlock_api_smp.h | 2 +- kernel/locking/spinlock.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 5b9b84b20407..86ebb4bf9c6e 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -211,7 +211,7 @@ static inline void __raw_write_lock(rwlock_t *lock) LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } -#endif /* CONFIG_PREEMPT */ +#endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ static inline void __raw_write_unlock(rwlock_t *lock) { diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index b96343374a87..1fd1a7543cdd 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -30,7 +30,8 @@ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) /* * The __lock_function inlines are taken from - * include/linux/spinlock_api_smp.h + * spinlock : include/linux/spinlock_api_smp.h + * rwlock : include/linux/rwlock_api_smp.h */ #else From 0e7d780721bd259d061ac0c1076e40e326d1b32b Mon Sep 17 00:00:00 2001 From: Ludovic Barre Date: Mon, 6 Nov 2017 18:03:31 +0100 Subject: [PATCH 0967/1085] irqchip/stm32: Select GENERIC_IRQ_CHIP This patch adds GENERIC_IRQ_CHIP to stm32 exti config. Signed-off-by: Ludovic Barre Signed-off-by: Marc Zyngier --- drivers/irqchip/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 1cb39405d7f8..ccfb14de57e4 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -306,6 +306,7 @@ config EZNPS_GIC config STM32_EXTI bool select IRQ_DOMAIN + select GENERIC_IRQ_CHIP config QCOM_IRQ_COMBINER bool "QCOM IRQ combiner support" From 6dd64ee17e04c3949f21630b6426f149252ff8b1 Mon Sep 17 00:00:00 2001 From: Ludovic Barre Date: Mon, 6 Nov 2017 18:03:32 +0100 Subject: [PATCH 0968/1085] irqchip/stm32: Add multi-bank management -Prepare to manage multi-bank of external interrupts (N banks of 32 inputs). -Prepare to manage registers offsets by compatible (registers offsets could be different follow per stm32 platform). 
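With N banks of 32 inputs, a global hwirq now decomposes into a bank index plus a bit position inside that bank. A small sketch of the arithmetic (the helper names are illustrative; the diff below performs the same split inline, e.g. data->hwirq % IRQS_PER_BANK):

#include <linux/irqdomain.h>

#define IRQS_PER_BANK	32

static inline unsigned int exti_bank_of(irq_hw_number_t hwirq)
{
	return hwirq / IRQS_PER_BANK;	/* which register bank */
}

static inline unsigned int exti_pin_of(irq_hw_number_t hwirq)
{
	return hwirq % IRQS_PER_BANK;	/* bit within that bank */
}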
Signed-off-by: Ludovic Barre Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-stm32-exti.c | 149 +++++++++++++++++++++---------- 1 file changed, 103 insertions(+), 46 deletions(-) diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c index 45363ff8d06f..ef378709baea 100644 --- a/drivers/irqchip/irq-stm32-exti.c +++ b/drivers/irqchip/irq-stm32-exti.c @@ -14,27 +14,66 @@ #include #include -#define EXTI_IMR 0x0 -#define EXTI_EMR 0x4 -#define EXTI_RTSR 0x8 -#define EXTI_FTSR 0xc -#define EXTI_SWIER 0x10 -#define EXTI_PR 0x14 +#define IRQS_PER_BANK 32 + +struct stm32_exti_bank { + u32 imr_ofst; + u32 emr_ofst; + u32 rtsr_ofst; + u32 ftsr_ofst; + u32 swier_ofst; + u32 pr_ofst; +}; + +static const struct stm32_exti_bank stm32f4xx_exti_b1 = { + .imr_ofst = 0x00, + .emr_ofst = 0x04, + .rtsr_ofst = 0x08, + .ftsr_ofst = 0x0C, + .swier_ofst = 0x10, + .pr_ofst = 0x14, +}; + +static const struct stm32_exti_bank *stm32f4xx_exti_banks[] = { + &stm32f4xx_exti_b1, +}; + +static unsigned long stm32_exti_pending(struct irq_chip_generic *gc) +{ + const struct stm32_exti_bank *stm32_bank = gc->private; + + return irq_reg_readl(gc, stm32_bank->pr_ofst); +} + +static void stm32_exti_irq_ack(struct irq_chip_generic *gc, u32 mask) +{ + const struct stm32_exti_bank *stm32_bank = gc->private; + + irq_reg_writel(gc, mask, stm32_bank->pr_ofst); +} static void stm32_irq_handler(struct irq_desc *desc) { struct irq_domain *domain = irq_desc_get_handler_data(desc); - struct irq_chip_generic *gc = domain->gc->gc[0]; struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned int virq, nbanks = domain->gc->num_chips; + struct irq_chip_generic *gc; + const struct stm32_exti_bank *stm32_bank; unsigned long pending; - int n; + int n, i, irq_base = 0; chained_irq_enter(chip, desc); - while ((pending = irq_reg_readl(gc, EXTI_PR))) { - for_each_set_bit(n, &pending, BITS_PER_LONG) { - generic_handle_irq(irq_find_mapping(domain, n)); - irq_reg_writel(gc, BIT(n), EXTI_PR); + for (i = 0; i < nbanks; i++, irq_base += IRQS_PER_BANK) { + gc = irq_get_domain_generic_chip(domain, irq_base); + stm32_bank = gc->private; + + while ((pending = stm32_exti_pending(gc))) { + for_each_set_bit(n, &pending, IRQS_PER_BANK) { + virq = irq_find_mapping(domain, irq_base + n); + generic_handle_irq(virq); + stm32_exti_irq_ack(gc, BIT(n)); + } } } @@ -44,13 +83,14 @@ static void stm32_irq_handler(struct irq_desc *desc) static int stm32_irq_set_type(struct irq_data *data, unsigned int type) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); - int pin = data->hwirq; + const struct stm32_exti_bank *stm32_bank = gc->private; + int pin = data->hwirq % IRQS_PER_BANK; u32 rtsr, ftsr; irq_gc_lock(gc); - rtsr = irq_reg_readl(gc, EXTI_RTSR); - ftsr = irq_reg_readl(gc, EXTI_FTSR); + rtsr = irq_reg_readl(gc, stm32_bank->rtsr_ofst); + ftsr = irq_reg_readl(gc, stm32_bank->ftsr_ofst); switch (type) { case IRQ_TYPE_EDGE_RISING: @@ -70,8 +110,8 @@ static int stm32_irq_set_type(struct irq_data *data, unsigned int type) return -EINVAL; } - irq_reg_writel(gc, rtsr, EXTI_RTSR); - irq_reg_writel(gc, ftsr, EXTI_FTSR); + irq_reg_writel(gc, rtsr, stm32_bank->rtsr_ofst); + irq_reg_writel(gc, ftsr, stm32_bank->ftsr_ofst); irq_gc_unlock(gc); @@ -81,17 +121,18 @@ static int stm32_irq_set_type(struct irq_data *data, unsigned int type) static int stm32_irq_set_wake(struct irq_data *data, unsigned int on) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); - int pin = data->hwirq; + const struct stm32_exti_bank *stm32_bank = 
gc->private; + int pin = data->hwirq % IRQS_PER_BANK; u32 emr; irq_gc_lock(gc); - emr = irq_reg_readl(gc, EXTI_EMR); + emr = irq_reg_readl(gc, stm32_bank->emr_ofst); if (on) emr |= BIT(pin); else emr &= ~BIT(pin); - irq_reg_writel(gc, emr, EXTI_EMR); + irq_reg_writel(gc, emr, stm32_bank->emr_ofst); irq_gc_unlock(gc); @@ -101,11 +142,12 @@ static int stm32_irq_set_wake(struct irq_data *data, unsigned int on) static int stm32_exti_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs, void *data) { - struct irq_chip_generic *gc = d->gc->gc[0]; + struct irq_chip_generic *gc; struct irq_fwspec *fwspec = data; irq_hw_number_t hwirq; hwirq = fwspec->param[0]; + gc = irq_get_domain_generic_chip(d, hwirq); irq_map_generic_chip(d, virq, hwirq); irq_domain_set_info(d, virq, hwirq, &gc->chip_types->chip, gc, @@ -129,8 +171,9 @@ struct irq_domain_ops irq_exti_domain_ops = { .free = stm32_exti_free, }; -static int __init stm32_exti_init(struct device_node *node, - struct device_node *parent) +static int +__init stm32_exti_init(const struct stm32_exti_bank **stm32_exti_banks, + int bank_nr, struct device_node *node) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; int nr_irqs, nr_exti, ret, i; @@ -144,23 +187,16 @@ static int __init stm32_exti_init(struct device_node *node, return -ENOMEM; } - /* Determine number of irqs supported */ - writel_relaxed(~0UL, base + EXTI_RTSR); - nr_exti = fls(readl_relaxed(base + EXTI_RTSR)); - writel_relaxed(0, base + EXTI_RTSR); - - pr_info("%pOF: %d External IRQs detected\n", node, nr_exti); - - domain = irq_domain_add_linear(node, nr_exti, + domain = irq_domain_add_linear(node, bank_nr * IRQS_PER_BANK, &irq_exti_domain_ops, NULL); if (!domain) { pr_err("%s: Could not register interrupt domain.\n", - node->name); + node->name); ret = -ENOMEM; goto out_unmap; } - ret = irq_alloc_domain_generic_chips(domain, nr_exti, 1, "exti", + ret = irq_alloc_domain_generic_chips(domain, IRQS_PER_BANK, 1, "exti", handle_edge_irq, clr, 0, 0); if (ret) { pr_err("%pOF: Could not allocate generic interrupt chip.\n", @@ -168,18 +204,32 @@ static int __init stm32_exti_init(struct device_node *node, goto out_free_domain; } - gc = domain->gc->gc[0]; - gc->reg_base = base; - gc->chip_types->type = IRQ_TYPE_EDGE_BOTH; - gc->chip_types->chip.name = gc->chip_types[0].chip.name; - gc->chip_types->chip.irq_ack = irq_gc_ack_set_bit; - gc->chip_types->chip.irq_mask = irq_gc_mask_clr_bit; - gc->chip_types->chip.irq_unmask = irq_gc_mask_set_bit; - gc->chip_types->chip.irq_set_type = stm32_irq_set_type; - gc->chip_types->chip.irq_set_wake = stm32_irq_set_wake; - gc->chip_types->regs.ack = EXTI_PR; - gc->chip_types->regs.mask = EXTI_IMR; - gc->chip_types->handler = handle_edge_irq; + for (i = 0; i < bank_nr; i++) { + const struct stm32_exti_bank *stm32_bank = stm32_exti_banks[i]; + u32 irqs_mask; + + gc = irq_get_domain_generic_chip(domain, i * IRQS_PER_BANK); + + gc->reg_base = base; + gc->chip_types->type = IRQ_TYPE_EDGE_BOTH; + gc->chip_types->chip.irq_ack = irq_gc_ack_set_bit; + gc->chip_types->chip.irq_mask = irq_gc_mask_clr_bit; + gc->chip_types->chip.irq_unmask = irq_gc_mask_set_bit; + gc->chip_types->chip.irq_set_type = stm32_irq_set_type; + gc->chip_types->chip.irq_set_wake = stm32_irq_set_wake; + gc->chip_types->regs.ack = stm32_bank->pr_ofst; + gc->chip_types->regs.mask = stm32_bank->imr_ofst; + gc->private = (void *)stm32_bank; + + /* Determine number of irqs supported */ + writel_relaxed(~0UL, base + stm32_bank->rtsr_ofst); + irqs_mask = readl_relaxed(base + 
stm32_bank->rtsr_ofst); + nr_exti = fls(readl_relaxed(base + stm32_bank->rtsr_ofst)); + writel_relaxed(0, base + stm32_bank->rtsr_ofst); + + pr_info("%s: bank%d, External IRQs available:%#x\n", + node->full_name, i, irqs_mask); + } nr_irqs = of_irq_count(node); for (i = 0; i < nr_irqs; i++) { @@ -198,4 +248,11 @@ out_unmap: return ret; } -IRQCHIP_DECLARE(stm32_exti, "st,stm32-exti", stm32_exti_init); +static int __init stm32f4_exti_of_init(struct device_node *np, + struct device_node *parent) +{ + return stm32_exti_init(stm32f4xx_exti_banks, + ARRAY_SIZE(stm32f4xx_exti_banks), np); +} + +IRQCHIP_DECLARE(stm32f4_exti, "st,stm32-exti", stm32f4_exti_of_init); From ce0b7e39c5a0c65bdce6fc36bba2991ea8f915b7 Mon Sep 17 00:00:00 2001 From: Ludovic Barre Date: Mon, 6 Nov 2017 18:03:33 +0100 Subject: [PATCH 0969/1085] dt-bindings/interrupt-controllers: Add compatible string for stm32h7 This patch updates stm32-exti documentation with stm32h7-exti compatible string. Acked-by: Rob Herring Signed-off-by: Ludovic Barre Signed-off-by: Marc Zyngier --- .../bindings/interrupt-controller/st,stm32-exti.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt index 6e7703d4ff5b..edf03f09244b 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt @@ -2,7 +2,9 @@ STM32 External Interrupt Controller Required properties: -- compatible: Should be "st,stm32-exti" +- compatible: Should be: + "st,stm32-exti" + "st,stm32h7-exti" - reg: Specifies base physical address and size of the registers - interrupt-controller: Indentifies the node as an interrupt controller - #interrupt-cells: Specifies the number of cells to encode an interrupt From 539c603e147c1566f90623d863fa0d64ecb6c89d Mon Sep 17 00:00:00 2001 From: Ludovic Barre Date: Mon, 6 Nov 2017 18:03:34 +0100 Subject: [PATCH 0970/1085] irqchip/stm32: Add stm32h7 support stm32h7 has up to 96 inputs (3 banks of 32 inputs max). 
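With banks of 32 inputs, a hardware interrupt number decomposes into a bank index and a bit position; hwirq 70, for instance, is bit 6 of bank 2. A sketch of the arithmetic the driver performs with IRQS_PER_BANK (the helper name is illustrative):

    /* Sketch: locate a hwirq (0..95 on stm32h7) within 3 banks of 32. */
    static void exti_locate(unsigned long hwirq,
                            unsigned int *bank, unsigned int *pin)
    {
            *bank = hwirq / 32;     /* which stm32_exti_bank table entry */
            *pin  = hwirq % 32;     /* bit within that bank's registers */
    }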
Signed-off-by: Ludovic Barre Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-stm32-exti.c | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c index ef378709baea..8f409a95918f 100644 --- a/drivers/irqchip/irq-stm32-exti.c +++ b/drivers/irqchip/irq-stm32-exti.c @@ -38,6 +38,39 @@ static const struct stm32_exti_bank *stm32f4xx_exti_banks[] = { &stm32f4xx_exti_b1, }; +static const struct stm32_exti_bank stm32h7xx_exti_b1 = { + .imr_ofst = 0x80, + .emr_ofst = 0x84, + .rtsr_ofst = 0x00, + .ftsr_ofst = 0x04, + .swier_ofst = 0x08, + .pr_ofst = 0x88, +}; + +static const struct stm32_exti_bank stm32h7xx_exti_b2 = { + .imr_ofst = 0x90, + .emr_ofst = 0x94, + .rtsr_ofst = 0x20, + .ftsr_ofst = 0x24, + .swier_ofst = 0x28, + .pr_ofst = 0x98, +}; + +static const struct stm32_exti_bank stm32h7xx_exti_b3 = { + .imr_ofst = 0xA0, + .emr_ofst = 0xA4, + .rtsr_ofst = 0x40, + .ftsr_ofst = 0x44, + .swier_ofst = 0x48, + .pr_ofst = 0xA8, +}; + +static const struct stm32_exti_bank *stm32h7xx_exti_banks[] = { + &stm32h7xx_exti_b1, + &stm32h7xx_exti_b2, + &stm32h7xx_exti_b3, +}; + static unsigned long stm32_exti_pending(struct irq_chip_generic *gc) { const struct stm32_exti_bank *stm32_bank = gc->private; @@ -256,3 +289,12 @@ static int __init stm32f4_exti_of_init(struct device_node *np, } IRQCHIP_DECLARE(stm32f4_exti, "st,stm32-exti", stm32f4_exti_of_init); + +static int __init stm32h7_exti_of_init(struct device_node *np, + struct device_node *parent) +{ + return stm32_exti_init(stm32h7xx_exti_banks, + ARRAY_SIZE(stm32h7xx_exti_banks), np); +} + +IRQCHIP_DECLARE(stm32h7_exti, "st,stm32h7-exti", stm32h7_exti_of_init); From 2ca6b9bcaad4a78af255be9e16b3f3062a6d7c32 Mon Sep 17 00:00:00 2001 From: Ludovic Barre Date: Mon, 6 Nov 2017 18:03:35 +0100 Subject: [PATCH 0971/1085] irqchip/stm32: Fix initial values -After a cold boot, the imr default value depends on the hardware configuration. -After a hot reboot, the registers must be cleared to avoid residue. Signed-off-by: Ludovic Barre Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-stm32-exti.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c index 8f409a95918f..477d0faf0ea2 100644 --- a/drivers/irqchip/irq-stm32-exti.c +++ b/drivers/irqchip/irq-stm32-exti.c @@ -258,7 +258,16 @@ __init stm32_exti_init(const struct stm32_exti_bank **stm32_exti_banks, writel_relaxed(~0UL, base + stm32_bank->rtsr_ofst); irqs_mask = readl_relaxed(base + stm32_bank->rtsr_ofst); nr_exti = fls(readl_relaxed(base + stm32_bank->rtsr_ofst)); + + /* + * This IP has no reset, so after hot reboot we should + * clear registers to avoid residue + */ + writel_relaxed(0, base + stm32_bank->imr_ofst); + writel_relaxed(0, base + stm32_bank->emr_ofst); writel_relaxed(0, base + stm32_bank->rtsr_ofst); + writel_relaxed(0, base + stm32_bank->ftsr_ofst); + writel_relaxed(~0UL, base + stm32_bank->pr_ofst); pr_info("%s: bank%d, External IRQs available:%#x\n", node->full_name, i, irqs_mask); From 90af7c254ffb94b0df47bf6f613e3f3173df05d1 Mon Sep 17 00:00:00 2001 From: Ludovic Barre Date: Mon, 6 Nov 2017 18:03:36 +0100 Subject: [PATCH 0972/1085] irqchip/stm32: Move the wakeup on interrupt mask Make irq_set_wake() operate on the interrupt mask register: it is needed to wake up from low power mode, which the event mask is not able to do.
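The change boils down to pointing the same locked read-modify-write at the interrupt mask register instead of the event mask register, condensed from the hunk below:

    /* Set or clear the wake source bit in IMR under the chip lock. */
    irq_gc_lock(gc);
    imr = irq_reg_readl(gc, stm32_bank->imr_ofst);
    if (on)
            imr |= BIT(pin);
    else
            imr &= ~BIT(pin);
    irq_reg_writel(gc, imr, stm32_bank->imr_ofst);
    irq_gc_unlock(gc);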
Signed-off-by: Ludovic Barre Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-stm32-exti.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c index 477d0faf0ea2..31ab0dee2ce7 100644 --- a/drivers/irqchip/irq-stm32-exti.c +++ b/drivers/irqchip/irq-stm32-exti.c @@ -156,16 +156,16 @@ static int stm32_irq_set_wake(struct irq_data *data, unsigned int on) struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); const struct stm32_exti_bank *stm32_bank = gc->private; int pin = data->hwirq % IRQS_PER_BANK; - u32 emr; + u32 imr; irq_gc_lock(gc); - emr = irq_reg_readl(gc, stm32_bank->emr_ofst); + imr = irq_reg_readl(gc, stm32_bank->imr_ofst); if (on) - emr |= BIT(pin); + imr |= BIT(pin); else - emr &= ~BIT(pin); - irq_reg_writel(gc, emr, stm32_bank->emr_ofst); + imr &= ~BIT(pin); + irq_reg_writel(gc, imr, stm32_bank->imr_ofst); irq_gc_unlock(gc); From 1f27ddf0b50b45eaf0f95565125cf10f9c821746 Mon Sep 17 00:00:00 2001 From: Masaharu Hayakawa Date: Fri, 3 Nov 2017 10:36:28 +0100 Subject: [PATCH 0973/1085] mmc: tmio: Replace msleep() of 20ms or less with usleep_range() As documented in Documentation/timers/timers-howto.txt as follows, replace msleep() with usleep_range(). msleep(1~20) may not do what the caller intends, and will often sleep longer (~20 ms actual sleep for any value given in the 1~20ms range). In many cases this is not the desired behavior. Signed-off-by: Masaharu Hayakawa Signed-off-by: Simon Horman Acked-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 4c8198f8b04a..583bf3262df5 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -167,11 +167,11 @@ static void tmio_mmc_clk_start(struct tmio_mmc_host *host) /* HW engineers overrode docs: no sleep needed on R-Car2+ */ if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2)) - msleep(10); + usleep_range(10000, 11000); if (host->pdata->flags & TMIO_MMC_HAVE_HIGH_REG) { sd_ctrl_write16(host, CTL_CLK_AND_WAIT_CTL, 0x0100); - msleep(10); + usleep_range(10000, 11000); } } @@ -179,7 +179,7 @@ static void tmio_mmc_clk_stop(struct tmio_mmc_host *host) { if (host->pdata->flags & TMIO_MMC_HAVE_HIGH_REG) { sd_ctrl_write16(host, CTL_CLK_AND_WAIT_CTL, 0x0000); - msleep(10); + usleep_range(10000, 11000); } sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, ~CLK_CTL_SCLKEN & @@ -187,7 +187,7 @@ static void tmio_mmc_clk_stop(struct tmio_mmc_host *host) /* HW engineers overrode docs: no sleep needed on R-Car2+ */ if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2)) - msleep(10); + usleep_range(10000, 11000); } static void tmio_mmc_set_clock(struct tmio_mmc_host *host, @@ -219,7 +219,7 @@ static void tmio_mmc_set_clock(struct tmio_mmc_host *host, sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, clk & CLK_CTL_DIV_MASK); if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2)) - msleep(10); + usleep_range(10000, 11000); tmio_mmc_clk_start(host); } @@ -230,11 +230,11 @@ static void tmio_mmc_reset(struct tmio_mmc_host *host) sd_ctrl_write16(host, CTL_RESET_SD, 0x0000); if (host->pdata->flags & TMIO_MMC_HAVE_HIGH_REG) sd_ctrl_write16(host, CTL_RESET_SDIO, 0x0000); - msleep(10); + usleep_range(10000, 11000); sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); if (host->pdata->flags & TMIO_MMC_HAVE_HIGH_REG) sd_ctrl_write16(host, 
CTL_RESET_SDIO, 0x0001); - msleep(10); + usleep_range(10000, 11000); if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, host->sdio_irq_mask); From 167e5b6f98d27901c7a00b566938ce1973af4395 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 6 Nov 2017 15:29:22 +0000 Subject: [PATCH 0974/1085] dt-bindings: sdhci-fujitsu: document cmd-dat-delay property Document a new boolean property of the sdhci-fujitsu binding that indicates whether the CMD_DAT_DELAY bit needs to be set in the F_SDH30_ESD_CONTROL register. Acked-by: Rob Herring Signed-off-by: Ard Biesheuvel Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/sdhci-fujitsu.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/sdhci-fujitsu.txt b/Documentation/devicetree/bindings/mmc/sdhci-fujitsu.txt index de2c53cff4f1..3ee9263adf73 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-fujitsu.txt +++ b/Documentation/devicetree/bindings/mmc/sdhci-fujitsu.txt @@ -15,6 +15,8 @@ Required properties: Optional properties: - vqmmc-supply: phandle to the regulator device tree node, mentioned as the VCCQ/VDD_IO supply in the eMMC/SD specs. +- fujitsu,cmd-dat-delay-select: boolean property indicating that this host + requires the CMD_DAT_DELAY control to be enabled. Example: From 06641e8deae68ee2769c734158bc9170be257bb9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 6 Nov 2017 15:29:23 +0000 Subject: [PATCH 0975/1085] sdhci-fujitsu: add support for setting the CMD_DAT_DELAY attribute The Socionext SynQuacer SoC inherits this IP from Fujitsu, but requires the F_SDH30_CMD_DAT_DELAY bit to be set in the F_SDH30_ESD_CONTROL control register. So set this bit if the DT node has the 'fujitsu,cmd-dat-delay-select' property. 
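The mechanics are straightforward: latch the property once at probe time, then re-assert the control bit whenever the controller is reset. Condensed from the hunks below:

    /* At probe: remember whether the DT requests the extra delay. */
    priv->enable_cmd_dat_delay =
            device_property_read_bool(dev, "fujitsu,cmd-dat-delay-select");

    /* In the reset hook: re-apply the bit if requested. */
    if (priv->enable_cmd_dat_delay) {
            ctl = sdhci_readl(host, F_SDH30_ESD_CONTROL);
            ctl |= F_SDH30_CMD_DAT_DELAY;
            sdhci_writel(host, ctl, F_SDH30_ESD_CONTROL);
    }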
Signed-off-by: Ard Biesheuvel Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_f_sdh30.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/mmc/host/sdhci_f_sdh30.c b/drivers/mmc/host/sdhci_f_sdh30.c index 111b66f5439b..04ca0d33a521 100644 --- a/drivers/mmc/host/sdhci_f_sdh30.c +++ b/drivers/mmc/host/sdhci_f_sdh30.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "sdhci-pltfm.h" @@ -47,6 +48,7 @@ struct f_sdhost_priv { struct clk *clk; u32 vendor_hs200; struct device *dev; + bool enable_cmd_dat_delay; }; static void sdhci_f_sdh30_soft_voltage_switch(struct sdhci_host *host) @@ -84,10 +86,19 @@ static unsigned int sdhci_f_sdh30_get_min_clock(struct sdhci_host *host) static void sdhci_f_sdh30_reset(struct sdhci_host *host, u8 mask) { + struct f_sdhost_priv *priv = sdhci_priv(host); + u32 ctl; + if (sdhci_readw(host, SDHCI_CLOCK_CONTROL) == 0) sdhci_writew(host, 0xBC01, SDHCI_CLOCK_CONTROL); sdhci_reset(host, mask); + + if (priv->enable_cmd_dat_delay) { + ctl = sdhci_readl(host, F_SDH30_ESD_CONTROL); + ctl |= F_SDH30_CMD_DAT_DELAY; + sdhci_writel(host, ctl, F_SDH30_ESD_CONTROL); + } } static const struct sdhci_ops sdhci_f_sdh30_ops = { @@ -126,6 +137,9 @@ static int sdhci_f_sdh30_probe(struct platform_device *pdev) host->quirks2 = SDHCI_QUIRK2_SUPPORT_SINGLE | SDHCI_QUIRK2_TUNING_WORK_AROUND; + priv->enable_cmd_dat_delay = device_property_read_bool(dev, + "fujitsu,cmd-dat-delay-select"); + ret = mmc_of_parse(host->mmc); if (ret) goto err; From 33e63acc119d15c2fac3e3775f32d1ce7a01021b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 20 Oct 2017 09:30:43 -0500 Subject: [PATCH 0976/1085] Documentation/x86: Add AMD Secure Encrypted Virtualization (SEV) description Update the AMD memory encryption document describing the Secure Encrypted Virtualization (SEV) feature. Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Tom Lendacky Cc: kvm@vger.kernel.org Cc: Jonathan Corbet Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171020143059.3291-2-brijesh.singh@amd.com --- Documentation/x86/amd-memory-encryption.txt | 30 ++++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt index f512ab718541..afc41f544dab 100644 --- a/Documentation/x86/amd-memory-encryption.txt +++ b/Documentation/x86/amd-memory-encryption.txt @@ -1,4 +1,5 @@ -Secure Memory Encryption (SME) is a feature found on AMD processors. +Secure Memory Encryption (SME) and Secure Encrypted Virtualization (SEV) are +features found on AMD processors. SME provides the ability to mark individual pages of memory as encrypted using the standard x86 page tables. A page that is marked encrypted will be @@ -6,24 +7,38 @@ automatically decrypted when read from DRAM and encrypted when written to DRAM. SME can therefore be used to protect the contents of DRAM from physical attacks on the system. +SEV enables running encrypted virtual machines (VMs) in which the code and data +of the guest VM are secured so that a decrypted version is available only +within the VM itself. SEV guest VMs have the concept of private and shared +memory. Private memory is encrypted with the guest-specific key, while shared +memory may be encrypted with hypervisor key. When SME is enabled, the hypervisor +key is the same key which is used in SME. 
+ A page is encrypted when a page table entry has the encryption bit set (see below on how to determine its position). The encryption bit can also be specified in the cr3 register, allowing the PGD table to be encrypted. Each successive level of page tables can also be encrypted by setting the encryption bit in the page table entry that points to the next table. This allows the full page table hierarchy to be encrypted. Note, this means that just because the -encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted. +encryption bit is set in cr3, doesn't imply the full hierarchy is encrypted. Each page table entry in the hierarchy needs to have the encryption bit set to achieve that. So, theoretically, you could have the encryption bit set in cr3 so that the PGD is encrypted, but not set the encryption bit in the PGD entry for a PUD which results in the PUD pointed to by that entry to not be encrypted. -Support for SME can be determined through the CPUID instruction. The CPUID -function 0x8000001f reports information related to SME: +When SEV is enabled, instruction pages and guest page tables are always treated +as private. All the DMA operations inside the guest must be performed on shared +memory. Since the memory encryption bit is controlled by the guest OS when it +is operating in 64-bit or 32-bit PAE mode, in all other modes the SEV hardware +forces the memory encryption bit to 1. + +Support for SME and SEV can be determined through the CPUID instruction. The +CPUID function 0x8000001f reports information related to SME: 0x8000001f[eax]: Bit[0] indicates support for SME + Bit[1] indicates support for SEV 0x8000001f[ebx]: Bits[5:0] pagetable bit number used to activate memory encryption @@ -39,6 +54,13 @@ determine if SME is enabled and/or to enable memory encryption: Bit[23] 0 = memory encryption features are disabled 1 = memory encryption features are enabled +If SEV is supported, MSR 0xc0010131 (MSR_AMD64_SEV) can be used to determine if +SEV is active: + + 0xc0010131: + Bit[0] 0 = memory encryption is not active + 1 = memory encryption is active + Linux relies on BIOS to set this bit if BIOS has determined that the reduction in the physical address space as a result of enabling memory encryption (see CPUID information above) will not conflict with the address space resource From d8aa7eea78a1401cce39b3bb61ead0150044a3df Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:44 -0500 Subject: [PATCH 0977/1085] x86/mm: Add Secure Encrypted Virtualization (SEV) support Provide support for Secure Encrypted Virtualization (SEV). This initial support defines a flag that is used by the kernel to determine if it is running with SEV active. 
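Subsequent patches in this series key their behaviour off three predicates; a sketch of the intended usage (the examples in the comments come from the later patches):

    if (mem_encrypt_active()) {
            /* either SME or SEV is in effect */
    }

    if (sme_active()) {
            /* SME on a host, not a SEV guest: e.g. encrypt the initrd */
    }

    if (sev_active()) {
            /* running as a SEV guest: e.g. force SWIOTLB bounce buffers */
    }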
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Borislav Petkov Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20171020143059.3291-3-brijesh.singh@amd.com --- arch/x86/include/asm/mem_encrypt.h | 6 ++++++ arch/x86/mm/mem_encrypt.c | 26 ++++++++++++++++++++++++++ include/linux/mem_encrypt.h | 7 +++++-- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 6a77c63540f7..2b024741bce9 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -47,6 +47,9 @@ void __init mem_encrypt_init(void); void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); +bool sme_active(void); +bool sev_active(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -64,6 +67,9 @@ static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(void) { } static inline void __init sme_enable(struct boot_params *bp) { } +static inline bool sme_active(void) { return false; } +static inline bool sev_active(void) { return false; } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 16c5f37933a2..add836d3d174 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -42,6 +42,8 @@ static char sme_cmdline_off[] __initdata = "off"; u64 sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +static bool sev_enabled __section(.data); + /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); @@ -192,6 +194,30 @@ void __init sme_early_init(void) protection_map[i] = pgprot_encrypted(protection_map[i]); } +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * sme_active() and sev_active() functions are used for this. When a + * distinction isn't needed, the mem_encrypt_active() function can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. 
+ */ bool sme_active(void) { return sme_me_mask && !sev_enabled; } +EXPORT_SYMBOL_GPL(sme_active); + +bool sev_active(void) +{ + return sme_me_mask && sev_enabled; +} +EXPORT_SYMBOL_GPL(sev_active); + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 265a9cd21cb4..b310a9c18113 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -23,11 +23,14 @@ #define sme_me_mask 0ULL +static inline bool sme_active(void) { return false; } +static inline bool sev_active(void) { return false; } + #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ -static inline bool sme_active(void) +static inline bool mem_encrypt_active(void) { - return !!sme_me_mask; + return sme_me_mask; } static inline u64 sme_get_me_mask(void) From 682af54399b6111730aec0be63e5f6a3a3359a76 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:45 -0500 Subject: [PATCH 0978/1085] x86/mm: Don't attempt to encrypt initrd under SEV When SEV is active, the initrd/initramfs will already have been placed in memory encrypted, so do not try to encrypt it. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Borislav Petkov Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20171020143059.3291-4-brijesh.singh@amd.com --- arch/x86/kernel/setup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0957dd73d127..507100a72eb3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -380,9 +380,11 @@ static void __init reserve_initrd(void) * If SME is active, this memory will be marked encrypted by the * kernel when it is accessed (including relocation). However, the * ramdisk image was loaded decrypted by the bootloader, so make - * sure that it is encrypted before accessing it. + * sure that it is encrypted before accessing it. For SEV the + * ramdisk will already be encrypted, so only do this for SME. */ - sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); + if (sme_active()) + sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); initrd_start = 0; From fcdcd6cdd98ff4d0cf876f863024a2fb0b491d41 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:46 -0500 Subject: [PATCH 0979/1085] x86/realmode: Don't decrypt trampoline area under SEV When SEV is active, the trampoline area will need to be in encrypted memory, so only mark the area decrypted if SME is active. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Laura Abbott Cc: kvm@vger.kernel.org Cc: Borislav Petkov Cc: Andy Lutomirski Cc: "Kirill A. Shutemov" Link: https://lkml.kernel.org/r/20171020143059.3291-5-brijesh.singh@amd.com --- arch/x86/realmode/init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index ed84d3917a59..d10105825d57 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -64,9 +64,10 @@ static void __init setup_real_mode(void) /* * If SME is active, the trampoline area will need to be in * decrypted memory in order to bring up other processors - * successfully. + * successfully. This is not needed for SEV.
*/ - set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); + if (sme_active()) + set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); memcpy(base, real_mode_blob, size); From 072f58c6ce29cf6cf429480fcd1b1e87d1d5ed18 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:47 -0500 Subject: [PATCH 0980/1085] x86/mm: Use encrypted access of boot related data with SEV When Secure Encrypted Virtualization (SEV) is active, boot data (such as EFI related data, setup data) is encrypted and needs to be accessed as such when mapped. Update the architecture override in early_memremap to keep the encryption attribute when mapping this data. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Laura Abbott Cc: kvm@vger.kernel.org Cc: Matt Fleming Cc: Borislav Petkov Cc: Andy Lutomirski Cc: "Kirill A. Shutemov" Link: https://lkml.kernel.org/r/20171020143059.3291-6-brijesh.singh@amd.com --- arch/x86/mm/ioremap.c | 44 +++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 34f0e1847dd6..52cc0f4ed494 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -422,6 +422,9 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) * areas should be mapped decrypted. And since the encryption key can * change across reboots, persistent memory should also be mapped * decrypted. + * + * If SEV is active, that implies that BIOS/UEFI also ran encrypted so + * only persistent memory should be mapped decrypted. */ static bool memremap_should_map_decrypted(resource_size_t phys_addr, unsigned long size) @@ -458,6 +461,11 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr, case E820_TYPE_ACPI: case E820_TYPE_NVS: case E820_TYPE_UNUSABLE: + /* For SEV, these areas are encrypted */ + if (sev_active()) + break; + /* Fallthrough */ + case E820_TYPE_PRAM: return true; default: @@ -581,7 +589,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, unsigned long flags) { - if (!sme_active()) + if (!mem_encrypt_active()) return true; if (flags & MEMREMAP_ENC) @@ -590,12 +598,13 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, if (flags & MEMREMAP_DEC) return false; - if (memremap_is_setup_data(phys_addr, size) || - memremap_is_efi_data(phys_addr, size) || - memremap_should_map_decrypted(phys_addr, size)) - return false; + if (sme_active()) { + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + return false; + } - return true; + return !memremap_should_map_decrypted(phys_addr, size); } /* @@ -608,17 +617,24 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, unsigned long size, pgprot_t prot) { - if (!sme_active()) + bool encrypted_prot; + + if (!mem_encrypt_active()) return prot; - if (early_memremap_is_setup_data(phys_addr, size) || - memremap_is_efi_data(phys_addr, size) || - memremap_should_map_decrypted(phys_addr, size)) - prot = pgprot_decrypted(prot); - else - prot = pgprot_encrypted(prot); + encrypted_prot = true; - return prot; + if (sme_active()) { + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + encrypted_prot = false; + } + + if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size)) + encrypted_prot = 
false; + + return encrypted_prot ? pgprot_encrypted(prot) + : pgprot_decrypted(prot); } bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) From a72ec5a34dca3b635eb2de3b485d0a1b2e591a5c Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:48 -0500 Subject: [PATCH 0981/1085] x86/mm: Include SEV for encryption memory attribute changes The current code checks only for sme_active() when determining whether to perform the encryption attribute change. Include sev_active() in this check so that memory attribute changes can occur under SME and SEV. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Laura Abbott Cc: John Ogness Cc: kvm@vger.kernel.org Cc: Matt Fleming Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Dan Williams Cc: "Kirill A. Shutemov" Link: https://lkml.kernel.org/r/20171020143059.3291-7-brijesh.singh@amd.com --- arch/x86/mm/pageattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dfb7d657cf43..3fe68483463c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1781,8 +1781,8 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) unsigned long start; int ret; - /* Nothing to do if the SME is not active */ - if (!sme_active()) + /* Nothing to do if memory encryption is not active */ + if (!mem_encrypt_active()) return 0; /* Should not be working on unaligned addresses */ From 1379edd5967346a1fa79b8dc3e9ca261576c6bc9 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:49 -0500 Subject: [PATCH 0982/1085] x86/efi: Access EFI data as encrypted when SEV is active EFI data is encrypted when the kernel is run under SEV. Update the page table references to be sure the EFI memory areas are accessed encrypted. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: linux-efi@vger.kernel.org Cc: kvm@vger.kernel.org Cc: Ard Biesheuvel Cc: Matt Fleming Cc: Borislav Petkov Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20171020143059.3291-8-brijesh.singh@amd.com --- arch/x86/platform/efi/efi_64.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 20fb31579b69..9e4ee5b04b2d 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -370,7 +371,11 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * as trim_bios_range() will reserve the first page and isolate it away * from memory allocators anyway. 
*/ - if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) { + pf = _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + + if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, pf)) { pr_err("Failed to create 1:1 mapping for the first page!\n"); return 1; } @@ -413,6 +418,9 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va) if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; + if (sev_active()) + flags |= _PAGE_ENC; + pfn = md->phys_addr >> PAGE_SHIFT; if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags)) pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", @@ -539,6 +547,9 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m if (!(md->attribute & EFI_MEMORY_RO)) pf |= _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + return efi_update_mappings(md, pf); } @@ -590,6 +601,9 @@ void __init efi_runtime_update_mappings(void) (md->type != EFI_RUNTIME_SERVICES_CODE)) pf |= _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + efi_update_mappings(md, pf); } } From 4ac2aed837cbdbb21c12a28c04718e34c1dc225f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:50 -0500 Subject: [PATCH 0983/1085] resource: Consolidate resource walking code The walk_iomem_res_desc(), walk_system_ram_res() and walk_system_ram_range() functions each have much of the same code. Create a new function that consolidates the common code from these functions in one place to reduce the amount of duplicated code. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171020143059.3291-9-brijesh.singh@amd.com --- kernel/resource.c | 52 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 9b5f04404152..7323c1b636cd 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -400,6 +400,26 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, return 0; } +static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, + bool first_level_children_only, + void *arg, int (*func)(u64, u64, void *)) +{ + u64 orig_end = res->end; + int ret = -1; + + while ((res->start < res->end) && + !find_next_iomem_res(res, desc, first_level_children_only)) { + ret = (*func)(res->start, res->end, arg); + if (ret) + break; + + res->start = res->end + 1; + res->end = orig_end; + } + + return ret; +} + /* * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. 
@@ -418,26 +438,12 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) { struct resource res; - u64 orig_end; - int ret = -1; res.start = start; res.end = end; res.flags = flags; - orig_end = res.end; - while ((res.start < res.end) && - (!find_next_iomem_res(&res, desc, false))) { - - ret = (*func)(res.start, res.end, arg); - if (ret) - break; - - res.start = res.end + 1; - res.end = orig_end; - } - - return ret; + return __walk_iomem_res_desc(&res, desc, false, arg, func); } /* @@ -451,22 +457,13 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) { struct resource res; - u64 orig_end; - int ret = -1; res.start = start; res.end = end; res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - orig_end = res.end; - while ((res.start < res.end) && - (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) { - ret = (*func)(res.start, res.end, arg); - if (ret) - break; - res.start = res.end + 1; - res.end = orig_end; - } - return ret; + + return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + arg, func); } #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) @@ -508,6 +505,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; } + /* * This generic page_is_ram() returns true if specified address is * registered as System RAM in iomem_resource list. From 1d2e733b13b450e5854f4a8f8efcd77fa7362d62 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:51 -0500 Subject: [PATCH 0984/1085] resource: Provide resource struct in resource walk callback In preparation for a new function that will need additional resource information during the resource walk, update the resource walk callback to pass the resource structure. Since the current callback start and end arguments are pulled from the resource structure, the callback functions can obtain them from the resource structure directly. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20171020143059.3291-10-brijesh.singh@amd.com --- arch/powerpc/kernel/machine_kexec_file_64.c | 12 +++++++++--- arch/x86/kernel/crash.c | 18 +++++++++--------- arch/x86/kernel/pmem.c | 2 +- include/linux/ioport.h | 4 ++-- include/linux/kexec.h | 2 +- kernel/kexec_file.c | 5 +++-- kernel/resource.c | 9 +++++---- 7 files changed, 30 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c index 992c0d258e5d..e4395f937d63 100644 --- a/arch/powerpc/kernel/machine_kexec_file_64.c +++ b/arch/powerpc/kernel/machine_kexec_file_64.c @@ -91,11 +91,13 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) * and that value will be returned. If all free regions are visited without * func returning non-zero, then zero will be returned.
*/ -int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) +int arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) { int ret = 0; u64 i; phys_addr_t mstart, mend; + struct resource res = { }; if (kbuf->top_down) { for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0, @@ -105,7 +107,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) * range while in kexec, end points to the last byte * in the range. */ - ret = func(mstart, mend - 1, kbuf); + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); if (ret) break; } @@ -117,7 +121,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) * range while in kexec, end points to the last byte * in the range. */ - ret = func(mstart, mend - 1, kbuf); + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); if (ret) break; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 44404e2307bb..815008c9ca18 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE -static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } -static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) { struct crash_elf_data *ced = arg; Elf64_Ehdr *ehdr; @@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) ehdr = ced->ehdr; /* Exclude unwanted mem ranges */ - ret = elf_header_exclude_ranges(ced, start, end); + ret = elf_header_exclude_ranges(ced, res->start, res->end); if (ret) return ret; @@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) return 0; } -static int memmap_entry_callback(u64 start, u64 end, void *arg) +static int memmap_entry_callback(struct resource *res, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; struct e820_entry ei; - ei.addr = start; - ei.size = end - start + 1; + ei.addr = res->start; + ei.size = res->end - res->start + 1; ei.type = cmd->type; add_e820_entry(params, &ei); @@ -619,12 +619,12 @@ out: return ret; } -static int determine_backup_region(u64 start, u64 end, void *arg) +static int determine_backup_region(struct resource *res, void *arg) { struct kimage *image = arg; - image->arch.backup_src_start = start; - image->arch.backup_src_sz = end - start + 1; + image->arch.backup_src_start = res->start; + image->arch.backup_src_sz = res->end - res->start + 1; /* Expecting only one range for backup region */ return 1; diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 3fe690067802..6b07faaa1579 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -7,7 +7,7 @@ #include #include -static int found(u64 start, u64 end, void *data) +static int found(struct resource *res, void *data) { return 1; } diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 83c8d6530f0f..c0070d7c4b99 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -272,10 +272,10 @@ walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); extern int 
walk_system_ram_res(u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)); + int (*func)(struct resource *, void *)); extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, - void *arg, int (*func)(u64, u64, void *)); + void *arg, int (*func)(struct resource *, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1c08c925cefb..f16f6ceb3875 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -160,7 +160,7 @@ struct kexec_buf { }; int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(u64, u64, void *)); + int (*func)(struct resource *, void *)); extern int kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); #endif /* CONFIG_KEXEC_FILE */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9f48f4412297..e5bcd94c1efb 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -406,9 +406,10 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, return 1; } -static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +static int locate_mem_hole_callback(struct resource *res, void *arg) { struct kexec_buf *kbuf = (struct kexec_buf *)arg; + u64 start = res->start, end = res->end; unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ @@ -437,7 +438,7 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg) * func returning non-zero, then zero will be returned. */ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(u64, u64, void *)) + int (*func)(struct resource *, void *)) { if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, diff --git a/kernel/resource.c b/kernel/resource.c index 7323c1b636cd..8430042fa77b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -402,14 +402,15 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, bool first_level_children_only, - void *arg, int (*func)(u64, u64, void *)) + void *arg, + int (*func)(struct resource *, void *)) { u64 orig_end = res->end; int ret = -1; while ((res->start < res->end) && !find_next_iomem_res(res, desc, first_level_children_only)) { - ret = (*func)(res->start, res->end, arg); + ret = (*func)(res, arg); if (ret) break; @@ -435,7 +436,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, * and set it in 'desc' of a target resource entry. */ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, - u64 end, void *arg, int (*func)(u64, u64, void *)) + u64 end, void *arg, int (*func)(struct resource *, void *)) { struct resource res; @@ -454,7 +455,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, * ranges. 
*/ int walk_system_ram_res(u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)) + int (*func)(struct resource *, void *)) { struct resource res; From 0e4c12b45aa88e74fdda117896d2b61c4e510cb9 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:52 -0500 Subject: [PATCH 0985/1085] x86/mm, resource: Use PAGE_KERNEL protection for ioremap of memory pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order for memory pages to be properly mapped when SEV is active, it's necessary to use the PAGE_KERNEL protection attribute as the base protection. This ensures that memory mapping of, e.g. ACPI tables, receives the proper mapping attributes. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Laura Abbott Cc: Kees Cook Cc: kvm@vger.kernel.org Cc: Jérôme Glisse Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Andrew Morton Cc: Dan Williams Cc: "Kirill A. Shutemov" Link: https://lkml.kernel.org/r/20171020143059.3291-11-brijesh.singh@amd.com --- arch/x86/mm/ioremap.c | 79 +++++++++++++++++++++++++++++++++++------- include/linux/ioport.h | 3 ++ kernel/resource.c | 19 ++++++++++ 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 52cc0f4ed494..6e4573b1da34 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -27,6 +27,11 @@ #include "physaddr.h" +struct ioremap_mem_flags { + bool system_ram; + bool desc_other; +}; + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -56,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } -static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, - void *arg) +static bool __ioremap_check_ram(struct resource *res) { + unsigned long start_pfn, stop_pfn; unsigned long i; - for (i = 0; i < nr_pages; ++i) - if (pfn_valid(start_pfn + i) && - !PageReserved(pfn_to_page(start_pfn + i))) - return 1; + if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) + return false; - return 0; + start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; + stop_pfn = (res->end + 1) >> PAGE_SHIFT; + if (stop_pfn > start_pfn) { + for (i = 0; i < (stop_pfn - start_pfn); ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return true; + } + + return false; +} + +static int __ioremap_check_desc_other(struct resource *res) +{ + return (res->desc != IORES_DESC_NONE); +} + +static int __ioremap_res_check(struct resource *res, void *arg) +{ + struct ioremap_mem_flags *flags = arg; + + if (!flags->system_ram) + flags->system_ram = __ioremap_check_ram(res); + + if (!flags->desc_other) + flags->desc_other = __ioremap_check_desc_other(res); + + return flags->system_ram && flags->desc_other; +} + +/* + * To avoid multiple resource walks, this function walks resources marked as + * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a + * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). 
+ */ +static void __ioremap_check_mem(resource_size_t addr, unsigned long size, + struct ioremap_mem_flags *flags) +{ + u64 start, end; + + start = (u64)addr; + end = start + size - 1; + memset(flags, 0, sizeof(*flags)); + + walk_mem_res(start, end, flags, __ioremap_res_check); } /* @@ -87,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, enum page_cache_mode pcm, void *caller) { unsigned long offset, vaddr; - resource_size_t pfn, last_pfn, last_addr; + resource_size_t last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; + struct ioremap_mem_flags mem_flags; struct vm_struct *area; enum page_cache_mode new_pcm; pgprot_t prot; @@ -108,13 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } + __ioremap_check_mem(phys_addr, size, &mem_flags); + /* * Don't allow anybody to remap normal RAM that we're using.. */ - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) { + if (mem_flags.system_ram) { WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", &phys_addr, &last_addr); return NULL; @@ -146,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pcm = new_pcm; } + /* + * If the page being mapped is in memory and SEV is active then + * make sure the memory encryption attribute is enabled in the + * resulting mapping. + */ prot = PAGE_KERNEL_IO; + if (sev_active() && mem_flags.desc_other) + prot = pgprot_encrypted(prot); + switch (pcm) { case _PAGE_CACHE_MODE_UC: default: diff --git a/include/linux/ioport.h b/include/linux/ioport.h index c0070d7c4b99..93b4183cf53d 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -271,6 +271,9 @@ extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); extern int +walk_mem_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int diff --git a/kernel/resource.c b/kernel/resource.c index 8430042fa77b..54ba6de3757c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -397,6 +397,8 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, res->start = p->start; if (res->end > p->end) res->end = p->end; + res->flags = p->flags; + res->desc = p->desc; return 0; } @@ -467,6 +469,23 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, arg, func); } +/* + * This function calls the @func callback against all memory ranges, which + * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. + */ +int walk_mem_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + arg, func); +} + #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* From d7b417fa08d1187923c270bc33a3555c2fcff8b9 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:53 -0500 Subject: [PATCH 0986/1085] x86/mm: Add DMA support for SEV memory encryption DMA access to encrypted memory cannot be performed when SEV is active. In order for DMA to properly work when SEV is active, the SWIOTLB bounce buffers must be used. 
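The allocation policy this implements: try an ordinary page allocation first, fall back to the SWIOTLB pool when the unencrypted DMA address would not fit the device mask, and strip the encryption bit only from memory that is not a bounce buffer. A condensed sketch of sev_alloc() from the diff below:

    /* Prefer normal pages; fall back to SWIOTLB if they don't fit. */
    if (!vaddr)
            vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp);

    /* Only non-SWIOTLB memory needs its encryption bit cleared. */
    if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) {
            set_memory_decrypted((unsigned long)vaddr, 1 << order);
            *dma_handle = __sme_clr(*dma_handle);
    }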
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov C Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Konrad Rzeszutek Wilk Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171020143059.3291-12-brijesh.singh@amd.com --- arch/x86/mm/mem_encrypt.c | 86 +++++++++++++++++++++++++++++++++++++++ lib/swiotlb.c | 5 ++- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index add836d3d174..e8bfad75ba29 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -192,6 +192,70 @@ void __init sme_early_init(void) /* Update the protection map with memory encryption mask */ for (i = 0; i < ARRAY_SIZE(protection_map); i++) protection_map[i] = pgprot_encrypted(protection_map[i]); + + if (sev_active()) + swiotlb_force = SWIOTLB_FORCE; +} + +static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + unsigned long dma_mask; + unsigned int order; + struct page *page; + void *vaddr = NULL; + + dma_mask = dma_alloc_coherent_mask(dev, gfp); + order = get_order(size); + + /* + * Memory will be memset to zero after marking decrypted, so don't + * bother clearing it before. + */ + gfp &= ~__GFP_ZERO; + + page = alloc_pages_node(dev_to_node(dev), gfp, order); + if (page) { + dma_addr_t addr; + + /* + * Since we will be clearing the encryption bit, check the + * mask with it already cleared. + */ + addr = __sme_clr(phys_to_dma(dev, page_to_phys(page))); + if ((addr + size) > dma_mask) { + __free_pages(page, get_order(size)); + } else { + vaddr = page_address(page); + *dma_handle = addr; + } + } + + if (!vaddr) + vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp); + + if (!vaddr) + return NULL; + + /* Clear the SME encryption bit for DMA use if not swiotlb area */ + if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) { + set_memory_decrypted((unsigned long)vaddr, 1 << order); + memset(vaddr, 0, PAGE_SIZE << order); + *dma_handle = __sme_clr(*dma_handle); + } + + return vaddr; +} + +static void sev_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) +{ + /* Set the SME encryption bit for re-use if not swiotlb area */ + if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle))) + set_memory_encrypted((unsigned long)vaddr, + 1 << get_order(size)); + + swiotlb_free_coherent(dev, size, vaddr, dma_handle); } /* @@ -218,6 +282,20 @@ bool sev_active(void) } EXPORT_SYMBOL_GPL(sev_active); +static const struct dma_map_ops sev_dma_ops = { + .alloc = sev_alloc, + .free = sev_free, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .mapping_error = swiotlb_dma_mapping_error, +}; + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { @@ -227,6 +305,14 @@ void __init mem_encrypt_init(void) /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ swiotlb_update_mem_attributes(); + /* + * With SEV, DMA operations cannot use encryption. New DMA ops + * are required in order to mark the DMA areas as decrypted or + * to use bounce buffers. 
+ */ + if (sev_active()) + dma_ops = &sev_dma_ops; + pr_info("AMD Secure Memory Encryption (SME) active\n"); } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 8c6c83ef57a4..cea19aaf303c 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -507,8 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); - if (sme_active()) - pr_warn_once("SME is active and system is using DMA bounce buffers\n"); + if (mem_encrypt_active()) + pr_warn_once("%s is active and system is using DMA bounce buffers\n", + sme_active() ? "SME" : "SEV"); mask = dma_get_seg_boundary(hwdev); From 1958b5fc401067662ec11a6fcbe0daa26c813603 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:54 -0500 Subject: [PATCH 0987/1085] x86/boot: Add early boot support when running with SEV active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Early in the boot process, add checks to determine if the kernel is running with Secure Encrypted Virtualization (SEV) active. Checking for SEV requires verifying that the kernel is running under a hypervisor (CPUID 0x00000001, bit 31), that the SEV feature is available (CPUID 0x8000001f, bit 1), and then reading a non-interceptable SEV MSR (0xc0010131, bit 0). This check is required so that, during early compressed kernel booting, the pagetables (both the boot pagetables and the KASLR pagetables, if enabled) are updated to include the encryption mask so that when the kernel is decompressed into encrypted memory, it can boot properly. After the kernel is decompressed and continues booting, the same logic is used to check if SEV is active and set a flag indicating so. This makes it possible to distinguish between SME and SEV, each of which has unique differences in how certain things are handled: e.g. DMA (always bounce buffered with SEV) or EFI tables (always accessed decrypted with SME). Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Laura Abbott Cc: Kees Cook Cc: kvm@vger.kernel.org Cc: Konrad Rzeszutek Wilk Cc: Radim Krčmář Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Paolo Bonzini Cc: "Kirill A.
Shutemov" Link: https://lkml.kernel.org/r/20171020143059.3291-13-brijesh.singh@amd.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/head_64.S | 16 ++++ arch/x86/boot/compressed/mem_encrypt.S | 120 +++++++++++++++++++++++++ arch/x86/boot/compressed/misc.h | 2 + arch/x86/boot/compressed/pagetable.c | 8 +- arch/x86/include/asm/msr-index.h | 3 + arch/x86/include/uapi/asm/kvm_para.h | 1 - arch/x86/mm/mem_encrypt.c | 50 ++++++++--- 8 files changed, 186 insertions(+), 15 deletions(-) create mode 100644 arch/x86/boot/compressed/mem_encrypt.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 4b7575b00563..61827c2282bf 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -78,6 +78,7 @@ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o + vmlinux-objs-y += $(obj)/mem_encrypt.o endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index beb255b66447..20919b4f3133 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -131,6 +131,19 @@ ENTRY(startup_32) /* * Build early 4G boot pagetable */ + /* + * If SEV is active then set the encryption mask in the page tables. + * This will insure that when the kernel is copied and decompressed + * it will be done so encrypted. + */ + call get_sev_encryption_bit + xorl %edx, %edx + testl %eax, %eax + jz 1f + subl $32, %eax /* Encryption bit is always above bit 31 */ + bts %eax, %edx /* Set encryption mask for page tables */ +1: + /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax @@ -141,12 +154,14 @@ ENTRY(startup_32) leal pgtable + 0(%ebx), %edi leal 0x1007 (%edi), %eax movl %eax, 0(%edi) + addl %edx, 4(%edi) /* Build Level 3 */ leal pgtable + 0x1000(%ebx), %edi leal 0x1007(%edi), %eax movl $4, %ecx 1: movl %eax, 0x00(%edi) + addl %edx, 0x04(%edi) addl $0x00001000, %eax addl $8, %edi decl %ecx @@ -157,6 +172,7 @@ ENTRY(startup_32) movl $0x00000183, %eax movl $2048, %ecx 1: movl %eax, 0(%edi) + addl %edx, 4(%edi) addl $0x00200000, %eax addl $8, %edi decl %ecx diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S new file mode 100644 index 000000000000..54f5f6625a73 --- /dev/null +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -0,0 +1,120 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2017 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +#include +#include +#include + + .text + .code32 +ENTRY(get_sev_encryption_bit) + xor %eax, %eax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + push %ebx + push %ecx + push %edx + push %edi + + /* + * RIP-relative addressing is needed to access the encryption bit + * variable. Since we are running in 32-bit mode we need this call/pop + * sequence to get the proper relative addressing. 
+ */ + call 1f +1: popl %edi + subl $1b, %edi + + movl enc_bit(%edi), %eax + cmpl $0, %eax + jge .Lsev_exit + + /* Check if running under a hypervisor */ + movl $1, %eax + cpuid + bt $31, %ecx /* Check the hypervisor bit */ + jnc .Lno_sev + + movl $0x80000000, %eax /* CPUID to check the highest leaf */ + cpuid + cmpl $0x8000001f, %eax /* See if 0x8000001f is available */ + jb .Lno_sev + + /* + * Check for the SEV feature: + * CPUID Fn8000_001F[EAX] - Bit 1 + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + movl $0x8000001f, %eax + cpuid + bt $1, %eax /* Check if SEV is available */ + jnc .Lno_sev + + movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */ + rdmsr + bt $MSR_AMD64_SEV_ENABLED_BIT, %eax /* Check if SEV is active */ + jnc .Lno_sev + + movl %ebx, %eax + andl $0x3f, %eax /* Return the encryption bit location */ + movl %eax, enc_bit(%edi) + jmp .Lsev_exit + +.Lno_sev: + xor %eax, %eax + movl %eax, enc_bit(%edi) + +.Lsev_exit: + pop %edi + pop %edx + pop %ecx + pop %ebx + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + + ret +ENDPROC(get_sev_encryption_bit) + + .code64 +ENTRY(get_sev_encryption_mask) + xor %rax, %rax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + push %rbp + push %rdx + + movq %rsp, %rbp /* Save current stack pointer */ + + call get_sev_encryption_bit /* Get the encryption bit position */ + testl %eax, %eax + jz .Lno_sev_mask + + xor %rdx, %rdx + bts %rax, %rdx /* Create the encryption mask */ + mov %rdx, %rax /* ... and return it */ + +.Lno_sev_mask: + movq %rbp, %rsp /* Restore original stack pointer */ + + pop %rdx + pop %rbp +#endif + + ret +ENDPROC(get_sev_encryption_mask) + + .data +enc_bit: + .int 0xffffffff diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 32d4ec2e0243..9d323dc6b159 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -109,4 +109,6 @@ static inline void console_init(void) { } #endif +unsigned long get_sev_encryption_mask(void); + #endif diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 972319ff5b01..d5364ca2e3f9 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -77,16 +77,18 @@ static unsigned long top_level_pgt; * Mapping information structure passed to kernel_ident_mapping_init(). * Due to relocation, pointers must be assigned at run time not build time. */ -static struct x86_mapping_info mapping_info = { - .page_flag = __PAGE_KERNEL_LARGE_EXEC, -}; +static struct x86_mapping_info mapping_info; /* Locates and clears a region for a new top level page table. */ void initialize_identity_maps(void) { + unsigned long sev_me_mask = get_sev_encryption_mask(); + /* Init mapping_info with run-time function/buffer pointers. 
*/ mapping_info.alloc_pgt_page = alloc_pgt_page; mapping_info.context = &pgt_data; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask; /* * It should be impossible for this not to already be true, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ab022618a50a..34c4922bbc3f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -324,6 +324,9 @@ #define MSR_AMD64_IBSBRTARGET 0xc001103b #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SEV 0xc0010131 +#define MSR_AMD64_SEV_ENABLED_BIT 0 +#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 554aa8f24f91..09cc06483bed 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -110,5 +110,4 @@ struct kvm_vcpu_pv_apf_data { #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK #define KVM_PV_EOI_DISABLED 0x0 - #endif /* _UAPI_ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index e8bfad75ba29..fd9c7bb51664 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -313,7 +313,9 @@ void __init mem_encrypt_init(void) if (sev_active()) dma_ops = &sev_dma_ops; - pr_info("AMD Secure Memory Encryption (SME) active\n"); + pr_info("AMD %s active\n", + sev_active() ? "Secure Encrypted Virtualization (SEV)" + : "Secure Memory Encryption (SME)"); } void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) @@ -641,37 +643,63 @@ void __init __nostackprotector sme_enable(struct boot_params *bp) { const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; bool active_by_default; unsigned long me_mask; char buffer[16]; u64 msr; - /* Check for the SME support leaf */ + /* Check for the SME/SEV support leaf */ eax = 0x80000000; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); if (eax < 0x8000001f) return; +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) /* - * Check for the SME feature: - * CPUID Fn8000_001F[EAX] - Bit 0 - * Secure Memory Encryption support - * CPUID Fn8000_001F[EBX] - Bits 5:0 - * Pagetable bit position used to indicate encryption + * Set the feature mask (SME or SEV) based on whether we are + * running under a hypervisor. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + feature_mask = (ecx & BIT(31)) ? 
AMD_SEV_BIT : AMD_SME_BIT; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption */ eax = 0x8000001f; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); - if (!(eax & 1)) + if (!(eax & feature_mask)) return; me_mask = 1UL << (ebx & 0x3f); - /* Check if SME is enabled */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* For SME, check the SYSCFG MSR */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + } else { + /* For SEV, check the SEV MSR */ + msr = __rdmsr(MSR_AMD64_SEV); + if (!(msr & MSR_AMD64_SEV_ENABLED)) + return; + + /* SEV state cannot be controlled by a command line option */ + sme_me_mask = me_mask; + sev_enabled = true; return; + } /* * Fixups have not been applied to phys_base yet and we're running From 606b21d4a6498c23632a4693c81b7b24feedd038 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:55 -0500 Subject: [PATCH 0988/1085] x86/io: Unroll string I/O when SEV is active Secure Encrypted Virtualization (SEV) does not support string I/O, so unroll the string I/O operation into a loop operating on one element at a time. [ tglx: Gave the static key a real name instead of the obscure __sev ] Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: kvm@vger.kernel.org Cc: David Laight Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171020143059.3291-14-brijesh.singh@amd.com --- arch/x86/include/asm/io.h | 43 +++++++++++++++++++++++++++++++++++---- arch/x86/mm/mem_encrypt.c | 8 ++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 11398d55aefa..93ae8aee1780 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -266,6 +266,21 @@ static inline void slow_down_io(void) #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT +#include + +extern struct static_key_false sev_enable_key; +static inline bool sev_key_active(void) +{ + return static_branch_unlikely(&sev_enable_key); +} + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +static inline bool sev_key_active(void) { return false; } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + #define BUILDIO(bwl, bw, type) \ static inline void out##bwl(unsigned type value, int port) \ { \ @@ -296,14 +311,34 @@ static inline unsigned type in##bwl##_p(int port) \ \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ - asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \ + if (sev_key_active()) { \ + unsigned type *value = (unsigned type *)addr; \ + while (count) { \ + out##bwl(*value, port); \ + value++; \ + count--; \ + } \ + } else { \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) \ + : "d"(port) : "memory"); \ + } \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ - asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \ + if (sev_key_active()) { \ + unsigned type *value = (unsigned type *)addr; \ + while (count) { \ + *value = in##bwl(port); \ + value++; \ + count--; \ + } \ + } else { \ + asm volatile("rep; ins" #bwl \ + : 
"+D"(addr), "+c"(count) \ + : "d"(port) : "memory"); \ + } \ } BUILDIO(b, b, char) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index fd9c7bb51664..d29b7831a053 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -41,6 +41,8 @@ static char sme_cmdline_off[] __initdata = "off"; */ u64 sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +DEFINE_STATIC_KEY_FALSE(sev_enable_key); +EXPORT_SYMBOL_GPL(sev_enable_key); static bool sev_enabled __section(.data); @@ -313,6 +315,12 @@ void __init mem_encrypt_init(void) if (sev_active()) dma_ops = &sev_dma_ops; + /* + * With SEV, we need to unroll the rep string I/O instructions. + */ + if (sev_active()) + static_branch_enable(&sev_enable_key); + pr_info("AMD %s active\n", sev_active() ? "Secure Encrypted Virtualization (SEV)" : "Secure Memory Encryption (SME)"); From dfaaec9033b80d71056e21cda920752e55f2c514 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 20 Oct 2017 09:30:56 -0500 Subject: [PATCH 0989/1085] x86: Add support for changing memory encryption attribute in early boot Some KVM-specific custom MSRs share the guest physical address with the hypervisor in early boot. When SEV is active, the shared physical address must be mapped with memory encryption attribute cleared so that both hypervisor and guest can access the data. Add APIs to change the memory encryption attribute in early boot code. Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Tom Lendacky Cc: kvm@vger.kernel.org Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171020143059.3291-15-brijesh.singh@amd.com --- arch/x86/include/asm/mem_encrypt.h | 8 ++ arch/x86/mm/mem_encrypt.c | 130 +++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 2b024741bce9..c9459a4c3c68 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -42,6 +42,9 @@ void __init sme_early_init(void); void __init sme_encrypt_kernel(void); void __init sme_enable(struct boot_params *bp); +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); @@ -70,6 +73,11 @@ static inline void __init sme_enable(struct boot_params *bp) { } static inline bool sme_active(void) { return false; } static inline bool sev_active(void) { return false; } +static inline int __init +early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline int __init +early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index d29b7831a053..5049b8bad3e7 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -30,6 +30,8 @@ #include #include +#include "mm_internal.h" + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; static char sme_cmdline_off[] __initdata = "off"; @@ -260,6 +262,134 @@ static void sev_free(struct device *dev, size_t size, void *vaddr, swiotlb_free_coherent(dev, size, vaddr, dma_handle); } +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + pgprot_t old_prot, new_prot; + unsigned long pfn, pa, size; + pte_t 
new_pte; + + switch (level) { + case PG_LEVEL_4K: + pfn = pte_pfn(*kpte); + old_prot = pte_pgprot(*kpte); + break; + case PG_LEVEL_2M: + pfn = pmd_pfn(*(pmd_t *)kpte); + old_prot = pmd_pgprot(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + pfn = pud_pfn(*(pud_t *)kpte); + old_prot = pud_pgprot(*(pud_t *)kpte); + break; + default: + return; + } + + new_prot = old_prot; + if (enc) + pgprot_val(new_prot) |= _PAGE_ENC; + else + pgprot_val(new_prot) &= ~_PAGE_ENC; + + /* If prot is same then do nothing. */ + if (pgprot_val(old_prot) == pgprot_val(new_prot)) + return; + + pa = pfn << page_level_shift(level); + size = page_level_size(level); + + /* + * We are going to perform in-place en-/decryption and change the + * physical page attribute from C=1 to C=0 or vice versa. Flush the + * caches to ensure that data gets accessed with the correct C-bit. + */ + clflush_cache_range(__va(pa), size); + + /* Encrypt/decrypt the contents in-place */ + if (enc) + sme_early_encrypt(pa, size); + else + sme_early_decrypt(pa, size); + + /* Change the page encryption mask. */ + new_pte = pfn_pte(pfn, new_prot); + set_pte_atomic(kpte, new_pte); +} + +static int __init early_set_memory_enc_dec(unsigned long vaddr, + unsigned long size, bool enc) +{ + unsigned long vaddr_end, vaddr_next; + unsigned long psize, pmask; + int split_page_size_mask; + int level, ret; + pte_t *kpte; + + vaddr_next = vaddr; + vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + ret = 1; + goto out; + } + + if (level == PG_LEVEL_4K) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; + continue; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Check whether we can change the large page in one go. + * We request a split when the address is not aligned and + * the number of pages to set/clear encryption bit is smaller + * than the number of pages in the large page. + */ + if (vaddr == (vaddr & pmask) && + ((vaddr_end - vaddr) >= psize)) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & pmask) + psize; + continue; + } + + /* + * The virtual address is part of a larger page, create the next + * level page table mapping (4K or 2M). If it is part of a 2M + * page then we request a split of the large page into 4K + * chunks. A 1GB large page is split into 2M pages, resp. + */ + if (level == PG_LEVEL_2M) + split_page_size_mask = 0; + else + split_page_size_mask = 1 << PG_LEVEL_2M; + + kernel_physical_mapping_init(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); + } + + ret = 0; + +out: + __flush_tlb_all(); + return ret; +} + +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, false); +} + +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, true); +} + /* * SME and SEV are very similar but they are not the same, so there are * times that the kernel will need to distinguish between SME and SEV. The From ac26963a1175c813e3ed21c0d2435b083173136e Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 20 Oct 2017 09:30:57 -0500 Subject: [PATCH 0990/1085] percpu: Introduce DEFINE_PER_CPU_DECRYPTED A KVM guest defines three per-CPU variables (steal-time, apf_reason, and kvm_apic_eoi) which are shared between a guest and a hypervisor.
When SEV is active, memory is encrypted with a guest-specific key, and if the guest OS wants to share the memory region with the hypervisor then it must clear the C-bit (i.e., set it decrypted) before sharing it. DEFINE_PER_CPU_DECRYPTED can be used to define the per-CPU variables which will be shared between a guest and a hypervisor. Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov Acked-by: Tejun Heo Reviewed-by: Borislav Petkov Cc: linux-arch@vger.kernel.org Cc: Tom Lendacky Cc: kvm@vger.kernel.org Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Lameter Link: https://lkml.kernel.org/r/20171020143059.3291-16-brijesh.singh@amd.com --- include/asm-generic/vmlinux.lds.h | 19 +++++++++++++++++++ include/linux/percpu-defs.h | 15 +++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 63e56f6c1877..c58f3805e348 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -777,6 +777,24 @@ #define INIT_RAM_FS #endif +/* + * Memory encryption operates on a page basis. Since we need to clear + * the memory encryption mask for this section, it needs to be aligned + * on a page boundary and be a page-size multiple in length. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#ifdef CONFIG_AMD_MEM_ENCRYPT +#define PERCPU_DECRYPTED_SECTION \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..decrypted) \ + . = ALIGN(PAGE_SIZE); +#else +#define PERCPU_DECRYPTED_SECTION +#endif + + /* * Default discarded sections. * @@ -815,6 +833,7 @@ . = ALIGN(cacheline); \ *(.data..percpu) \ *(.data..percpu..shared_aligned) \ + PERCPU_DECRYPTED_SECTION \ VMLINUX_SYMBOL(__per_cpu_end) = .; /** diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 8f16299ca068..2d2096ba1cfe 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -172,6 +172,21 @@ #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "..read_mostly") +/* + * Declaration/definition used for per-CPU variables that should be accessed + * as decrypted when memory encryption is enabled in the guest. + */ +#if defined(CONFIG_VIRTUALIZATION) && defined(CONFIG_AMD_MEM_ENCRYPT) + +#define DECLARE_PER_CPU_DECRYPTED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..decrypted") + +#define DEFINE_PER_CPU_DECRYPTED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..decrypted") +#else +#define DEFINE_PER_CPU_DECRYPTED(type, name) DEFINE_PER_CPU(type, name) +#endif + /* * Intermodule exports for per-CPU variables. sparse forgets about * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to From 4716276184ec67a123a4eab81609a0688b1d650b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 20 Oct 2017 09:30:58 -0500 Subject: [PATCH 0991/1085] X86/KVM: Decrypt shared per-cpu variables when SEV is active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When SEV is active, guest memory is encrypted with a guest-specific key, so a guest memory region shared with the hypervisor must be mapped as decrypted before it can be shared.
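For illustration, defining such a shared variable follows the same pattern as the existing per-CPU macros; a sketch with a made-up structure name (the real conversions are in the kvm.c hunks below):

    /* Hypothetical per-CPU area shared with the hypervisor. With SEV
     * active it is placed in .data..percpu..decrypted; its C-bit still
     * has to be cleared once via early_set_memory_decrypted() before
     * the address is handed to the hypervisor. */
    struct my_shared_area {
            u64 token;
            u32 flags;
    };

    static DEFINE_PER_CPU_DECRYPTED(struct my_shared_area, my_shared_area) __aligned(64);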
Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Tom Lendacky Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Borislav Petkov Cc: Paolo Bonzini Link: https://lkml.kernel.org/r/20171020143059.3291-17-brijesh.singh@amd.com --- arch/x86/kernel/kvm.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8bb9594d0761..3df743b60c80 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -75,8 +75,8 @@ static int parse_no_kvmclock_vsyscall(char *arg) early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); -static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; /* @@ -312,7 +312,7 @@ static void kvm_register_steal_time(void) cpu, (unsigned long long) slow_virt_to_phys(st)); } -static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; +static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) { @@ -426,9 +426,42 @@ void kvm_disable_steal_time(void) wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } +static inline void __set_percpu_decrypted(void *ptr, unsigned long size) +{ + early_set_memory_decrypted((unsigned long) ptr, size); +} + +/* + * Iterate through all possible CPUs and map the memory region pointed + * to by apf_reason, steal_time and kvm_apic_eoi as decrypted at once. + * + * Note: we iterate through all possible CPUs to ensure that CPUs + * hotplugged later will have their per-cpu variables already mapped as + * decrypted. + */ +static void __init sev_map_percpu_data(void) +{ + int cpu; + + if (!sev_active()) + return; + + for_each_possible_cpu(cpu) { + __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason)); + __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); + __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); + } +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { + /* + * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() + * shares the guest physical address with the hypervisor. + */ + sev_map_percpu_data(); + kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); @@ -496,6 +529,7 @@ void __init kvm_guest_init(void) kvm_cpu_online, kvm_cpu_down_prepare) < 0) pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); #else + sev_map_percpu_data(); kvm_guest_cpu_init(); #endif From 819aeee065e5d1b417ecd633897427c89f3253ec Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 20 Oct 2017 09:30:59 -0500 Subject: [PATCH 0992/1085] X86/KVM: Clear encryption attribute when SEV is active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The guest physical memory areas holding the struct pvclock_wall_clock and struct pvclock_vcpu_time_info are shared with the hypervisor, which periodically updates their contents. When SEV is active, the encryption attribute of the shared memory pages must be cleared so that both the hypervisor and the guest can access the data.
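The allocation pattern this introduces - allocate first, then clear the C-bit on the whole range before publishing the address - is condensed below from the kvm_memblock_alloc() helper added in this patch:

    phys_addr_t mem = memblock_alloc(size, PAGE_SIZE);

    if (mem && sev_active() &&
        early_set_memory_decrypted((unsigned long)__va(mem), size)) {
            /* Could not make the range shareable, give it back. */
            memblock_free(mem, size);
            mem = 0;
    }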
Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Tom Lendacky Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Borislav Petkov Cc: Paolo Bonzini Link: https://lkml.kernel.org/r/20171020143059.3291-18-brijesh.singh@amd.com --- arch/x86/entry/vdso/vma.c | 5 +-- arch/x86/kernel/kvmclock.c | 67 +++++++++++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1911310959f8..d63053142b16 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -114,10 +114,11 @@ static int vvar_fault(const struct vm_special_mapping *sm, struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { - ret = vm_insert_pfn( + ret = vm_insert_pfn_prot( vma, vmf->address, - __pa(pvti) >> PAGE_SHIFT); + __pa(pvti) >> PAGE_SHIFT, + pgprot_decrypted(vma->vm_page_prot)); } } else if (sym_offset == image->sym_hvclock_page) { struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5b609e28ce3f..77b492c2d658 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -45,7 +46,7 @@ early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock wall_clock; +static struct pvclock_wall_clock *wall_clock; struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { @@ -64,15 +65,15 @@ static void kvm_get_wallclock(struct timespec *now) int low, high; int cpu; - low = (int)__pa_symbol(&wall_clock); - high = ((u64)__pa_symbol(&wall_clock) >> 32); + low = (int)slow_virt_to_phys(wall_clock); + high = ((u64)slow_virt_to_phys(wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(&wall_clock, vcpu_time, now); + pvclock_read_wallclock(wall_clock, vcpu_time, now); put_cpu(); } @@ -249,11 +250,39 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, + phys_addr_t align) +{ + phys_addr_t mem; + + mem = memblock_alloc(size, align); + if (!mem) + return 0; + + if (sev_active()) { + if (early_set_memory_decrypted((unsigned long)__va(mem), size)) + goto e_free; + } + + return mem; +e_free: + memblock_free(mem, size); + return 0; +} + +static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +{ + if (sev_active()) + early_set_memory_encrypted((unsigned long)__va(addr), size); + + memblock_free(addr, size); +} + void __init kvmclock_init(void) { struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem; - int size, cpu; + unsigned long mem, mem_wall_clock; + int size, cpu, wall_clock_size; u8 flags; size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); @@ -267,21 +296,35 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", - msr_kvm_system_time, msr_kvm_wall_clock); - - mem = memblock_alloc(size, PAGE_SIZE); - if (!mem) + wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); + mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); + if (!mem_wall_clock) return; + + wall_clock = 
__va(mem_wall_clock); + memset(wall_clock, 0, wall_clock_size); + + mem = kvm_memblock_alloc(size, PAGE_SIZE); + if (!mem) { + kvm_memblock_free(mem_wall_clock, wall_clock_size); + wall_clock = NULL; + return; + } + hv_clock = __va(mem); memset(hv_clock, 0, size); if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; - memblock_free(mem, size); + kvm_memblock_free(mem, size); + kvm_memblock_free(mem_wall_clock, wall_clock_size); + wall_clock = NULL; return; } + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); From 9275b933d409d3a4efa08102ca813557b93fb0b9 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Wed, 8 Nov 2017 03:18:01 +0800 Subject: [PATCH 0993/1085] resource: Fix resource_size.cocci warnings arch/x86/kernel/crash.c:627:34-37: ERROR: Missing resource_size with res arch/x86/kernel/crash.c:528:16-19: ERROR: Missing resource_size with res Use resource_size function on resource object instead of explicit computation. Generated by: scripts/coccinelle/api/resource_size.cocci Fixes: 1d2e733b13b4 ("resource: Provide resource struct in resource walk callback") Signed-off-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Juergen Gross Cc: Tom Lendacky Cc: Brijesh Singh Cc: Kees Cook Cc: Michael Ellerman Cc: kbuild-all@01.org Cc: tipbuild@zytor.com Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171107191801.GA91887@lkp-snb01 --- arch/x86/kernel/crash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 815008c9ca18..10e74d4778a1 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -525,7 +525,7 @@ static int memmap_entry_callback(struct resource *res, void *arg) struct e820_entry ei; ei.addr = res->start; - ei.size = res->end - res->start + 1; + ei.size = resource_size(res); ei.type = cmd->type; add_e820_entry(params, &ei); @@ -624,7 +624,7 @@ static int determine_backup_region(struct resource *res, void *arg) struct kimage *image = arg; image->arch.backup_src_start = res->start; - image->arch.backup_src_sz = res->end - res->start + 1; + image->arch.backup_src_sz = resource_size(res); /* Expecting only one range for backup region */ return 1; From 48070c73058be6de9c0d754d441ed7092dfc8f12 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 30 Oct 2017 14:38:58 +0100 Subject: [PATCH 0994/1085] s390/pci: do not require AIS facility As of today QEMU does not provide the AIS facility to its guest. This prevents Linux guests from using PCI devices as the ais facility is checked during init. As this is just a performance optimization, we can move the ais check into the code where we need it (calling the SIC instruction). This is used at initialization and on interrupt. Both places do not require any serialization, so we can simply skip the instruction. Since we will now get all interrupts, we can also avoid the 2nd scan. As we can have multiple interrupts in parallel we might trigger spurious irqs more often for the non-AIS case but the core code can handle that. 
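Schematically, the call sites now treat the SIC instruction as optional and handle failure; condensed from the zpci_irq_handler() hunk below:

    /* Re-enable interrupts after the first scan. Without the AIS
     * facility zpci_set_irq_ctrl() now fails with -EIO, and the
     * handler skips the second scan and relies on further
     * interrupts instead. */
    if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC))
            break;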
Signed-off-by: Christian Borntraeger Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Acked-by: Sebastian Ott Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pci_insn.h | 2 +- arch/s390/pci/pci.c | 5 +++-- arch/s390/pci/pci_insn.c | 6 +++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 34abcf275799..a74efc02ad2c 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -81,6 +81,6 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range); int zpci_load(u64 *data, u64 req, u64 offset); int zpci_store(u64 data, u64 req, u64 offset); int zpci_store_block(const u64 *data, u64 req, u64 offset); -void zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc); +int zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc); #endif diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index a25d95a6612d..0fe649c0d542 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -368,7 +368,8 @@ static void zpci_irq_handler(struct airq_struct *airq) /* End of second scan with interrupts on. */ break; /* First scan complete, reenable interrupts. */ - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC); + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC)) + break; si = 0; continue; } @@ -956,7 +957,7 @@ static int __init pci_base_init(void) if (!s390_pci_probe) return 0; - if (!test_facility(69) || !test_facility(71) || !test_facility(72)) + if (!test_facility(69) || !test_facility(71)) return 0; rc = zpci_debug_init(); diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index ea34086c8674..81b840bc6e4e 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -91,11 +92,14 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -void zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc) +int zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc) { + if (!test_facility(72)) + return -EIO; asm volatile ( " .insn rsy,0xeb00000000d1,%[ctl],%[isc],%[u]\n" : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [u] "Q" (*unused)); + return 0; } /* PCI Load */ From f44fa88745eda1530083b361e300e1ca4e15a6c5 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 27 Oct 2017 15:53:49 +0200 Subject: [PATCH 0995/1085] s390/archrandom: Reconsider s390 arch random implementation The reworked version of the random device driver now calls the arch_get_random_* functions at a very high frequency. It makes about 100,000 calls to arch_get_random_long to provide 10 MB via /dev/urandom. Each invocation fetched entropy from the hardware random generator, which has a rate limit of about 4 MB/s. As the TRNG invocation waits until enough entropy is gathered, the random device driver was slowed down dramatically. The s390 true random generator is not designed for such a high request rate; the TRNG is rather meant to be used together with the arch_get_random_seed_* functions. This is similar to how powerpc has implemented its arch random functionality. This patch removes the invocations of the s390 TRNG for arch_get_random_long() and arch_get_random_int(), but leaves the invocations in place for arch_get_random_seed_long() and arch_get_random_seed_int(). So the s390 arch random implementation now contributes high-quality entropy to the kernel random device for reseeding.
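Conceptually, the split between the two hook families looks as follows (a simplified sketch, not the actual drivers/char/random.c code; mix_entropy() is a made-up helper):

    unsigned long v;

    /* Fast path: called for every chunk of /dev/urandom output.
     * s390 now returns false here, so the ~4 MB/s TRNG is not a
     * bottleneck for bulk random data. */
    if (arch_get_random_long(&v))
            mix_entropy(v);

    /* Reseed path: called rarely, so feeding TRNG entropy here is
     * cheap and keeps the reseeds high quality. */
    if (arch_get_random_seed_long(&v))
            mix_entropy(v);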
Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- arch/s390/include/asm/archrandom.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index 6033901a40b2..9695f8d09edf 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -27,26 +27,27 @@ static void s390_arch_random_generate(u8 *buf, unsigned int nbytes) static inline bool arch_has_random(void) { - if (static_branch_likely(&s390_arch_random_available)) - return true; return false; } static inline bool arch_has_random_seed(void) { - return arch_has_random(); + if (static_branch_likely(&s390_arch_random_available)) + return true; + return false; } static inline bool arch_get_random_long(unsigned long *v) { - if (static_branch_likely(&s390_arch_random_available)) { - s390_arch_random_generate((u8 *)v, sizeof(*v)); - return true; - } return false; } static inline bool arch_get_random_int(unsigned int *v) +{ + return false; +} + +static inline bool arch_get_random_seed_long(unsigned long *v) { if (static_branch_likely(&s390_arch_random_available)) { s390_arch_random_generate((u8 *)v, sizeof(*v)); @@ -55,14 +56,13 @@ static inline bool arch_get_random_int(unsigned int *v) return false; } -static inline bool arch_get_random_seed_long(unsigned long *v) -{ - return arch_get_random_long(v); -} - static inline bool arch_get_random_seed_int(unsigned int *v) { - return arch_get_random_int(v); + if (static_branch_likely(&s390_arch_random_available)) { + s390_arch_random_generate((u8 *)v, sizeof(*v)); + return true; + } + return false; } #endif /* CONFIG_ARCH_RANDOM */ From 978fa72e82e375764e6e31e7a721408c5186918f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 2 Nov 2017 12:51:45 +0100 Subject: [PATCH 0996/1085] s390: remove named saved segment support Remove the support to create a z/VM named saved segment (NSS). This feature has not been supported for quite a while, in favour of jump labels, function tracing and (now) CPU alternatives. All of these features require writing to the kernel text section, which is not possible if the kernel is contained within an NSS. Given that the memory savings are minimal if kernel images are shared, and that updates of shared images are painful, the NSS feature can be removed. Reviewed-by: Hendrik Brueckner Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 13 --- arch/s390/include/asm/ipl.h | 3 +- arch/s390/include/asm/sections.h | 2 +- arch/s390/include/asm/setup.h | 3 - arch/s390/kernel/early.c | 141 ------------------------------- arch/s390/kernel/ipl.c | 36 -------- arch/s390/kernel/machine_kexec.c | 4 - arch/s390/kernel/suspend.c | 8 +- arch/s390/kernel/vmlinux.lds.S | 5 -- arch/s390/mm/vmem.c | 4 +- 10 files changed, 9 insertions(+), 210 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 080a3fcc3cb8..9fe2fd57bba9 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -824,19 +824,6 @@ config PFAULT Everybody who wants to run Linux under VM != VM4.2 should select this option. -config SHARED_KERNEL - bool "VM shared kernel support" - depends on !JUMP_LABEL - depends on !ALTERNATIVES - help - Select this option, if you want to share the text segment of the - Linux kernel between different VM guests. This reduces memory - usage with lots of guests but greatly increases kernel size. - Also if a kernel was IPL'ed from a shared segment the kexec system - call will not work.
- You should only select this option if you know what you are - doing and want to exploit this feature. - config CMM def_tristate n prompt "Cooperative memory management" diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 6810bd757312..c40cb348dd79 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -12,6 +12,8 @@ #include #include +#define NSS_NAME_SIZE 8 + #define IPL_PARMBLOCK_ORIGIN 0x2000 #define IPL_PARM_BLK_FCP_LEN (sizeof(struct ipl_list_hdr) + \ @@ -105,7 +107,6 @@ extern size_t append_ipl_scpdata(char *, size_t); enum { IPL_DEVNO_VALID = 1, IPL_PARMBLOCK_VALID = 2, - IPL_NSS_VALID = 4, }; enum ipl_type { diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index fbd9116eb17b..cd68bf115889 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -3,6 +3,6 @@ #include -extern char _eshared[], _ehead[]; +extern char _ehead[]; #endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 490e035b3716..fb3c4a138ae3 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -97,9 +97,6 @@ extern char vmpoff_cmd[]; #define SET_CONSOLE_VT220 do { console_mode = 4; } while (0) #define SET_CONSOLE_HVC do { console_mode = 5; } while (0) -#define NSS_NAME_SIZE 8 -extern char kernel_nss_name[]; - #ifdef CONFIG_PFAULT extern int pfault_init(void); extern void pfault_fini(void); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 60181caf8e8a..389e9f61c76f 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -30,14 +30,6 @@ #include #include "entry.h" -/* - * Create a Kernel NSS if the SAVESYS= parameter is defined - */ -#define DEFSYS_CMD_SIZE 128 -#define SAVESYS_CMD_SIZE 32 - -char kernel_nss_name[NSS_NAME_SIZE + 1]; - static void __init setup_boot_command_line(void); /* @@ -58,134 +50,6 @@ static void __init reset_tod_clock(void) S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; } -#ifdef CONFIG_SHARED_KERNEL -int __init savesys_ipl_nss(char *cmd, const int cmdlen); - -asm( - " .section .init.text,\"ax\",@progbits\n" - " .align 4\n" - " .type savesys_ipl_nss, @function\n" - "savesys_ipl_nss:\n" - " stmg 6,15,48(15)\n" - " lgr 14,3\n" - " sam31\n" - " diag 2,14,0x8\n" - " sam64\n" - " lgr 2,14\n" - " lmg 6,15,48(15)\n" - " br 14\n" - " .size savesys_ipl_nss, .-savesys_ipl_nss\n" - " .previous\n"); - -static __initdata char upper_command_line[COMMAND_LINE_SIZE]; - -static noinline __init void create_kernel_nss(void) -{ - unsigned int i, stext_pfn, eshared_pfn, end_pfn, min_size; -#ifdef CONFIG_BLK_DEV_INITRD - unsigned int sinitrd_pfn, einitrd_pfn; -#endif - int response; - int hlen; - size_t len; - char *savesys_ptr; - char defsys_cmd[DEFSYS_CMD_SIZE]; - char savesys_cmd[SAVESYS_CMD_SIZE]; - - /* Do nothing if we are not running under VM */ - if (!MACHINE_IS_VM) - return; - - /* Convert COMMAND_LINE to upper case */ - for (i = 0; i < strlen(boot_command_line); i++) - upper_command_line[i] = toupper(boot_command_line[i]); - - savesys_ptr = strstr(upper_command_line, "SAVESYS="); - - if (!savesys_ptr) - return; - - savesys_ptr += 8; /* Point to the beginning of the NSS name */ - for (i = 0; i < NSS_NAME_SIZE; i++) { - if (savesys_ptr[i] == ' ' || savesys_ptr[i] == '\0') - break; - kernel_nss_name[i] = savesys_ptr[i]; - } - - stext_pfn = PFN_DOWN(__pa(&_stext)); - eshared_pfn = PFN_DOWN(__pa(&_eshared)); - end_pfn = PFN_UP(__pa(&_end)); - min_size = end_pfn << 2; - - hlen = snprintf(defsys_cmd, 
DEFSYS_CMD_SIZE, - "DEFSYS %s 00000-%.5X EW %.5X-%.5X SR %.5X-%.5X", - kernel_nss_name, stext_pfn - 1, stext_pfn, - eshared_pfn - 1, eshared_pfn, end_pfn); - -#ifdef CONFIG_BLK_DEV_INITRD - if (INITRD_START && INITRD_SIZE) { - sinitrd_pfn = PFN_DOWN(__pa(INITRD_START)); - einitrd_pfn = PFN_UP(__pa(INITRD_START + INITRD_SIZE)); - min_size = einitrd_pfn << 2; - hlen += snprintf(defsys_cmd + hlen, DEFSYS_CMD_SIZE - hlen, - " EW %.5X-%.5X", sinitrd_pfn, einitrd_pfn); - } -#endif - - snprintf(defsys_cmd + hlen, DEFSYS_CMD_SIZE - hlen, - " EW MINSIZE=%.7iK PARMREGS=0-13", min_size); - defsys_cmd[DEFSYS_CMD_SIZE - 1] = '\0'; - snprintf(savesys_cmd, SAVESYS_CMD_SIZE, "SAVESYS %s \n IPL %s", - kernel_nss_name, kernel_nss_name); - savesys_cmd[SAVESYS_CMD_SIZE - 1] = '\0'; - - __cpcmd(defsys_cmd, NULL, 0, &response); - - if (response != 0) { - pr_err("Defining the Linux kernel NSS failed with rc=%d\n", - response); - kernel_nss_name[0] = '\0'; - return; - } - - len = strlen(savesys_cmd); - ASCEBC(savesys_cmd, len); - response = savesys_ipl_nss(savesys_cmd, len); - - /* On success: response is equal to the command size, - * max SAVESYS_CMD_SIZE - * On error: response contains the numeric portion of cp error message. - * for SAVESYS it will be >= 263 - * for missing privilege class, it will be 1 - */ - if (response > SAVESYS_CMD_SIZE || response == 1) { - pr_err("Saving the Linux kernel NSS failed with rc=%d\n", - response); - kernel_nss_name[0] = '\0'; - return; - } - - /* re-initialize cputime accounting. */ - get_tod_clock_ext(tod_clock_base); - S390_lowcore.last_update_clock = *(__u64 *) &tod_clock_base[1]; - S390_lowcore.last_update_timer = 0x7fffffffffffffffULL; - S390_lowcore.user_timer = 0; - S390_lowcore.system_timer = 0; - asm volatile("SPT 0(%0)" : : "a" (&S390_lowcore.last_update_timer)); - - /* re-setup boot command line with new ipl vm parms */ - ipl_update_parameters(); - setup_boot_command_line(); - - ipl_flags = IPL_NSS_VALID; -} - -#else /* CONFIG_SHARED_KERNEL */ - -static inline void create_kernel_nss(void) { } - -#endif /* CONFIG_SHARED_KERNEL */ - /* * Clear bss memory */ @@ -548,10 +412,6 @@ static void __init setup_boot_command_line(void) append_to_cmdline(append_ipl_scpdata); } -/* - * Save ipl parameters, clear bss memory, initialize storage keys - * and create a kernel NSS at startup if the SAVESYS= parm is defined - */ void __init startup_init(void) { reset_tod_clock(); @@ -568,7 +428,6 @@ void __init startup_init(void) setup_arch_string(); ipl_update_parameters(); setup_boot_command_line(); - create_kernel_nss(); detect_diag9c(); detect_diag44(); detect_machine_facilities(); diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 8e622bb52f7a..310e59e6eb4b 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -279,8 +279,6 @@ static __init enum ipl_type get_ipl_type(void) { struct ipl_parameter_block *ipl = IPL_PARMBLOCK_START; - if (ipl_flags & IPL_NSS_VALID) - return IPL_TYPE_NSS; if (!(ipl_flags & IPL_DEVNO_VALID)) return IPL_TYPE_UNKNOWN; if (!(ipl_flags & IPL_PARMBLOCK_VALID)) @@ -533,22 +531,6 @@ static struct attribute_group ipl_ccw_attr_group_lpar = { .attrs = ipl_ccw_attrs_lpar }; -/* NSS ipl device attributes */ - -DEFINE_IPL_ATTR_RO(ipl_nss, name, "%s\n", kernel_nss_name); - -static struct attribute *ipl_nss_attrs[] = { - &sys_ipl_type_attr.attr, - &sys_ipl_nss_name_attr.attr, - &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_vm_parm_attr.attr, - NULL, -}; - -static struct attribute_group ipl_nss_attr_group = { - .attrs = ipl_nss_attrs, -}; - 
/* UNKNOWN ipl device attributes */ static struct attribute *ipl_unknown_attrs[] = { @@ -598,9 +580,6 @@ static int __init ipl_init(void) case IPL_TYPE_FCP_DUMP: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); break; - case IPL_TYPE_NSS: - rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nss_attr_group); - break; default: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_unknown_attr_group); @@ -1172,18 +1151,6 @@ static int __init reipl_nss_init(void) return rc; reipl_block_ccw_init(reipl_block_nss); - if (ipl_info.type == IPL_TYPE_NSS) { - memset(reipl_block_nss->ipl_info.ccw.nss_name, - ' ', NSS_NAME_SIZE); - memcpy(reipl_block_nss->ipl_info.ccw.nss_name, - kernel_nss_name, strlen(kernel_nss_name)); - ASCEBC(reipl_block_nss->ipl_info.ccw.nss_name, NSS_NAME_SIZE); - reipl_block_nss->ipl_info.ccw.vm_flags |= - DIAG308_VM_FLAGS_NSS_VALID; - - reipl_block_ccw_fill_parms(reipl_block_nss); - } - reipl_capabilities |= IPL_TYPE_NSS; return 0; } @@ -1971,9 +1938,6 @@ void __init setup_ipl(void) ipl_info.data.fcp.lun = IPL_PARMBLOCK_START->ipl_info.fcp.lun; break; case IPL_TYPE_NSS: - strncpy(ipl_info.data.nss.name, kernel_nss_name, - sizeof(ipl_info.data.nss.name)); - break; case IPL_TYPE_UNKNOWN: /* We have no info to copy */ break; diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index e94421678b13..c72e551e5951 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -219,10 +219,6 @@ int machine_kexec_prepare(struct kimage *image) { void *reboot_code_buffer; - /* Can't replace kernel image since it is read-only. */ - if (ipl_flags & IPL_NSS_VALID) - return -EOPNOTSUPP; - if (image->type == KEXEC_TYPE_CRASH) return machine_kexec_prepare_kdump(); diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c index c8ea715bfe10..af0553111be8 100644 --- a/arch/s390/kernel/suspend.c +++ b/arch/s390/kernel/suspend.c @@ -152,7 +152,7 @@ int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin)); unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end)); - unsigned long eshared_pfn = PFN_DOWN(__pa(&_eshared)) - 1; + unsigned long end_rodata_pfn = PFN_DOWN(__pa(&__end_rodata)) - 1; unsigned long stext_pfn = PFN_DOWN(__pa(&_stext)); /* Always save lowcore pages (LC protection might be enabled). */ @@ -160,9 +160,9 @@ int pfn_is_nosave(unsigned long pfn) return 0; if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn) return 1; - /* Skip memory holes and read-only pages (NSS, DCSS, ...). */ - if (pfn >= stext_pfn && pfn <= eshared_pfn) - return ipl_info.type == IPL_TYPE_NSS ? 1 : 0; + /* Skip memory holes and read-only pages (DCSS, ...). */ + if (pfn >= stext_pfn && pfn <= end_rodata_pfn) + return 0; if (tprot(PFN_PHYS(pfn))) return 1; return 0; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 7c9fcf7cb43d..a609d65afc56 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -59,12 +59,7 @@ SECTIONS RO_DATA_SECTION(PAGE_SIZE) -#ifdef CONFIG_SHARED_KERNEL - . = ALIGN(0x100000); /* VM shared segments are 1MB aligned */ -#endif - . = ALIGN(PAGE_SIZE); - _eshared = .; /* End of shareable data */ _sdata = .; /* Start of data section */ . 
= ALIGN(PAGE_SIZE); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 2e84977796a2..f890f2ad951b 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -406,13 +406,13 @@ void __init vmem_map_init(void) (_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); __set_memory((unsigned long) _etext, - (_eshared - _etext) >> PAGE_SHIFT, + (__end_rodata - _etext) >> PAGE_SHIFT, SET_MEMORY_RO); __set_memory((unsigned long) _sinittext, (_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); pr_info("Write protected kernel read-only data: %luk\n", - (_eshared - _stext) >> 10); + (__end_rodata - _stext) >> 10); } /* From 11752adb68a388724b1935d57bf543897c34d80b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 7 Nov 2017 16:18:06 -0500 Subject: [PATCH 0997/1085] locking/pvqspinlock: Implement hybrid PV queued/unfair locks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, all the lock waiters entering the slowpath will do one lock stealing attempt to acquire the lock. That helps performance, especially in VMs with over-committed vCPUs. However, the current pvqspinlocks still don't perform as well as unfair locks in many cases. On the other hand, unfair locks do have the problem of lock starvation that pvqspinlocks don't have. This patch combines the best attributes of an unfair lock and a pvqspinlock into a hybrid lock with two modes - queued mode and unfair mode. A lock waiter goes into the unfair mode when there are waiters in the wait queue but the pending bit isn't set. Otherwise, it will go into the queued mode waiting in the queue for its turn. On a 2-socket 36-core E5-2699 v3 system (HT off), a kernel build (make -j <n>) was done in a VM with unpinned vCPUs 3 times with the best time selected, where <n> is the number of vCPUs available. The build times of the original pvqspinlock, hybrid pvqspinlock and unfair lock with various numbers of vCPUs are as follows:

 vCPUs    pvqlock    hybrid pvqlock    unfair lock
 -----    -------    --------------    -----------
   30     342.1s         329.1s          329.1s
   36     314.1s         305.3s          307.3s
   45     345.0s         302.1s          306.6s
   54     365.4s         308.6s          307.8s
   72     358.9s         293.6s          303.9s
  108     343.0s         285.9s          304.2s

The hybrid pvqspinlock performs better than or comparably to the unfair lock. With QUEUED_LOCK_STAT turned on, the table below shows the number of lock acquisitions in unfair mode and queued mode after a kernel build with various numbers of vCPUs:

 vCPUs    queued mode    unfair mode
 -----    -----------    -----------
   30      9,130,518        294,954
   36     10,856,614        386,809
   45      8,467,264     11,475,373
   54      6,409,987     19,670,855
   72      4,782,063     25,712,180

It can be seen that as the VM becomes more and more over-committed, the ratio of locks acquired in unfair mode increases. This is all done automatically to get the best overall performance possible. Using a kernel locking microbenchmark with the number of locking threads equal to the number of vCPUs available on the same machine, the minimum, average and maximum (min/avg/max) numbers of locking operations done per thread in a 5-second testing interval are shown below:

 vCPUs       hybrid pvqlock               unfair lock
 -----    -----------------------    ------------------------
   36     822,135/881,063/950,363    75,570/313,496/  690,465
   54     542,435/581,664/625,937    35,460/204,280/  457,172
   72     397,500/428,177/499,299    17,933/150,679/  708,001
  108     257,898/288,150/340,871     3,085/181,176/1,257,109

It can be seen that the hybrid pvqspinlocks are both fairer and more performant than the unfair locks in this test.
The table below shows the kernel build times on a smaller 2-socket 16-core 32-thread E5-2620 v4 system:

 vCPUs    pvqlock    hybrid pvqlock    unfair lock
 -----    -------    --------------    -----------
   16     436.8s         433.4s          435.6s
   36     366.2s         364.8s          364.5s
   48     423.6s         376.3s          370.2s
   64     433.1s         376.6s          376.8s

Again, the performance of the hybrid pvqspinlock was comparable to that of the unfair lock. Signed-off-by: Waiman Long Reviewed-by: Juergen Gross Reviewed-by: Eduardo Valentin Acked-by: Peter Zijlstra Cc: Boris Ostrovsky Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1510089486-3466-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 47 +++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 15b6a39366c6..6ee477765e6c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -61,21 +61,50 @@ struct pv_node { #include "qspinlock_stat.h" /* + * Hybrid PV queued/unfair lock + * * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enters the PV slowpath before - * being queued. By allowing one lock stealing attempt here when the pending - * bit is off, it helps to reduce the performance impact of lock waiter - * preemption without the drawback of lock starvation. + * being queued. + * + * The pending bit is set by the queue head vCPU of the MCS wait queue in + * pv_wait_head_or_lock() to signal that it is ready to spin on the lock. + * When that bit becomes visible to the incoming waiters, no lock stealing + * is allowed. The function will return immediately to make the waiters + * enter the MCS wait queue. So lock starvation shouldn't happen as long + * as the queued mode vCPUs are actively running to set the pending bit + * and hence disabling lock stealing. + * + * When the pending bit isn't set, the lock waiters will stay in the unfair + * mode spinning on the lock unless the MCS wait queue is empty. In this + * case, the lock waiters will enter the queued mode slowpath trying to + * become the queue head and set the pending bit. + * + * This hybrid PV queued/unfair lock combines the best attributes of a + * queued lock (no lock starvation) and an unfair lock (good performance + * on not heavily contended locks). */ -#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) -static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l) +static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; - if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); - return true; + /* + * Stay in unfair lock mode as long as queued mode waiters are + * present in the MCS wait queue but the pending bit isn't set.
+ */ + for (;;) { + int val = atomic_read(&lock->val); + + if (!(val & _Q_LOCKED_PENDING_MASK) && + (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + qstat_inc(qstat_pv_lock_stealing, true); + return true; + } + if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) + break; + + cpu_relax(); } return false; From f54bb2ec02c839f6bfe3e8d438cd93d30b4809dd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:17 +0100 Subject: [PATCH 0998/1085] locking/lockdep: Add IRQs disabled/enabled assertion APIs: lockdep_assert_irqs_enabled()/disabled() Checking whether IRQs are enabled or disabled is a very common sanity check, but it is not free of overhead, especially on fast paths where such assertions are frequent. Lockdep is a good host for such a concurrency correctness check, and it already tracks the IRQ disablement state. Just reuse its machinery. This will allow us to remove the flags-pop-and-check overhead from fast paths when the kernel is built for production. Suggested-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 02720769c159..a842551fe044 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -725,9 +725,24 @@ do { \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ } while (0) + +#define lockdep_assert_irqs_enabled() do { \ + WARN_ONCE(debug_locks && !current->lockdep_recursion && \ + !current->hardirqs_enabled, \ + "IRQs not enabled as expected\n"); \ + } while (0) + +#define lockdep_assert_irqs_disabled() do { \ + WARN_ONCE(debug_locks && !current->lockdep_recursion && \ + current->hardirqs_enabled, \ + "IRQs not disabled as expected\n"); \ + } while (0) + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) #endif #ifdef CONFIG_LOCKDEP From f71b74bca637fca7de78f1d44eaefde5bc900f9f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:18 +0100 Subject: [PATCH 0999/1085] irq/softirqs: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-3-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/softirq.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 4e09821f9d9e..662f7b1b7a78 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -137,7 +137,7 @@ EXPORT_SYMBOL(__local_bh_disable_ip); static void __local_bh_enable(unsigned int cnt) { - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); @@ -158,7 +158,8 @@ EXPORT_SYMBOL(_local_bh_enable); void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { - WARN_ON_ONCE(in_irq() || irqs_disabled()); + WARN_ON_ONCE(in_irq()); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_disable(); #endif @@ -396,9 +397,8 @@ void irq_exit(void) #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); #else - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); #endif - account_irq_exit_time(current); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) @@ -488,7 +488,7 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) { - BUG_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); t->next = __this_cpu_read(tasklet_hi_vec.head); __this_cpu_write(tasklet_hi_vec.head, t); From 8e8eb730759f9cfd7a761b0b4ee41d714e720993 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:19 +0100 Subject: [PATCH 1000/1085] workqueue: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Tejun Heo Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1509980490-4285-4-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1070b21ba4aa..13f67b5a0a0c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1376,7 +1376,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); debug_work_activate(work); From ebf3adbad012b89c4a51a3beae718a587d988a3a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:20 +0100 Subject: [PATCH 1001/1085] timers/nohz: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-5-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c7a899c5ce64..dd4b7b492c9b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -198,7 +198,7 @@ static bool check_tick_dependency(atomic_t *dep) static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; @@ -960,8 +960,7 @@ void tick_nohz_idle_enter(void) { struct tick_sched *ts; - WARN_ON_ONCE(irqs_disabled()); - + lockdep_assert_irqs_enabled(); /* * Update the idle state in the scheduler domain hierarchy * when tick_nohz_stop_sched_tick() is called from the idle loop. From 53bef3fd47f69e40b52c9f9acd3551dfff9f8702 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:21 +0100 Subject: [PATCH 1002/1085] timers/hrtimer: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-6-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88f75f92ef36..d32520840fde 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -758,9 +758,7 @@ void clock_was_set(void) */ void hrtimers_resume(void) { - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - + lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); /* And schedule a retrigger for all others */ From 83efcbd028ad3aec36b5a3882cfa32490c135df7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:22 +0100 Subject: [PATCH 1003/1085] smp/core: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-7-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index c94dd85c8d41..084c8b3a2681 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -213,7 +213,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) call_single_data_t *csd, *csd_next; static bool warned; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); From 7a10e2a9190628a4024ea394ce7bd641ae40ffd1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:23 +0100 Subject: [PATCH 1004/1085] x86: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. It also no longer makes sense to fix up the IRQ flags when a bug is detected, as the assertion is now pure config-dependent debugging. And to quote Peter Zijlstra: The whole if !disabled, disable logic is uber paranoid programming, but I don't think we've ever seen that WARN trigger, and if it does (and then burns the kernel) we at least know what happened. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-8-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 4 +--- arch/x86/kernel/smpboot.c | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index eaa0ba66cf96..d7d3cc24baf4 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -186,9 +186,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) addr_limit_user_check(); - if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) - local_irq_disable(); - + lockdep_assert_irqs_disabled(); lockdep_sys_exit(); cached_flags = READ_ONCE(ti->flags); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 198416ddff01..4008b6b9ad72 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1095,7 +1095,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) unsigned long flags; int err, ret = 0; - WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); From 164446455a5d3f1402b5a0ea42acce33fd576ed7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:24 +0100 Subject: [PATCH 1005/1085] perf/core: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-9-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index b315aebbcc3f..c298847d4b85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -209,7 +209,7 @@ static int event_function(void *info) struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* @@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) @@ -1006,7 +1006,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) struct perf_cpu_context *cpuctx; int rotations = 0; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); rotations = perf_rotate_context(cpuctx); @@ -1093,7 +1093,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx) { struct list_head *head = this_cpu_ptr(&active_ctx_list); - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); WARN_ON(!list_empty(&ctx->active_ctx_list)); @@ -1102,7 +1102,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx) static void perf_event_ctx_deactivate(struct perf_event_context *ctx) { - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); WARN_ON(list_empty(&ctx->active_ctx_list)); @@ -3523,7 +3523,7 @@ void perf_event_task_tick(void) struct perf_event_context *ctx, *tmp; int throttled; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); From a934d4d15f040cdd254350e7270b35cc7521e12e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:25 +0100 Subject: [PATCH 1006/1085] irq/timings: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-10-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq/timings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index c8c1d073fbf1..e0923fa4927a 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -264,7 +264,7 @@ u64 irq_timings_next_event(u64 now) * order to prevent the timings circular buffer to be updated * while we are reading it. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); /* * Number of elements in the circular buffer: If it happens it From 3c7169a3bf8216a56761a8edf775072dd36a00a0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:26 +0100 Subject: [PATCH 1007/1085] irq_work: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. 
This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-11-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index bcf107ce0854..899579657a0a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -188,7 +188,7 @@ void irq_work_tick(void) */ void irq_work_sync(struct irq_work *work) { - WARN_ON_ONCE(irqs_disabled()); + lockdep_assert_irqs_enabled(); while (work->flags & IRQ_WORK_BUSY) cpu_relax(); From 2c11dba00a39007b457c7607c1b1a4db95ca04bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:27 +0100 Subject: [PATCH 1008/1085] sched/clock, sched/cputime: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-12-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 2 +- kernel/sched/cputime.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index ca0f8fc945c6..e086babe6c61 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -388,7 +388,7 @@ void sched_clock_tick(void) if (unlikely(!sched_clock_running)) return; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); scd = this_scd(); __scd_stamp(scd); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..9be8b68a66da 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max) { u64 accounted; - /* Shall be converted to a lockdep-enabled lightweight check */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); accounted = steal_account_process_time(max); From a69682200db9c2c26594188f81dd2df560af4683 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:28 +0100 Subject: [PATCH 1009/1085] timers/posix-cpu-timers: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-13-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/time/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5b117110b55b..1f27887aa194 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -603,7 +603,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, /* * Disarm any old timer after extracting its expiry time. 
*/ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); ret = 0; old_incr = timer->it.cpu.incr; @@ -1034,7 +1034,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); arm_timer(timer); unlock: unlock_task_sighand(p, &flags); @@ -1125,7 +1125,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) struct k_itimer *timer, *next; unsigned long flags; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); /* * The fast path checks that there are no expired thread or thread From af0733937317e1e03b60f3af8cf9cd59d665593c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:29 +0100 Subject: [PATCH 1010/1085] netpoll: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S. Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-14-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 912731bed7b7..57557a6a950c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -334,7 +334,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { From b04db8e19fc2e9131524dec43057c1b96d5ba3ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:30 +0100 Subject: [PATCH 1011/1085] rcu: Use lockdep to assert IRQs are disabled/enabled Lockdep now has an integrated IRQs disabled/enabled sanity check. Just use it instead of the ad-hoc RCU version. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-15-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/rcu/tree.c | 16 ++++++++-------- kernel/rcu/tree_plugin.h | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3e3650e94ae6..08fa586d7f8c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -734,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); return READ_ONCE(*fp); } @@ -746,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. 
*/ if (rcu_future_needs_gp(rsp)) @@ -773,7 +773,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -840,7 +840,7 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rcu_eqs_enter(false); } @@ -855,7 +855,7 @@ void rcu_idle_enter(void) */ void rcu_user_enter(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ @@ -880,7 +880,7 @@ void rcu_irq_exit(void) { struct rcu_dynticks *rdtp; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ @@ -947,7 +947,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -1018,7 +1018,7 @@ void rcu_irq_enter(void) struct rcu_dynticks *rdtp; long long oldval; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..df08e5c8126b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -325,7 +325,7 @@ static void rcu_preempt_note_context_switch(bool preempt) struct rcu_data *rdp; struct rcu_node *rnp; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); + lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -1421,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); /* Snapshot to detect later posting of non-lazy callback. 
*/ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1470,7 +1470,7 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_is_nocb_cpu(smp_processor_id())) return; @@ -1525,7 +1525,7 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) @@ -2012,7 +2012,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (!rcu_is_nocb_cpu(smp_processor_id())) return false; /* Not NOCBs CPU, caller must migrate CBs. */ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), From 70e57c0f4b502f2435b7649a201861fe212c2e4e Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:46 -0800 Subject: [PATCH 1012/1085] x86/insn-eval: Compute linear address in several utility functions Computing a linear address involves several steps. The first step is to compute the effective address. This requires determining the addressing mode in use and performing arithmetic operations on the operands. Plus, each addressing mode has special cases that must be handled. Once the effective address is known, the base address of the applicable segment is added to obtain the linear address. Clearly, this is too much work for a single function. Instead, handle each addressing mode in a separate utility function. This improves readability and gives us the opportunity to handle errors better. At the moment, arithmetic to compute the effective address uses 64-bit variables. Thus, limit support to 64-bit addresses. While reworking the function insn_get_addr_ref(), the variable addr_offset is renamed to regoff to reflect its actual use (i.e., offset, from the base of pt_regs, of the register used as operand). Suggested-by: Borislav Petkov Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Adam Buchbinder Cc: Adrian Hunter Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Colin Ian King Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Ravi V.
Shankar Cc: Shuah Khan Cc: Thomas Garnier Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-2-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/insn-eval.c | 243 +++++++++++++++++++++++++++++---------- 1 file changed, 185 insertions(+), 58 deletions(-) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 91f08aafb65e..a4427b4e293c 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -776,6 +776,182 @@ static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs, return 0; } +/** + * get_eff_addr_reg() - Obtain effective address from register operand + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, with the effective address + * @eff_addr: Obtained effective address + * + * Obtain the effective address stored in the register operand as indicated by + * the ModRM byte. This function is to be used only with register addressing + * (i.e., ModRM.mod is 3). The effective address is saved in @eff_addr. The + * register operand, as an offset from the base of pt_regs, is saved in @regoff; + * such offset can then be used to resolve the segment associated with the + * operand. This function can be used with any of the supported address sizes + * in x86. + * + * Returns: + * + * 0 on success. @eff_addr will have the effective address stored in the + * operand indicated by ModRM. @regoff will have such operand as an offset from + * the base of pt_regs. + * + * -EINVAL on error. + */ +static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs, + int *regoff, long *eff_addr) +{ + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) != 3) + return -EINVAL; + + *regoff = get_reg_offset(insn, regs, REG_TYPE_RM); + if (*regoff < 0) + return -EINVAL; + + *eff_addr = regs_get_register(regs, *regoff); + + return 0; +} + +/** + * get_eff_addr_modrm() - Obtain referenced effective address via ModRM + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the effective address referenced by the ModRM byte of @insn. After + * identifying the registers involved in the register-indirect memory reference, + * its value is obtained from the operands in @regs. The computed address is + * stored @eff_addr. Also, the register operand that indicates the associated + * segment is stored in @regoff, this parameter can later be used to determine + * such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. @regoff + * will have a register, as an offset from the base of pt_regs, that can be used + * to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs, + int *regoff, long *eff_addr) +{ + long tmp; + + if (insn->addr_bytes != 8) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + *regoff = get_reg_offset(insn, regs, REG_TYPE_RM); + + /* + * -EDOM means that we must ignore the address_offset. In such a case, + * in 64-bit mode the effective address relative to the rIP of the + * following instruction. 
+ */ + if (*regoff == -EDOM) { + if (user_64bit_mode(regs)) + tmp = regs->ip + insn->length; + else + tmp = 0; + } else if (*regoff < 0) { + return -EINVAL; + } else { + tmp = regs_get_register(regs, *regoff); + } + + *eff_addr = tmp + insn->displacement.value; + + return 0; +} + +/** + * get_eff_addr_sib() - Obtain referenced effective address via SIB + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the effective address referenced by the SIB byte of @insn. After + * identifying the registers involved in the indexed, register-indirect memory + * reference, its value is obtained from the operands in @regs. The computed + * address is stored @eff_addr. Also, the register operand that indicates the + * associated segment is stored in @regoff, this parameter can later be used to + * determine such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. + * @base_offset will have a register, as an offset from the base of pt_regs, + * that can be used to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs, + int *base_offset, long *eff_addr) +{ + long base, indx; + int indx_offset; + + if (insn->addr_bytes != 8) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + insn_get_sib(insn); + + if (!insn->sib.nbytes) + return -EINVAL; + + *base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); + indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); + + /* + * Negative values in the base and index offset means an error when + * decoding the SIB byte. Except -EDOM, which means that the registers + * should not be used in the address computation. + */ + if (*base_offset == -EDOM) + base = 0; + else if (*base_offset < 0) + return -EINVAL; + else + base = regs_get_register(regs, *base_offset); + + if (indx_offset == -EDOM) + indx = 0; + else if (indx_offset < 0) + return -EINVAL; + else + indx = regs_get_register(regs, indx_offset); + + *eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value)); + + *eff_addr += insn->displacement.value; + + return 0; +} /* * return the address being referenced be instruction * for rm=3 returning the content of the rm reg * for rm!=3 calculates the address using SIB and Disp */ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) { - int addr_offset, base_offset, indx_offset, ret; unsigned long linear_addr = -1L, seg_base; - long eff_addr, base, indx; - insn_byte_t sib; - - insn_get_modrm(insn); - insn_get_sib(insn); - sib = insn->sib.value; + int regoff, ret; + long eff_addr; if (X86_MODRM_MOD(insn->modrm.value) == 3) { - addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - if (addr_offset < 0) + ret = get_eff_addr_reg(insn, regs, &regoff, &eff_addr); + if (ret) goto out; - eff_addr = regs_get_register(regs, addr_offset); - } else { if (insn->sib.nbytes) { - /* - * Negative values in the base and index offset means - * an error when decoding the SIB byte. Except -EDOM, - * which means that the registers should not be used - * in the address computation.
- */ - base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); - if (base_offset == -EDOM) - base = 0; - else if (base_offset < 0) + ret = get_eff_addr_sib(insn, regs, &regoff, &eff_addr); + if (ret) goto out; - else - base = regs_get_register(regs, base_offset); - - indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); - - if (indx_offset == -EDOM) - indx = 0; - else if (indx_offset < 0) - goto out; - else - indx = regs_get_register(regs, indx_offset); - - eff_addr = base + indx * (1 << X86_SIB_SCALE(sib)); - - /* - * The base determines the segment used to compute - * the linear address. - */ - addr_offset = base_offset; - } else { - addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - /* - * -EDOM means that we must ignore the address_offset. - * In such a case, in 64-bit mode the effective address - * relative to the RIP of the following instruction. - */ - if (addr_offset == -EDOM) { - if (user_64bit_mode(regs)) - eff_addr = (long)regs->ip + insn->length; - else - eff_addr = 0; - } else if (addr_offset < 0) { + ret = get_eff_addr_modrm(insn, regs, &regoff, &eff_addr); + if (ret) goto out; - } else { - eff_addr = regs_get_register(regs, addr_offset); - } } - eff_addr += insn->displacement.value; } - ret = get_seg_base_limit(insn, regs, addr_offset, &seg_base, NULL); + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, NULL); if (ret) goto out; From 7a6daf79123a086f03b8cdfbc953958c8e1c1287 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:47 -0800 Subject: [PATCH 1013/1085] x86/insn-eval: Add support to resolve 32-bit address encodings 32-bit and 64-bit address encodings are identical. Thus, the same logic could be used to resolve the effective address. However, there are two key differences: address size and enforcement of segment limits. If running a 32-bit process on a 64-bit kernel, it is best to perform the address calculation using 32-bit data types. In this manner hardware is used for the arithmetic, including handling of signs and overflows. 32-bit addresses are generally used in protected mode; segment limits are enforced in this mode. This implementation obtains the limit of the segment associated with the instruction operands and prefixes. If the computed address is outside the segment limits, an error is returned. It is also possible to use 32-bit addresses in long mode and virtual-8086 mode by using an address override prefix. In such cases, segment limits are not enforced. Support to use 32-bit arithmetic is added to the utility functions that compute effective addresses. However, the end result is stored in a variable of type long (which has a width of 8 bytes in 64-bit builds). Hence, once a 32-bit effective address is computed, the 4 most significant bytes are masked out to avoid sign extension. The newly added function get_addr_ref_32() is almost identical to the existing function insn_get_addr_ref() (used for 64-bit addresses). The only difference is that it verifies that the effective address is within the limits of the segment. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Adam Buchbinder Cc: Adrian Hunter Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Colin Ian King Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Michael S.
Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Thomas Garnier Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-3-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/insn-eval.c | 112 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index a4427b4e293c..e6cb68a4f7a4 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -814,7 +814,11 @@ static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs, if (*regoff < 0) return -EINVAL; - *eff_addr = regs_get_register(regs, *regoff); + /* Ignore bytes that are outside the address size. */ + if (insn->addr_bytes == 4) + *eff_addr = regs_get_register(regs, *regoff) & 0xffffffff; + else /* 64-bit address */ + *eff_addr = regs_get_register(regs, *regoff); return 0; } @@ -846,7 +850,7 @@ static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs, { long tmp; - if (insn->addr_bytes != 8) + if (insn->addr_bytes != 8 && insn->addr_bytes != 4) return -EINVAL; insn_get_modrm(insn); @@ -875,7 +879,13 @@ static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs, tmp = regs_get_register(regs, *regoff); } - *eff_addr = tmp + insn->displacement.value; + if (insn->addr_bytes == 4) { + int addr32 = (int)(tmp & 0xffffffff) + insn->displacement.value; + + *eff_addr = addr32 & 0xffffffff; + } else { + *eff_addr = tmp + insn->displacement.value; + } return 0; } @@ -908,7 +918,7 @@ static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs, long base, indx; int indx_offset; - if (insn->addr_bytes != 8) + if (insn->addr_bytes != 8 && insn->addr_bytes != 4) return -EINVAL; insn_get_modrm(insn); @@ -946,12 +956,102 @@ static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs, else indx = regs_get_register(regs, indx_offset); - *eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value)); + if (insn->addr_bytes == 4) { + int addr32, base32, idx32; - *eff_addr += insn->displacement.value; + base32 = base & 0xffffffff; + idx32 = indx & 0xffffffff; + + addr32 = base32 + idx32 * (1 << X86_SIB_SCALE(insn->sib.value)); + addr32 += insn->displacement.value; + + *eff_addr = addr32 & 0xffffffff; + } else { + *eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value)); + *eff_addr += insn->displacement.value; + } return 0; } + +/** + * get_addr_ref_32() - Obtain a 32-bit linear address + * @insn: Instruction with ModRM, SIB bytes and displacement + * @regs: Register values as seen when entering kernel mode + * + * This function is to be used with 32-bit address encodings to obtain the + * linear memory address referred by the instruction's ModRM, SIB, + * displacement bytes and segment base address, as applicable. If in protected + * mode, segment limits are enforced. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. 
+ */ +static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs) +{ + unsigned long linear_addr = -1L, seg_base, seg_limit; + int eff_addr, regoff; + long tmp; + int ret; + + if (insn->addr_bytes != 4) + goto out; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + ret = get_eff_addr_reg(insn, regs, &regoff, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + + } else { + if (insn->sib.nbytes) { + ret = get_eff_addr_sib(insn, regs, &regoff, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } else { + ret = get_eff_addr_modrm(insn, regs, &regoff, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } + } + + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit); + if (ret) + goto out; + + /* + * In protected mode, before computing the linear address, make sure + * the effective address is within the limits of the segment. + * 32-bit addresses can be used in long and virtual-8086 modes if an + * address override prefix is used. In such cases, segment limits are + * not enforced. When in virtual-8086 mode, the segment limit is -1L + * to reflect this situation. + * + * After computed, the effective address is treated as an unsigned + * quantity. + */ + if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit)) + goto out; + + /* + * Data type long could be 64 bits in size. Ensure that our 32-bit + * effective address is not sign-extended when computing the linear + * address. + */ + linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base; + +out: + return (void __user *)linear_addr; +} + /* * return the address being referenced be instruction * for rm=3 returning the content of the rm reg * for rm!=3 calculates the address using SIB and Disp From cd9b594a9ef122a41bc961c330a55d87e226822f Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:48 -0800 Subject: [PATCH 1014/1085] x86/insn-eval: Add wrapper function for 32 and 64-bit addresses The function insn_get_addr_ref() is capable of handling only 64-bit addresses. A previous commit introduced a function to handle 32-bit addresses. Invoke these two functions from a third wrapper function that calls the appropriate routine based on the address size specified in the instruction structure (obtained by looking at the code segment default address size and the address override prefix, if present). While doing this, rename the original function insn_get_addr_ref() to the more appropriate name get_addr_ref_64() and ensure it is only used for 64-bit addresses. Also, since 64-bit addresses are not possible in 32-bit builds, provide a dummy function for such a case. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Adam Buchbinder Cc: Adrian Hunter Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Colin Ian King Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Ravi V.
Shankar Cc: Shuah Khan Cc: Thomas Garnier Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-4-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/insn-eval.c | 60 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index e6cb68a4f7a4..1ac39737826b 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1052,17 +1052,36 @@ out: return (void __user *)linear_addr; } -/* - * return the address being referenced be instruction - * for rm=3 returning the content of the rm reg - * for rm!=3 calculates the address using SIB and Disp +/** + * get_addr_ref_64() - Obtain a 64-bit linear address + * @insn: Instruction struct with ModRM and SIB bytes and displacement + * @regs: Structure with register values as seen when entering kernel mode + * + * This function is to be used with 64-bit address encodings to obtain the + * linear memory address referred by the instruction's ModRM, SIB, + * displacement bytes and segment base address, as applicable. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. */ -void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) +#ifndef CONFIG_X86_64 +static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs) +{ + return (void __user *)-1L; +} +#else +static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs) { unsigned long linear_addr = -1L, seg_base; int regoff, ret; long eff_addr; + if (insn->addr_bytes != 8) + goto out; + if (X86_MODRM_MOD(insn->modrm.value) == 3) { ret = get_eff_addr_reg(insn, regs, &regoff, &eff_addr); if (ret) @@ -1090,3 +1109,34 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) out: return (void __user *)linear_addr; } +#endif /* CONFIG_X86_64 */ + +/** + * insn_get_addr_ref() - Obtain the linear address referred by instruction + * @insn: Instruction structure containing ModRM byte and displacement + * @regs: Structure with register values as seen when entering kernel mode + * + * Obtain the linear address referred by the instruction's ModRM, SIB and + * displacement bytes, and segment base, as applicable. In protected mode, + * segment limits are enforced. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. + */ +void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) +{ + if (!insn || !regs) + return (void __user *)-1L; + + switch (insn->addr_bytes) { + case 4: + return get_addr_ref_32(insn, regs); + case 8: + return get_addr_ref_64(insn, regs); + default: + return (void __user *)-1L; + } +} From 86cc35109029b7f1b195cef6c74654bad95e81af Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:49 -0800 Subject: [PATCH 1015/1085] x86/insn-eval: Handle 32-bit address encodings in virtual-8086 mode It is possible to utilize 32-bit address encodings in virtual-8086 mode via an address override instruction prefix. However, the range of the effective address is still limited to [0x0-0xffff]. In such a case, return an error. Also, linear addresses in virtual-8086 mode are limited to 20 bits. Enforce this limit by truncating the most significant bytes of the computed linear address.
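A minimal userspace sketch of these two checks may help; the function name and parameters below are invented for illustration and are not the kernel's (seg_base is assumed to already be the segment base address in bytes):

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the virtual-8086 handling described above: a 32-bit-encoded
 * effective address must still fit in [0x0-0xffff], and the resulting
 * linear address wraps at 20 bits. Not the kernel implementation.
 */
static bool v8086_linear_addr_32(uint32_t eff_addr, uint32_t seg_base,
                                 uint32_t *linear)
{
        if (eff_addr > 0xffffu)         /* out of range: signal an error */
                return false;

        *linear = (seg_base + eff_addr) & 0xfffffu;     /* 20-bit truncation */
        return true;
}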
Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Adam Buchbinder Cc: Adrian Hunter Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Colin Ian King Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Thomas Garnier Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-5-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/insn-eval.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 1ac39737826b..ef102db43289 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1041,6 +1041,13 @@ static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs) if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit)) goto out; + /* + * Even though 32-bit address encodings are allowed in virtual-8086 + * mode, the address range is still limited to [0x-0xffff]. + */ + if (v8086_mode(regs) && (eff_addr & ~0xffff)) + goto out; + /* * Data type long could be 64 bits in size. Ensure that our 32-bit * effective address is not sign-extended when computing the linear @@ -1048,6 +1055,10 @@ static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs) */ linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base; + /* Limit linear address to 20 bits */ + if (v8086_mode(regs)) + linear_addr &= 0xfffff; + out: return (void __user *)linear_addr; } From 9c6c799faeed54b17857c2eed9058a25b8ee3614 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:50 -0800 Subject: [PATCH 1016/1085] x86/insn-eval: Add support to resolve 16-bit address encodings Tasks running in virtual-8086 mode, in protected mode with code segment descriptors that specify 16-bit default address sizes via the D bit, or via an address override prefix will use 16-bit addressing form encodings as described in the Intel 64 and IA-32 Architecture Software Developer's Manual Volume 2A Section 2.1.5, Table 2-1. 16-bit addressing encodings differ in several ways from the 32-bit/64-bit addressing form encodings: ModRM.rm points to different registers and, in some cases, effective addresses are indicated by the addition of the value of two registers. Also, there is no support for SIB bytes. Thus, a separate function is needed to parse this form of addressing. Three functions are introduced. get_reg_offset_16() obtains the offset from the base of pt_regs of the registers indicated by the ModRM byte of the address encoding. get_eff_addr_modrm_16() computes the effective address from the value of the register operands. get_addr_ref_16() computes the linear address using the obtained effective address and the base address of the segment. Segment limits are enforced when running in protected mode. 
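To make the Table 2-1 rule concrete, here is a small self-contained sketch of the register-sum computation; the struct, names and calling convention are invented for illustration (the kernel reads register values from pt_regs), and the displacement-only case (ModRM.mod == 0, ModRM.rm == 6) is left out:

#include <stdint.h>

/* 16-bit register file subset used by 16-bit addressing forms. */
struct regs16 { uint16_t bx, bp, si, di; };

static uint16_t eff_addr_16(const struct regs16 *r, unsigned int rm,
                            int16_t disp)
{
        /* Per Table 2-1: rm selects one base and, for rm 0-3, one index. */
        const uint16_t base[8]  = { r->bx, r->bx, r->bp, r->bp,
                                    r->si, r->di, r->bp, r->bx };
        const uint16_t index[8] = { r->si, r->di, r->si, r->di,
                                    0, 0, 0, 0 };

        /* Arithmetic wraps naturally at 16 bits, as on real hardware. */
        return (uint16_t)(base[rm & 7] + index[rm & 7] + disp);
}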
Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Adam Buchbinder Cc: Adrian Hunter Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Colin Ian King Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Thomas Garnier Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-6-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/insn-eval.c | 213 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 212 insertions(+), 1 deletion(-) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index ef102db43289..35625d279458 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -480,6 +480,80 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, return regoff[regno]; } +/** + * get_reg_offset_16() - Obtain offset of register indicated by instruction + * @insn: Instruction containing ModRM byte + * @regs: Register values as seen when entering kernel mode + * @offs1: Offset of the first operand register + * @offs2: Offset of the second operand register, if applicable + * + * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte + * in @insn. This function is to be used with 16-bit address encodings. The + * @offs1 and @offs2 will be written with the offset of the two registers + * indicated by the instruction. In cases where any of the registers is not + * referenced by the instruction, the value will be set to -EDOM. + * + * Returns: + * + * 0 on success, -EINVAL on error. + */ +static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs, + int *offs1, int *offs2) +{ + /* + * 16-bit addressing can use one or two registers. Specifics of + * encodings are given in Table 2-1. "16-Bit Addressing Forms with the + * ModR/M Byte" of the Intel Software Development Manual. + */ + static const int regoff1[] = { + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, bx), + }; + + static const int regoff2[] = { + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + -EDOM, + -EDOM, + -EDOM, + -EDOM, + }; + + if (!offs1 || !offs2) + return -EINVAL; + + /* Operand is a register, use the generic function. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + *offs1 = insn_get_modrm_rm_off(insn, regs); + *offs2 = -EDOM; + return 0; + } + + *offs1 = regoff1[X86_MODRM_RM(insn->modrm.value)]; + *offs2 = regoff2[X86_MODRM_RM(insn->modrm.value)]; + + /* + * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement- + * only addressing. This means that no registers are involved in + * computing the effective address. Thus, ensure that the first + * register offset is invalid. The second register offset is already + * invalid under the aforementioned conditions.
+ */ + if ((X86_MODRM_MOD(insn->modrm.value) == 0) && + (X86_MODRM_RM(insn->modrm.value) == 6)) + *offs1 = -EDOM; + + return 0; +} + /** * get_desc() - Obtain pointer to a segment descriptor * @sel: Segment selector @@ -815,7 +889,9 @@ static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs, return -EINVAL; /* Ignore bytes that are outside the address size. */ - if (insn->addr_bytes == 4) + if (insn->addr_bytes == 2) + *eff_addr = regs_get_register(regs, *regoff) & 0xffff; + else if (insn->addr_bytes == 4) *eff_addr = regs_get_register(regs, *regoff) & 0xffffffff; else /* 64-bit address */ *eff_addr = regs_get_register(regs, *regoff); @@ -890,6 +966,74 @@ static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs, return 0; } +/** + * get_eff_addr_modrm_16() - Obtain referenced effective address via ModRM + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the 16-bit effective address referenced by the ModRM byte of @insn. + * After identifying the registers involved in the register-indirect memory + * reference, its value is obtained from the operands in @regs. The computed + * address is stored @eff_addr. Also, the register operand that indicates + * the associated segment is stored in @regoff, this parameter can later be used + * to determine such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. @regoff + * will have a register, as an offset from the base of pt_regs, that can be used + * to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs, + int *regoff, short *eff_addr) +{ + int addr_offset1, addr_offset2, ret; + short addr1 = 0, addr2 = 0, displacement; + + if (insn->addr_bytes != 2) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + ret = get_reg_offset_16(insn, regs, &addr_offset1, &addr_offset2); + if (ret < 0) + return -EINVAL; + + /* + * Don't fail on invalid offset values. They might be invalid because + * they cannot be used for this particular value of ModRM. Instead, use + * them in the computation only if they contain a valid value. + */ + if (addr_offset1 != -EDOM) + addr1 = regs_get_register(regs, addr_offset1) & 0xffff; + + if (addr_offset2 != -EDOM) + addr2 = regs_get_register(regs, addr_offset2) & 0xffff; + + displacement = insn->displacement.value & 0xffff; + *eff_addr = addr1 + addr2 + displacement; + + /* + * The first operand register could indicate to use of either SS or DS + * registers to obtain the segment selector. The second operand + * register can only indicate the use of DS. Thus, the first operand + * will be used to obtain the segment selector. + */ + *regoff = addr_offset1; + + return 0; +} + /** * get_eff_addr_sib() - Obtain referenced effective address via SIB * @insn: Instruction. Must be valid. @@ -974,6 +1118,71 @@ static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs, return 0; } +/** + * get_addr_ref_16() - Obtain the 16-bit address referred by instruction + * @insn: Instruction containing ModRM byte and displacement + * @regs: Register values as seen when entering kernel mode + * + * This function is to be used with 16-bit address encodings. 
Obtain the memory + * address referred to by the instruction's ModRM and displacement bytes. Also, + * the segment used as base is determined by any segment override prefixes in + * @insn or by the default segment of the registers involved in the address + * computation. In protected mode, segment limits are enforced. + * + * Returns: + * + * Linear address referenced by the instruction operands on success. + * + * -1L on error. + */ +static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs) +{ + unsigned long linear_addr = -1L, seg_base, seg_limit; + int ret, regoff; + short eff_addr; + long tmp; + + insn_get_modrm(insn); + insn_get_displacement(insn); + + if (insn->addr_bytes != 2) + goto out; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + ret = get_eff_addr_reg(insn, regs, &regoff, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } else { + ret = get_eff_addr_modrm_16(insn, regs, &regoff, &eff_addr); + if (ret) + goto out; + } + + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit); + if (ret) + goto out; + + /* + * Before computing the linear address, make sure the effective address + * is within the limits of the segment. In virtual-8086 mode, segment + * limits are not enforced. In such a case, the segment limit is -1L to + * reflect this fact. + */ + if ((unsigned long)(eff_addr & 0xffff) > seg_limit) + goto out; + + linear_addr = (unsigned long)(eff_addr & 0xffff) + seg_base; + + /* Limit linear address to 20 bits */ + if (v8086_mode(regs)) + linear_addr &= 0xfffff; + +out: + return (void __user *)linear_addr; +} + /** * get_addr_ref_32() - Obtain a 32-bit linear address * @insn: Instruction with ModRM, SIB bytes and displacement @@ -1143,6 +1352,8 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) return (void __user *)-1L; switch (insn->addr_bytes) { + case 2: + return get_addr_ref_16(insn, regs); case 4: return get_addr_ref_32(insn, regs); case 8: From 3522c2a6a4f341058b8291326a945e2a2d2aaf55 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:51 -0800 Subject: [PATCH 1017/1085] x86/cpufeature: Add User-Mode Instruction Prevention definitions User-Mode Instruction Prevention is a security feature present in new Intel processors that, when enabled, prevents the execution of a subset of instructions in user mode (CPL > 0). Attempting to execute such instructions causes a general protection exception. The subset of instructions comprises: * SGDT - Store Global Descriptor Table * SIDT - Store Interrupt Descriptor Table * SLDT - Store Local Descriptor Table * SMSW - Store Machine Status Word * STR - Store Task Register This feature is also added to the list of disabled-features to allow a cleaner handling of build-time configuration. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V.
Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-7-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h | 8 +++++++- arch/x86/include/uapi/asm/processor-flags.h | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cdf5be866863..c0b0e9e8aa66 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -296,6 +296,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Prevention */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index c10c9128f54e..14d6d5007314 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -16,6 +16,12 @@ # define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) #endif +#ifdef CONFIG_X86_INTEL_UMIP +# define DISABLE_UMIP 0 +#else +# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) +#endif + #ifdef CONFIG_X86_64 # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) @@ -63,7 +69,7 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) #define DISABLED_MASK17 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 53b4ca55ebb6..7e1e730396ae 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -105,6 +105,8 @@ #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) +#define X86_CR4_UMIP_BIT 11 /* enable UMIP support */ +#define X86_CR4_UMIP _BITUL(X86_CR4_UMIP_BIT) #define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */ #define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT) #define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ From 1e5db223696afa55e6a038fac638f759e1fdcc01 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:52 -0800 Subject: [PATCH 1018/1085] x86/umip: Add emulation code for UMIP instructions The feature User-Mode Instruction Prevention present in recent Intel processors prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) from being executed with CPL > 0. Attempting to execute them causes a general protection fault. Rather than relaying the general protection fault caused by the UMIP-protected instructions to user space (in the form of a SIGSEGV signal), the fault can be trapped and the instruction emulated to provide a dummy result.
This allows us to both conserve the current kernel behavior and not reveal the system resources that UMIP intends to protect (i.e., the locations of the global descriptor and interrupt descriptor tables, the segment selector of the local descriptor table, the value of the task register and the contents of the CR0 register). This emulation is needed because certain applications (e.g., WineHQ and DOSEMU2) rely on this subset of instructions to function. Given that sldt and str are not commonly used in programs that run on WineHQ or DOSEMU2, they are not emulated. Also, emulation is provided only for 32-bit processes; 64-bit processes that attempt to use the instructions that UMIP protects will receive the SIGSEGV signal issued as a consequence of the general protection fault. The instructions protected by UMIP can be split into two groups: those which return a kernel memory address (sgdt and sidt) and those which return a value (smsw, sldt and str; the last two not emulated). For the instructions that return a kernel memory address, applications such as WineHQ rely on the result being located in the kernel memory space, not the actual location of the table. The result is emulated as a hard-coded value that lies close to the top of the kernel memory. The limits for the GDT and the IDT are set to zero. The instruction smsw is emulated to return the value that the register CR0 has at boot time, as set in head_32.S. Care is taken to appropriately emulate the results when segmentation is used. That is, rather than relying on USER_DS and USER_CS, the function insn_get_addr_ref() inspects the segment descriptor pointed to by the registers in pt_regs. This ensures that we correctly obtain the segment base address and the address and operand sizes even if the user space application uses a local descriptor table. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V.
Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-8-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/umip.h | 12 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/umip.c | 321 ++++++++++++++++++++++++++++++++++++ 3 files changed, 334 insertions(+) create mode 100644 arch/x86/include/asm/umip.h create mode 100644 arch/x86/kernel/umip.c diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h new file mode 100644 index 000000000000..db43f2a0d92c --- /dev/null +++ b/arch/x86/include/asm/umip.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_UMIP_H +#define _ASM_X86_UMIP_H + +#include <linux/types.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_X86_INTEL_UMIP +bool fixup_umip_exception(struct pt_regs *regs); +#else +static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; } +#endif /* CONFIG_X86_INTEL_UMIP */ +#endif /* _ASM_X86_UMIP_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 295abaa58add..81bb565f4497 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -127,6 +127,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o +obj-$(CONFIG_X86_INTEL_UMIP) += umip.o obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c new file mode 100644 index 000000000000..d80b816228ea --- /dev/null +++ b/arch/x86/kernel/umip.c @@ -0,0 +1,321 @@ +/* + * umip.c Emulation for instructions protected by the Intel User-Mode + * Instruction Prevention feature + * + * Copyright (c) 2017, Intel Corporation. + * Ricardo Neri + */ + +#include <linux/uaccess.h> +#include <asm/umip.h> +#include <asm/traps.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> + +/** DOC: Emulation for User-Mode Instruction Prevention (UMIP) + * + * The feature User-Mode Instruction Prevention present in recent Intel + * processors prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) + * from being executed with CPL > 0. Attempting to execute them causes a + * general protection fault. + * + * Rather than relaying to the user space the general protection fault caused by + * the UMIP-protected instructions (in the form of a SIGSEGV signal), the fault + * can be trapped and the result of such instructions emulated to provide dummy + * values. This allows us to both conserve the current kernel behavior and not + * reveal the system resources that UMIP intends to protect (i.e., the locations + * of the global descriptor and interrupt descriptor tables, the segment + * selector of the local descriptor table, the value of the task register and + * the contents of the CR0 register). + * + * This emulation is needed because certain applications (e.g., WineHQ and + * DOSEMU2) rely on this subset of instructions to function. + * + * The instructions protected by UMIP can be split into two groups: those which + * return a kernel memory address (sgdt and sidt) and those which return a + * value (sldt, str and smsw). + * + * For the instructions that return a kernel memory address, applications + * such as WineHQ rely on the result being located in the kernel memory space, + * not the actual location of the table. The result is emulated as a hard-coded + * value that lies close to the top of the kernel memory. The limits for the GDT + * and the IDT are set to zero.
+ * + * Given that sldt and str are not commonly used in programs that run on WineHQ + * or DOSEMU2, they are not emulated. + * + * The instruction smsw is emulated to return the value that the register CR0 + * has at boot time, as set in head_32.S. + * + * Also, emulation is provided only for 32-bit processes; 64-bit processes + * that attempt to use the instructions that UMIP protects will receive the + * SIGSEGV signal issued as a consequence of the general protection fault. + * + * Care is taken to appropriately emulate the results when segmentation is + * used. That is, rather than relying on USER_DS and USER_CS, the function + * insn_get_addr_ref() inspects the segment descriptor pointed to by the + * registers in pt_regs. This ensures that we correctly obtain the segment + * base address and the address and operand sizes even if the user space + * application uses a local descriptor table. + */ + +#define UMIP_DUMMY_GDT_BASE 0xfffe0000 +#define UMIP_DUMMY_IDT_BASE 0xffff0000 + +/* + * The SGDT and SIDT instructions store the contents of the global descriptor + * table and interrupt table registers, respectively. The destination is a + * memory operand of X+2 bytes. X bytes are used to store the base address of + * the table and 2 bytes are used to store the limit. In 32-bit processes, the + * only processes for which emulation is provided, X has a value of 4. + */ +#define UMIP_GDT_IDT_BASE_SIZE 4 +#define UMIP_GDT_IDT_LIMIT_SIZE 2 + +#define UMIP_INST_SGDT 0 /* 0F 01 /0 */ +#define UMIP_INST_SIDT 1 /* 0F 01 /1 */ +#define UMIP_INST_SMSW 3 /* 0F 01 /4 */ + +/** + * identify_insn() - Identify a UMIP-protected instruction + * @insn: Instruction structure with opcode and ModRM byte. + * + * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected + * instruction that can be emulated. + * + * Returns: + * + * On success, a constant identifying a specific UMIP-protected instruction that + * can be emulated. + * + * -EINVAL on error or when not a UMIP-protected instruction that can be + * emulated. + */ +static int identify_insn(struct insn *insn) +{ + /* By getting modrm we also get the opcode. */ + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + /* All the instructions of interest start with 0x0f. */ + if (insn->opcode.bytes[0] != 0xf) + return -EINVAL; + + if (insn->opcode.bytes[1] == 0x1) { + switch (X86_MODRM_REG(insn->modrm.value)) { + case 0: + return UMIP_INST_SGDT; + case 1: + return UMIP_INST_SIDT; + case 4: + return UMIP_INST_SMSW; + default: + return -EINVAL; + } + } + + /* SLDT and STR are not emulated */ + return -EINVAL; +} + +/** + * emulate_umip_insn() - Emulate UMIP instructions and return dummy values + * @insn: Instruction structure with operands + * @umip_inst: A constant indicating the instruction to emulate + * @data: Buffer into which the dummy result is stored + * @data_size: Size of the emulated result + * + * Emulate an instruction protected by UMIP and provide a dummy result. The + * result of the emulation is saved in @data. The size of the result depends + * on both the instruction and the type of operand (register vs memory + * address). The size of the result is updated in @data_size. Caller is + * responsible for providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE + + * UMIP_GDT_IDT_LIMIT_SIZE bytes. + * + * Returns: + * + * 0 on success, -EINVAL on error while emulating.
+ */ +static int emulate_umip_insn(struct insn *insn, int umip_inst, + unsigned char *data, int *data_size) +{ + unsigned long dummy_base_addr, dummy_value; + unsigned short dummy_limit = 0; + + if (!data || !data_size || !insn) + return -EINVAL; + /* + * These two instructions return the base address and limit of the + * global and interrupt descriptor table, respectively. According to the + * Intel Software Development Manual, the base address can be 24-bit, + * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is + * 16-bit, the returned value of the base address is supposed to be a + * zero-extended 24-bit number. However, it seems that a 32-bit number + * is always returned irrespective of the operand size. + */ + if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { + /* SGDT and SIDT do not use register operands. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + + if (umip_inst == UMIP_INST_SGDT) + dummy_base_addr = UMIP_DUMMY_GDT_BASE; + else + dummy_base_addr = UMIP_DUMMY_IDT_BASE; + + *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE; + + memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE); + memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); + + } else if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + + /* + * Even though the CR0 register has 4 bytes, the number + * of bytes to be copied in the result buffer is determined + * by whether the operand is a register or a memory location. + * If operand is a register, return as many bytes as the operand + * size. If operand is memory, return only the two least + * significant bytes of CR0. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + *data_size = insn->opnd_bytes; + else + *data_size = 2; + + memcpy(data, &dummy_value, *data_size); + /* STR and SLDT are not emulated */ + } else { + return -EINVAL; + } + + return 0; +} + +/** + * fixup_umip_exception() - Fixup a general protection fault caused by UMIP + * @regs: Registers as saved when entering the #GP handler + * + * The instructions sgdt, sidt, str, smsw, sldt cause a general protection + * fault if executed with CPL > 0 (i.e., from user space). If the offending + * user-space process is not in long mode, this function fixes the exception + * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not + * fixed up. Also long mode user-space processes are not fixed up. + * + * If operands are memory addresses, results are copied to user-space memory as + * indicated by the instruction pointed to by eIP using the registers indicated + * in the instruction operands. If operands are registers, results are copied + * into the context that was saved when entering kernel mode. + * + * Returns: + * + * True if emulation was successful; false if not. + */ +bool fixup_umip_exception(struct pt_regs *regs) +{ + int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; + unsigned long seg_base = 0, *reg_addr; + /* 10 bytes is the maximum size of the result of UMIP instructions */ + unsigned char dummy_data[10] = { 0 }; + unsigned char buf[MAX_INSN_SIZE]; + void __user *uaddr; + struct insn insn; + char seg_defs; + + if (!regs) + return false; + + /* Do not emulate 64-bit processes. */ + if (user_64bit_mode(regs)) + return false; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most cases seg_base + * will be zero, as in USER_CS.
+ */ + if (!user_64bit_mode(regs)) + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + + if (seg_base == -1L) + return false; + + not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), + sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + + /* + * The copy_from_user above could have failed if user code is protected + * by a memory protection key. Give up on emulation in such a case. + * Should we issue a page fault? + */ + if (!nr_copied) + return false; + + insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); + + /* + * Override the default operand and address sizes with what is specified + * in the code segment descriptor. The instruction decoder only sets + * the address size to either 4 or 8 address bytes and does nothing + * for the operand bytes. This is OK for most cases, but we could + * have special cases where, for instance, a 16-bit code segment + * descriptor is used. + * If there is an address override prefix, the instruction decoder + * correctly updates these values, even for 16-bit defaults. + */ + seg_defs = insn_get_code_seg_params(regs); + if (seg_defs == -EINVAL) + return false; + + insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); + insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); + + insn_get_length(&insn); + if (nr_copied < insn.length) + return false; + + umip_inst = identify_insn(&insn); + if (umip_inst < 0) + return false; + + if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size)) + return false; + + /* + * If operand is a register, write result to the copy of the register + * value that was pushed to the stack when entering into kernel mode. + * Upon exit, the value we write will be restored to the actual hardware + * register. + */ + if (X86_MODRM_MOD(insn.modrm.value) == 3) { + reg_offset = insn_get_modrm_rm_off(&insn, regs); + + /* + * Negative values are usually errors. In memory addressing, + * the exception is -EDOM. Since we expect a register operand, + * all negative values are errors. + */ + if (reg_offset < 0) + return false; + + reg_addr = (unsigned long *)((unsigned long)regs + reg_offset); + memcpy(reg_addr, dummy_data, dummy_data_size); + } else { + uaddr = insn_get_addr_ref(&insn, regs); + if ((unsigned long)uaddr == -1L) + return false; + + nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size); + if (nr_copied > 0) + return false; + } + + /* increase IP to let the program keep going */ + regs->ip += insn.length; + return true; +} From c6a960bbf6a36572a06bde866d94a7338c7f256a Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:53 -0800 Subject: [PATCH 1019/1085] x86/umip: Force a page fault when unable to copy emulated result to user fixup_umip_exception() will be called from do_general_protection(). If the former returns false, the latter will issue a SIGSEGV with SEND_SIG_PRIV. However, when emulation is successful but the emulated result cannot be copied to user space memory, it is more accurate to issue a SIGSEGV with SEGV_MAPERR and the offending address. A new function, inspired by force_sig_info_fault(), is introduced to model the page fault. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S.
Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-9-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/umip.c | 49 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index d80b816228ea..6ba82be68cff 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -11,6 +11,10 @@ #include #include #include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "umip: " fmt /** DOC: Emulation for User-Mode Instruction Prevention (UMIP) * @@ -195,6 +199,41 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, return 0; } +/** + * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR + * @addr: Address that caused the signal + * @regs: Register set containing the instruction pointer + * + * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is + * intended to be used to provide a segmentation fault when the result of the + * UMIP emulation could not be copied to the user space memory. + * + * Returns: none + */ +static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) +{ + siginfo_t info; + struct task_struct *tsk = current; + + tsk->thread.cr2 = (unsigned long)addr; + tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; + tsk->thread.trap_nr = X86_TRAP_PF; + + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = addr; + force_sig_info(SIGSEGV, &info, tsk); + + if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) + return; + + pr_err_ratelimited("%s[%d] umip emulation segfault ip:%lx sp:%lx error:%x in %lx\n", + tsk->comm, task_pid_nr(tsk), regs->ip, + regs->sp, X86_PF_USER | X86_PF_WRITE, + regs->ip); +} + /** * fixup_umip_exception() - Fixup a general protection fault caused by UMIP * @regs: Registers as saved when entering the #GP handler @@ -311,8 +350,14 @@ bool fixup_umip_exception(struct pt_regs *regs) return false; nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size); - if (nr_copied > 0) - return false; + if (nr_copied > 0) { + /* + * If copy fails, send a signal and tell caller that + * fault was fixed up. + */ + force_sig_info_umip_fault(uaddr, regs); + return true; + } } /* increase IP to let the program keep going */ From aa35f896979d9610bb11df485cf7bb6ca241febb Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:54 -0800 Subject: [PATCH 1020/1085] x86/umip: Enable User-Mode Instruction Prevention at runtime User-Mode Instruction Prevention (UMIP) is enabled or disabled by setting or clearing a bit in %cr4. It makes sense to enable UMIP at some point while booting, before user space comes up. Like SMAP and SMEP, it is not critical to have UMIP enabled very early during boot. This is because UMIP is relevant only once there is a user space to protect against. Given these similarities, UMIP can be enabled along with SMAP and SMEP. At the moment, UMIP is disabled by default at build time. It can be enabled at build time by selecting CONFIG_X86_INTEL_UMIP. If enabled at build time, it can be disabled at run time by adding clearcpuid=514 to the kernel parameters.
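[ Editor's note: the clearcpuid value follows directly from the feature's position in the cpufeature tables added earlier in this series (word 16, bit 2). A quick stand-alone sanity check of that arithmetic, not kernel code: ]

    #include <stdio.h>

    /* X86_FEATURE_UMIP is defined as (16*32 + 2) in cpufeatures.h. */
    #define X86_FEATURE_UMIP (16 * 32 + 2)

    int main(void)
    {
            /* 16 * 32 + 2 = 514, hence "clearcpuid=514". */
            printf("clearcpuid=%d\n", X86_FEATURE_UMIP);
            return 0;
    }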
Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-10-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 ++++++++++ arch/x86/kernel/cpu/common.c | 25 ++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4ae940a0ed3b..e19fa9f7079a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1802,6 +1802,16 @@ config X86_SMAP If unsure, say Y. +config X86_INTEL_UMIP + def_bool n + depends on CPU_SUP_INTEL + prompt "Intel User Mode Instruction Prevention" if EXPERT + ---help--- + The User Mode Instruction Prevention (UMIP) is a security + feature in newer Intel processors. If enabled, a general + protection fault is issued if the instructions SGDT, SLDT, + SIDT, SMSW and STR are executed in user mode. + config X86_INTEL_MPX prompt "Intel MPX (Memory Protection Extensions)" def_bool n diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cdf79ab628c2..47f8a85be11c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -329,6 +329,28 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static __always_inline void setup_umip(struct cpuinfo_x86 *c) +{ + /* Check the boot processor, plus build option for UMIP. */ + if (!cpu_feature_enabled(X86_FEATURE_UMIP)) + goto out; + + /* Check the current processor's cpuid bits. */ + if (!cpu_has(c, X86_FEATURE_UMIP)) + goto out; + + cr4_set_bits(X86_CR4_UMIP); + + return; + +out: + /* + * Make sure UMIP is disabled in case it was enabled in a + * previous boot (e.g., via kexec). + */ + cr4_clear_bits(X86_CR4_UMIP); +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1147,9 +1169,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); - /* Set up SMEP/SMAP */ + /* Set up SMEP/SMAP/UMIP */ setup_smep(c); setup_smap(c); + setup_umip(c); /* * The vendor-specific functions might have changed features. From 6fc9dc81bff0ea461db534e2672acfdaf76f3e4e Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:55 -0800 Subject: [PATCH 1021/1085] x86/traps: Fix up general protection faults caused by UMIP If the User-Mode Instruction Prevention CPU feature is available and enabled, a general protection fault will be issued if the instructions sgdt, sldt, sidt, str or smsw are executed from user-mode context (CPL > 0). If the fault was caused by any of the instructions protected by UMIP, fixup_umip_exception() will emulate dummy results for these instructions as follows: in virtual-8086 and protected modes, sgdt, sidt and smsw are emulated; str and sldt are not emulated. No emulation is done for user-space long mode processes. If emulation is successful, the emulated result is passed to the user space program and no SIGSEGV signal is emitted. 
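[ Editor's note: to illustrate the behavior described above, a 32-bit user-space process on a UMIP-enabled kernel should observe the emulated dummy values (for SGDT: base 0xfffe0000, limit 0) rather than receiving SIGSEGV. A minimal sketch, assuming an x86 toolchain and compilation with -m32; on CPUs or kernels without UMIP the real table values are printed instead: ]

    #include <stdio.h>
    #include <stdint.h>

    /* 6-byte SGDT result in a 32-bit process: 2-byte limit, 4-byte base. */
    struct __attribute__((packed)) table_desc {
            uint16_t limit;
            uint32_t base;
    };

    int main(void)
    {
            struct table_desc gdt = { 0 };

            /* With UMIP active, the kernel traps the #GP and emulates SGDT. */
            asm volatile("sgdt %0" : "=m" (gdt));

            printf("GDT base: 0x%08x limit: 0x%04x\n", gdt.base, gdt.limit);
            return 0;
    }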
Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-11-git-send-email-ricardo.neri-calderon@linux.intel.com [ Added curly braces. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 42a9c4458f5d..ab54bf398803 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include #include #include +#include <asm/umip.h> #ifdef CONFIG_X86_64 #include @@ -513,6 +514,11 @@ do_general_protection(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); + if (static_cpu_has(X86_FEATURE_UMIP)) { + if (user_mode(regs) && fixup_umip_exception(regs)) + return; + } + if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); From 9390afebe1d3f5a0be18b1afdd0ce09d67cebf9e Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:56 -0800 Subject: [PATCH 1022/1085] selftests/x86: Add tests for User-Mode Instruction Prevention Certain user space programs that run in virtual-8086 mode may utilize instructions protected by the User-Mode Instruction Prevention (UMIP) security feature present in new Intel processors: SGDT, SIDT and SMSW. In such a case, a general protection fault is issued if UMIP is enabled. When such a fault happens, the kernel traps it and emulates the results of these instructions with dummy values. The purpose of this new test is to verify whether the impacted instructions can be executed without causing such #GP. If no #GP exceptions occur, we expect to exit virtual-8086 mode via INT3. The instructions protected by UMIP are executed in representative use cases: a) displacement-only memory addressing b) register-indirect memory addressing c) results stored directly in operands Unfortunately, it is not possible to check the results against a set of expected values because no emulation will occur in systems that do not have the UMIP feature. Instead, results are printed for verification. A simple verification is done to ensure that the results of all tests are identical. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V.
Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-12-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/entry_from_vm86.c | 73 ++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c index d075ea0e5ca1..f7d9ceac7e06 100644 --- a/tools/testing/selftests/x86/entry_from_vm86.c +++ b/tools/testing/selftests/x86/entry_from_vm86.c @@ -95,6 +95,22 @@ asm ( "int3\n\t" "vmcode_int80:\n\t" "int $0x80\n\t" + "vmcode_umip:\n\t" + /* addressing via displacements */ + "smsw (2052)\n\t" + "sidt (2054)\n\t" + "sgdt (2060)\n\t" + /* addressing via registers */ + "mov $2066, %bx\n\t" + "smsw (%bx)\n\t" + "mov $2068, %bx\n\t" + "sidt (%bx)\n\t" + "mov $2074, %bx\n\t" + "sgdt (%bx)\n\t" + /* register operands, only for smsw */ + "smsw %ax\n\t" + "mov %ax, (2080)\n\t" + "int3\n\t" ".size vmcode, . - vmcode\n\t" "end_vmcode:\n\t" ".code32\n\t" @@ -103,7 +119,7 @@ asm ( extern unsigned char vmcode[], end_vmcode[]; extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[], - vmcode_sti[], vmcode_int3[], vmcode_int80[]; + vmcode_sti[], vmcode_int3[], vmcode_int80[], vmcode_umip[]; /* Returns false if the test was skipped. */ static bool do_test(struct vm86plus_struct *v86, unsigned long eip, @@ -160,6 +176,58 @@ static bool do_test(struct vm86plus_struct *v86, unsigned long eip, return true; } +void do_umip_tests(struct vm86plus_struct *vm86, unsigned char *test_mem) +{ + struct table_desc { + unsigned short limit; + unsigned long base; + } __attribute__((packed)); + + /* Initialize variables with arbitrary values */ + struct table_desc gdt1 = { .base = 0x3c3c3c3c, .limit = 0x9999 }; + struct table_desc gdt2 = { .base = 0x1a1a1a1a, .limit = 0xaeae }; + struct table_desc idt1 = { .base = 0x7b7b7b7b, .limit = 0xf1f1 }; + struct table_desc idt2 = { .base = 0x89898989, .limit = 0x1313 }; + unsigned short msw1 = 0x1414, msw2 = 0x2525, msw3 = 3737; + + /* UMIP -- exit with INT3 unless kernel emulation failed to trap the #GP */ + do_test(vm86, vmcode_umip - vmcode, VM86_TRAP, 3, "UMIP tests"); + + /* Results from displacement-only addressing */ + msw1 = *(unsigned short *)(test_mem + 2052); + memcpy(&idt1, test_mem + 2054, sizeof(idt1)); + memcpy(&gdt1, test_mem + 2060, sizeof(gdt1)); + + /* Results from register-indirect addressing */ + msw2 = *(unsigned short *)(test_mem + 2066); + memcpy(&idt2, test_mem + 2068, sizeof(idt2)); + memcpy(&gdt2, test_mem + 2074, sizeof(gdt2)); + + /* Results when using register operands */ + msw3 = *(unsigned short *)(test_mem + 2080); + + printf("[INFO]\tResult from SMSW:[0x%04x]\n", msw1); + printf("[INFO]\tResult from SIDT: limit[0x%04x]base[0x%08lx]\n", + idt1.limit, idt1.base); + printf("[INFO]\tResult from SGDT: limit[0x%04x]base[0x%08lx]\n", + gdt1.limit, gdt1.base); + + if (msw1 != msw2 || msw1 != msw3) + printf("[FAIL]\tAll the results of SMSW should be the same.\n"); + else + printf("[PASS]\tAll the results from SMSW are identical.\n"); + + if (memcmp(&gdt1, &gdt2, sizeof(gdt1))) + printf("[FAIL]\tAll the results of SGDT should be the same.\n"); + else + printf("[PASS]\tAll the results from SGDT are identical.\n"); + + if (memcmp(&idt1, &idt2, sizeof(idt1))) + printf("[FAIL]\tAll the results of SIDT should be the same.\n"); + else + printf("[PASS]\tAll the results from SIDT are identical.\n"); +} + int main(void)
{ struct vm86plus_struct v86; @@ -218,6 +286,9 @@ int main(void) v86.regs.eax = (unsigned int)-1; do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80"); + /* UMIP -- should exit with INTx 0x80 unless UMIP was not disabled */ + do_umip_tests(&v86, addr); + /* Execute a null pointer */ v86.regs.cs = 0; v86.regs.ss = 0; From a9e017d5619eb371460c8e516f4684def62bef3a Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:57 -0800 Subject: [PATCH 1023/1085] selftests/x86: Add tests for the STR and SLDT instructions The STR and SLDT instructions are not valid when running in virtual-8086 mode and generate an invalid operand exception. These two instructions are protected by the Intel User-Mode Instruction Prevention (UMIP) security feature. In protected mode, if UMIP is enabled, these instructions generate a general protection fault if called from CPL > 0. Linux traps the general protection fault and emulates the instructions sgdt, sidt and smsw, but not str and sldt. These tests are added to verify that the emulation code does not emulate these two instructions and that the expected invalid operand exception is seen instead. The tests fall back to exiting with INT3 in case emulation does happen. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-13-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/entry_from_vm86.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c index f7d9ceac7e06..361466a2eaef 100644 --- a/tools/testing/selftests/x86/entry_from_vm86.c +++ b/tools/testing/selftests/x86/entry_from_vm86.c @@ -111,6 +111,11 @@ asm ( "smsw %ax\n\t" "mov %ax, (2080)\n\t" "int3\n\t" + "vmcode_umip_str:\n\t" + "str %eax\n\t" + "vmcode_umip_sldt:\n\t" + "sldt %eax\n\t" + "int3\n\t" ".size vmcode, . - vmcode\n\t" "end_vmcode:\n\t" ".code32\n\t" @@ -119,7 +124,8 @@ asm ( extern unsigned char vmcode[], end_vmcode[]; extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[], - vmcode_sti[], vmcode_int3[], vmcode_int80[], vmcode_umip[]; + vmcode_sti[], vmcode_int3[], vmcode_int80[], vmcode_umip[], + vmcode_umip_str[], vmcode_umip_sldt[]; /* Returns false if the test was skipped.
*/ static bool do_test(struct vm86plus_struct *v86, unsigned long eip, @@ -226,6 +232,16 @@ void do_umip_tests(struct vm86plus_struct *vm86, unsigned char *test_mem) printf("[FAIL]\tAll the results of SIDT should be the same.\n"); else printf("[PASS]\tAll the results from SIDT are identical.\n"); + + sethandler(SIGILL, sighandler, 0); + do_test(vm86, vmcode_umip_str - vmcode, VM86_SIGNAL, 0, + "STR instruction"); + clearhandler(SIGILL); + + sethandler(SIGILL, sighandler, 0); + do_test(vm86, vmcode_umip_sldt - vmcode, VM86_SIGNAL, 0, + "SLDT instruction"); + clearhandler(SIGILL); } int main(void) From 47427379ea80f1368ccec6ba20874fc19a9f0cc4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 7 Nov 2017 10:28:06 -0800 Subject: [PATCH 1024/1085] documentation: fb: update list of available compiled-in fonts Update list of available compiled-in fonts in lib/fonts/: add 6x10 and drop RomanLarge (which was reverted 12 years ago). Also sort the list alphabetically. Signed-off-by: Randy Dunlap Acked-by: Geert Uytterhoeven # v1 Signed-off-by: Jonathan Corbet --- Documentation/fb/fbcon.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/fb/fbcon.txt b/Documentation/fb/fbcon.txt index a38d3aa4d189..79c22d096bbc 100644 --- a/Documentation/fb/fbcon.txt +++ b/Documentation/fb/fbcon.txt @@ -77,8 +77,8 @@ C. Boot options 1. fbcon=font: Select the initial font to use. The value 'name' can be any of the - compiled-in fonts: VGA8x16, 7x14, 10x18, VGA8x8, MINI4x6, RomanLarge, - SUN8x16, SUN12x22, ProFont6x11, Acorn8x8, PEARL8x8. + compiled-in fonts: 10x18, 6x10, 7x14, Acorn8x8, MINI4x6, + PEARL8x8, ProFont6x11, SUN12x22, SUN8x16, VGA8x16, VGA8x8. Note, not all drivers can handle font with widths not divisible by 8, such as vga16fb. From 408358b50deaf59b07c82a7bff8c7e7cce031fae Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Tue, 7 Nov 2017 10:22:32 -0500 Subject: [PATCH 1025/1085] s390: vfio-ccw: Do not attempt to free no-op, test and tic cda. Because we do not make use of the cda (channel data address) for test and no-op ccws, no address translation takes place. This means cda could contain a guest address which we do not want to attempt to free. Let's check the command type and skip cda free when it is not needed. For a TIC ccw, ccw->cda points to either a ccw in an existing chain or it points to a whole new allocated chain. In either case the data will be freed when the owning chain is freed. Signed-off-by: Jason J. Herne Reviewed-by: Dong Jia Shi Reviewed-by: Pierre Morel Message-Id: <1510068152-21988-1-git-send-email-jjherne@linux.vnet.ibm.com> Reviewed-by: Halil Pasic Acked-by: Christian Borntraeger Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index d8f98ad9b029..7ebbd8b482f5 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -332,6 +332,8 @@ static void ccwchain_cda_free(struct ccwchain *chain, int idx) { struct ccw1 *ccw = chain->ch_ccw + idx; + if (ccw_is_test(ccw) || ccw_is_noop(ccw) || ccw_is_tic(ccw)) + return; if (!ccw->count) return; From 46cdc6d533c925c7477feea7caa404466bac7ac8 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sat, 7 Oct 2017 22:12:48 -0400 Subject: [PATCH 1026/1085] vfs: fix mounting a filesystem with i_version The mount i_version flag is not enabled in the new sb_flags. This patch adds the missing SB_I_VERSION flag.
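[ Editor's note: for context, the flag is requested from user space via MS_I_VERSION in mount(2); without the fix below it was silently dropped when do_mount() translated MS_* flags to SB_* flags. A minimal sketch of a mount request that exercises the flag; device, mount point and filesystem type are placeholders: ]

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* MS_I_VERSION asks the filesystem to update i_version. */
            if (mount("/dev/sda1", "/mnt", "ext4", MS_I_VERSION, NULL)) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }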
Fixes: e462ec5 "VFS: Differentiate mount flags (MS_*) from internal superblock flags" Cc: David Howells Cc: Al Viro Signed-off-by: Mimi Zohar --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 54059b142d6b..2f641c715d42 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2823,7 +2823,8 @@ long do_mount(const char *dev_name, const char __user *dir_name, SB_MANDLOCK | SB_DIRSYNC | SB_SILENT | - SB_POSIXACL); + SB_POSIXACL | + SB_I_VERSION); if (flags & MS_REMOUNT) retval = do_remount(&path, flags, sb_flags, mnt_flags, From 2068626d1345f23fd2b926d834d4f74b37cd7134 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 27 Jun 2017 16:10:39 -0400 Subject: [PATCH 1027/1085] ima: don't remove the securityfs policy file The securityfs policy file is removed unless additional rules can be appended to the IMA policy (CONFIG_IMA_WRITE_POLICY), regardless of whether the policy is configured so that it can be displayed. This patch changes this behavior, removing the securityfs policy file only if CONFIG_IMA_READ_POLICY is also not enabled. Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index ad491c51e833..4d50b982b453 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -429,10 +429,10 @@ static int ima_release_policy(struct inode *inode, struct file *file) } ima_update_policy(); -#ifndef CONFIG_IMA_WRITE_POLICY +#if !defined(CONFIG_IMA_WRITE_POLICY) && !defined(CONFIG_IMA_READ_POLICY) securityfs_remove(ima_policy); ima_policy = NULL; -#else +#elif defined(CONFIG_IMA_WRITE_POLICY) clear_bit(IMA_FS_BUSY, &ima_fs_flags); #endif return 0; From f3cc6b25dcc5616f0d5c720009b2ac66f97df2ff Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sat, 17 Jun 2017 23:56:23 -0400 Subject: [PATCH 1028/1085] ima: always measure and audit files in policy All files matching a "measure" rule must be included in the IMA measurement list, even when the file hash cannot be calculated. Similarly, all files matching an "audit" rule must be audited, even when the file hash cannot be calculated. The file data hash field contained in the IMA measurement list template data will contain 0's instead of the actual file hash digest. Note: In general, adding, deleting or in any way changing which files are included in the IMA measurement list is not a good idea, as it might result in not being able to unseal trusted keys sealed to a specific TPM PCR value. This patch not only adds file measurements that were not previously measured, but specifies that the file hash value for these files will be 0's. As the IMA measurement list ordering is not consistent from one boot to the next, it is unlikely that anyone is sealing keys based on the IMA measurement list. Remote attestation servers should be able to process these new measurement records, but might complain about these unknown records.
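[ Editor's note: a user-space analogy of the behavior described above, not kernel code: when the file hash cannot be calculated, the record is still emitted, with an all-zero digest standing in for the real one. Paths and digest size below are made up for illustration: ]

    #include <stdio.h>
    #include <string.h>

    #define DIGEST_SIZE 20  /* e.g. SHA-1 */

    /* Emit a measurement record; zeros stand in for an uncomputable hash. */
    static void measure(const char *path, const unsigned char *digest)
    {
            unsigned char record[DIGEST_SIZE] = { 0 };
            int i;

            if (digest)
                    memcpy(record, digest, DIGEST_SIZE);

            printf("%s: ", path);
            for (i = 0; i < DIGEST_SIZE; i++)
                    printf("%02x", record[i]);
            printf("\n");
    }

    int main(void)
    {
            unsigned char good[DIGEST_SIZE];

            memset(good, 0xab, sizeof(good));
            measure("/bin/readable", good);
            measure("/bin/odirect-only", NULL); /* zeroed digest, still listed */
            return 0;
    }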
Signed-off-by: Mimi Zohar Reviewed-by: Dmitry Kasatkin --- security/integrity/ima/ima_api.c | 65 ++++++++++++++++++----------- security/integrity/ima/ima_crypto.c | 10 +++++ security/integrity/ima/ima_main.c | 9 ++-- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index c2edba8de35e..c7e8db0ea4c0 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -199,42 +199,59 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, struct inode *inode = file_inode(file); const char *filename = file->f_path.dentry->d_name.name; int result = 0; + int length; + void *tmpbuf; + u64 i_version; struct { struct ima_digest_data hdr; char digest[IMA_MAX_DIGEST_SIZE]; } hash; - if (!(iint->flags & IMA_COLLECTED)) { - u64 i_version = file_inode(file)->i_version; + if (iint->flags & IMA_COLLECTED) + goto out; - if (file->f_flags & O_DIRECT) { - audit_cause = "failed(directio)"; - result = -EACCES; - goto out; - } + /* + * Detecting file change is based on i_version. On filesystems + * which do not support i_version, support is limited to an initial + * measurement/appraisal/audit. + */ + i_version = file_inode(file)->i_version; + hash.hdr.algo = algo; - hash.hdr.algo = algo; + /* Initialize hash digest to 0's in case of failure */ + memset(&hash.digest, 0, sizeof(hash.digest)); - result = (!buf) ? ima_calc_file_hash(file, &hash.hdr) : - ima_calc_buffer_hash(buf, size, &hash.hdr); - if (!result) { - int length = sizeof(hash.hdr) + hash.hdr.length; - void *tmpbuf = krealloc(iint->ima_hash, length, - GFP_NOFS); - if (tmpbuf) { - iint->ima_hash = tmpbuf; - memcpy(iint->ima_hash, &hash, length); - iint->version = i_version; - iint->flags |= IMA_COLLECTED; - } else - result = -ENOMEM; - } + if (buf) + result = ima_calc_buffer_hash(buf, size, &hash.hdr); + else + result = ima_calc_file_hash(file, &hash.hdr); + + if (result && result != -EBADF && result != -EINVAL) + goto out; + + length = sizeof(hash.hdr) + hash.hdr.length; + tmpbuf = krealloc(iint->ima_hash, length, GFP_NOFS); + if (!tmpbuf) { + result = -ENOMEM; + goto out; + } + + iint->ima_hash = tmpbuf; + memcpy(iint->ima_hash, &hash, length); + iint->version = i_version; + + /* Possibly temporary failure due to type of read (e.g. O_DIRECT) */ + if (!result) + iint->flags |= IMA_COLLECTED; out: - if (result) + if (result) { + if (file->f_flags & O_DIRECT) + audit_cause = "failed(directio)"; + integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename, "collect_data", audit_cause, result, 0); + } return result; } @@ -278,7 +295,7 @@ void ima_store_measurement(struct integrity_iint_cache *iint, } result = ima_store_template(entry, violation, inode, filename, pcr); - if (!result || result == -EEXIST) { + if ((!result || result == -EEXIST) && !(file->f_flags & O_DIRECT)) { iint->flags |= IMA_MEASURED; iint->measured_pcrs |= (0x1 << pcr); } diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 802d5d20f36f..a856d8c9c9f3 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -441,6 +441,16 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) loff_t i_size; int rc; + /* + * For consistency, fail files opened with the O_DIRECT flag on + * filesystems mounted with/without DAX option.
+ */ + if (file->f_flags & O_DIRECT) { + hash->length = hash_digest_size[ima_hash_algo]; + hash->algo = ima_hash_algo; + return -EINVAL; + } + i_size = i_size_read(file_inode(file)); if (ima_ahash_minsize && i_size >= ima_ahash_minsize) { diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 2aebb7984437..12738c8f39c2 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -235,11 +235,8 @@ static int process_measurement(struct file *file, char *buf, loff_t size, hash_algo = ima_get_hash_algo(xattr_value, xattr_len); rc = ima_collect_measurement(iint, file, buf, size, hash_algo); - if (rc != 0) { - if (file->f_flags & O_DIRECT) - rc = (iint->flags & IMA_PERMIT_DIRECTIO) ? 0 : -EACCES; + if (rc != 0 && rc != -EBADF && rc != -EINVAL) goto out_digsig; - } if (!pathbuf) /* ima_rdwr_violation possibly pre-fetched */ pathname = ima_d_path(&file->f_path, &pathbuf, filename); @@ -247,12 +244,14 @@ static int process_measurement(struct file *file, char *buf, loff_t size, if (action & IMA_MEASURE) ima_store_measurement(iint, file, pathname, xattr_value, xattr_len, pcr); - if (action & IMA_APPRAISE_SUBMASK) + if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) rc = ima_appraise_measurement(func, iint, file, pathname, xattr_value, xattr_len, opened); if (action & IMA_AUDIT) ima_audit_measurement(iint, pathname); + if ((file->f_flags & O_DIRECT) && (iint->flags & IMA_PERMIT_DIRECTIO)) + rc = 0; out_digsig: if ((mask & MAY_WRITE) && (iint->flags & IMA_DIGSIG) && !(iint->flags & IMA_NEW_FILE)) From a7d3d0392a325d630225b7dbccf2558f944114e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Sep 2017 09:49:45 +0200 Subject: [PATCH 1029/1085] integrity: use kernel_read_file_from_path() to read x509 certs The CONFIG_IMA_LOAD_X509 and CONFIG_EVM_LOAD_X509 options permit loading x509 signed certificates onto the trusted keyrings without verifying the x509 certificate file's signature. This patch replaces the integrity-specific integrity_read_file() call with the common kernel_read_file_from_path() function. To avoid verifying the file signature, this patch defines READING_X509_CERTIFICATE.
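[ Editor's note: READING_X509_CERTIFICATE is generated from the id list visible in the fs.h hunk below via the __fid_enumify helper. A simplified, compilable sketch of that X-macro pattern; the real macro carries a second string argument and many more entries: ]

    #include <stdio.h>

    /* Simplified sketch of the fs.h X-macro that generates READING_* ids. */
    #define KERNEL_READ_FILE_IDS(id) \
            id(POLICY) \
            id(X509_CERTIFICATE) \
            id(MAX_ID)

    #define FID_ENUMIFY(ENUM) READING_ ## ENUM,

    enum kernel_read_file_id {
            KERNEL_READ_FILE_IDS(FID_ENUMIFY)
    };

    int main(void)
    {
            printf("READING_X509_CERTIFICATE = %d\n",
                   READING_X509_CERTIFICATE);
            return 0;
    }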
Signed-off-by: Christoph Hellwig Signed-off-by: Mimi Zohar --- include/linux/fs.h | 1 + security/integrity/digsig.c | 14 +++++---- security/integrity/iint.c | 49 ------------------------------- security/integrity/ima/ima_main.c | 4 +++ security/integrity/integrity.h | 2 -- 5 files changed, 14 insertions(+), 56 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 339e73742e73..456325084f1d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2792,6 +2792,7 @@ extern int do_pipe_flags(int *, int); id(KEXEC_IMAGE, kexec-image) \ id(KEXEC_INITRAMFS, kexec-initramfs) \ id(POLICY, security-policy) \ + id(X509_CERTIFICATE, x509-certificate) \ id(MAX_ID, ) #define __fid_enumify(ENUM, dummy) READING_ ## ENUM, diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 06554c448dce..6f9e4ce568cd 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -112,21 +112,25 @@ int __init integrity_init_keyring(const unsigned int id) int __init integrity_load_x509(const unsigned int id, const char *path) { key_ref_t key; - char *data; + void *data; + loff_t size; int rc; if (!keyring[id]) return -EINVAL; - rc = integrity_read_file(path, &data); - if (rc < 0) + rc = kernel_read_file_from_path(path, &data, &size, 0, + READING_X509_CERTIFICATE); + if (rc < 0) { + pr_err("Unable to open file: %s (%d)", path, rc); return rc; + } key = key_create_or_update(make_key_ref(keyring[id], 1), "asymmetric", NULL, data, - rc, + size, ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA); @@ -139,6 +143,6 @@ int __init integrity_load_x509(const unsigned int id, const char *path) key_ref_to_ptr(key)->description, path); key_ref_put(key); } - kfree(data); + vfree(data); return 0; } diff --git a/security/integrity/iint.c b/security/integrity/iint.c index 6fc888ca468e..c84e05866052 100644 --- a/security/integrity/iint.c +++ b/security/integrity/iint.c @@ -199,55 +199,6 @@ int integrity_kernel_read(struct file *file, loff_t offset, return ret; } -/* - * integrity_read_file - read entire file content into the buffer - * - * This is function opens a file, allocates the buffer of required - * size, read entire file content to the buffer and closes the file - * - * It is used only by init code. 
- * - */ -int __init integrity_read_file(const char *path, char **data) -{ - struct file *file; - loff_t size; - char *buf; - int rc = -EINVAL; - - if (!path || !*path) - return -EINVAL; - - file = filp_open(path, O_RDONLY, 0); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - pr_err("Unable to open file: %s (%d)", path, rc); - return rc; - } - - size = i_size_read(file_inode(file)); - if (size <= 0) - goto out; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } - - rc = integrity_kernel_read(file, 0, buf, size); - if (rc == size) { - *data = buf; - } else { - kfree(buf); - if (rc >= 0) - rc = -EIO; - } -out: - fput(file); - return rc; -} - /* * integrity_load_keys - load integrity keys hook * diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 12738c8f39c2..d47f92e97f80 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -405,6 +405,10 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size, if (!file && read_id == READING_MODULE) /* MODULE_SIG_FORCE enabled */ return 0; + /* permit signed certs */ + if (!file && read_id == READING_X509_CERTIFICATE) + return 0; + if (!file || !buf || size == 0) { /* should never happen */ if (ima_appraise & IMA_APPRAISE_ENFORCE) return -EACCES; diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index a53e7e4ab06c..e1bf040fb110 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -120,8 +120,6 @@ struct integrity_iint_cache *integrity_iint_find(struct inode *inode); int integrity_kernel_read(struct file *file, loff_t offset, void *addr, unsigned long count); -int __init integrity_read_file(const char *path, char **data); - #define INTEGRITY_KEYRING_EVM 0 #define INTEGRITY_KEYRING_IMA 1 #define INTEGRITY_KEYRING_MODULE 2 From bb02b186d02f90f693bc573c392df843b024f4ef Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 21 Jun 2017 21:13:18 -0400 Subject: [PATCH 1030/1085] ima: call ima_file_free() prior to calling fasync The file hash is calculated and written out as an xattr after calling fasync(). In order for the file data and metadata to be written out to disk at the same time, this patch calculates the file hash and stores it as an xattr before calling fasync. Signed-off-by: Mimi Zohar --- fs/file_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index 61517f57f8ef..49e1f2f1a4cb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -201,11 +201,11 @@ static void __fput(struct file *file) eventpoll_release(file); locks_remove_file(file); + ima_file_free(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } - ima_file_free(file); if (file->f_op->release) file->f_op->release(inode, file); security_file_free(file); From 096b85464832d2a7bd7bd6d4db2fafed2ab77244 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 13 Oct 2017 15:09:25 -0700 Subject: [PATCH 1031/1085] EVM: Include security.apparmor in EVM measurements Apparmor will be gaining support for security.apparmor labels, and it would be helpful to include these in EVM validation now so appropriate signatures can be generated even before full support is merged. 
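[ Editor's note: once security.apparmor is covered, its value participates in the EVM HMAC like the other security xattrs listed in the hunk below. A minimal user-space sketch for inspecting that xattr on a file; error handling is deliberately small: ]

    #include <stdio.h>
    #include <sys/xattr.h>

    int main(int argc, char **argv)
    {
            char buf[256];
            ssize_t n;

            if (argc != 2)
                    return 1;

            /* security.apparmor is now one of the xattrs EVM protects. */
            n = getxattr(argv[1], "security.apparmor", buf, sizeof(buf) - 1);
            if (n < 0) {
                    perror("getxattr");
                    return 1;
            }
            buf[n] = '\0';
            printf("security.apparmor: %s\n", buf);
            return 0;
    }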
Signed-off-by: Matthew Garrett Acked-by: John Johansen Signed-off-by: Mimi Zohar --- include/uapi/linux/xattr.h | 3 +++ security/integrity/evm/evm_main.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index 1590c49cae57..e630b9cd70cb 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -65,6 +65,9 @@ #define XATTR_NAME_SMACKTRANSMUTE XATTR_SECURITY_PREFIX XATTR_SMACK_TRANSMUTE #define XATTR_NAME_SMACKMMAP XATTR_SECURITY_PREFIX XATTR_SMACK_MMAP +#define XATTR_APPARMOR_SUFFIX "apparmor" +#define XATTR_NAME_APPARMOR XATTR_SECURITY_PREFIX XATTR_APPARMOR_SUFFIX + #define XATTR_CAPS_SUFFIX "capability" #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 063d38aef64e..9826c02e2db8 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -49,6 +49,9 @@ char *evm_config_xattrnames[] = { XATTR_NAME_SMACKMMAP, #endif #endif +#ifdef CONFIG_SECURITY_APPARMOR + XATTR_NAME_APPARMOR, +#endif #ifdef CONFIG_IMA_APPRAISE XATTR_NAME_IMA, #endif From f00d79750712511d0a83c108eea0d44b680a915f Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 11 Oct 2017 12:10:14 -0700 Subject: [PATCH 1032/1085] EVM: Allow userspace to signal an RSA key has been loaded EVM will only perform validation once a key has been loaded. This key may either be a symmetric trusted key (for HMAC validation and creation) or the public half of an asymmetric key (for digital signature validation). The /sys/kernel/security/evm interface allows userland to signal that a symmetric key has been loaded, but does not allow userland to signal that an asymmetric public key has been loaded. This patch extends the interface to permit userspace to pass a bitmask of loaded key types. It also allows userspace to block loading of a symmetric key in order to prevent a compromised system from being able to load an additional key type later. Signed-off-by: Matthew Garrett Signed-off-by: Mimi Zohar --- Documentation/ABI/testing/evm | 47 +++++++++++++++++++++--------- security/integrity/evm/evm.h | 3 ++ security/integrity/evm/evm_secfs.c | 31 +++++++++++--------- 3 files changed, 54 insertions(+), 27 deletions(-) diff --git a/Documentation/ABI/testing/evm b/Documentation/ABI/testing/evm index 8374d4557e5d..a0bbccb00736 100644 --- a/Documentation/ABI/testing/evm +++ b/Documentation/ABI/testing/evm @@ -7,17 +7,36 @@ Description: HMAC-sha1 value across the extended attributes, storing the value as the extended attribute 'security.evm'. - EVM depends on the Kernel Key Retention System to provide it - with a trusted/encrypted key for the HMAC-sha1 operation. - The key is loaded onto the root's keyring using keyctl. Until - EVM receives notification that the key has been successfully - loaded onto the keyring (echo 1 > <securityfs>/evm), EVM - can not create or validate the 'security.evm' xattr, but - returns INTEGRITY_UNKNOWN. Loading the key and signaling EVM - should be done as early as possible. Normally this is done - in the initramfs, which has already been measured as part - of the trusted boot. For more information on creating and - loading existing trusted/encrypted keys, refer to: - Documentation/keys-trusted-encrypted.txt. (A sample dracut - patch, which loads the trusted/encrypted key and enables - EVM, is available from http://linux-ima.sourceforge.net/#EVM.) + EVM supports two classes of security.evm.
The first is + an HMAC-sha1 generated locally with a + trusted/encrypted key stored in the Kernel Key + Retention System. The second is a digital signature + generated either locally or remotely using an + asymmetric key. These keys are loaded onto root's + keyring using keyctl, and EVM is then enabled by + echoing a value to <securityfs>/evm: + + 1: enable HMAC validation and creation + 2: enable digital signature validation + 3: enable HMAC and digital signature validation and HMAC + creation + + Further writes will be blocked if HMAC support is enabled or + if bit 32 is set: + + echo 0x80000002 ><securityfs>/evm + + will enable digital signature validation and block + further writes to <securityfs>/evm. + + Until this is done, EVM can not create or validate the + 'security.evm' xattr, but returns INTEGRITY_UNKNOWN. + Loading keys and signaling EVM should be done as early + as possible. Normally this is done in the initramfs, + which has already been measured as part of the trusted + boot. For more information on creating and loading + existing trusted/encrypted keys, refer to: + Documentation/keys-trusted-encrypted.txt. Both dracut + (via 97masterkey and 98integrity) and systemd (via + core/ima-setup) have support for loading keys at boot + time. diff --git a/security/integrity/evm/evm.h b/security/integrity/evm/evm.h index f5f12727771a..241aca315b0c 100644 --- a/security/integrity/evm/evm.h +++ b/security/integrity/evm/evm.h @@ -23,6 +23,9 @@ #define EVM_INIT_HMAC 0x0001 #define EVM_INIT_X509 0x0002 +#define EVM_SETUP 0x80000000 /* userland has signaled key load */ + +#define EVM_INIT_MASK (EVM_INIT_HMAC | EVM_INIT_X509 | EVM_SETUP) extern int evm_initialized; extern char *evm_hmac; diff --git a/security/integrity/evm/evm_secfs.c b/security/integrity/evm/evm_secfs.c index c8dccd54d501..319cf16d6603 100644 --- a/security/integrity/evm/evm_secfs.c +++ b/security/integrity/evm/evm_secfs.c @@ -40,7 +40,7 @@ static ssize_t evm_read_key(struct file *filp, char __user *buf, if (*ppos != 0) return 0; - sprintf(temp, "%d", evm_initialized); + sprintf(temp, "%d", (evm_initialized & ~EVM_SETUP)); rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); return rc; @@ -61,24 +61,29 @@ static ssize_t evm_read_key(struct file *filp, char __user *buf, static ssize_t evm_write_key(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char temp[80]; - int i; + int i, ret; - if (!capable(CAP_SYS_ADMIN) || (evm_initialized & EVM_INIT_HMAC)) + if (!capable(CAP_SYS_ADMIN) || (evm_initialized & EVM_SETUP)) return -EPERM; - if (count >= sizeof(temp) || count == 0) + ret = kstrtoint_from_user(buf, count, 0, &i); + + if (ret) + return ret; + + /* Reject invalid values */ + if (!i || (i & ~EVM_INIT_MASK) != 0) return -EINVAL; - if (copy_from_user(temp, buf, count) != 0) - return -EFAULT; + if (i & EVM_INIT_HMAC) { + ret = evm_init_key(); + if (ret != 0) + return ret; + /* Forbid further writes after the symmetric key is loaded */ + i |= EVM_SETUP; + } - temp[count] = '\0'; - - if ((sscanf(temp, "%d", &i) != 1) || (i != 1)) - return -EINVAL; - - evm_init_key(); + evm_initialized |= i; return count; } From 0485d066d82c308e28e76b7fc6cdec46ae46eeb6 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 11 Oct 2017 12:11:12 -0700 Subject: [PATCH 1033/1085] EVM: Only complain about a missing HMAC key once A system can validate EVM digital signatures without requiring an HMAC key, but every EVM validation will generate a kernel error. Change this so we only generate an error once.
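The one-shot error message in the next patch uses the kernel's pr_err_once() helper. Conceptually it is just a static guard around pr_err(); a simplified sketch of the idea (the real macro in include/linux/printk.h is more elaborate):

/* Simplified sketch of the printk_once idea: */
#define my_pr_err_once(fmt, ...)		\
do {						\
	static bool __print_once;		\
						\
	if (!__print_once) {			\
		__print_once = true;		\
		pr_err(fmt, ##__VA_ARGS__);	\
	}					\
} while (0)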
Signed-off-by: Matthew Garrett Signed-off-by: Mimi Zohar --- security/integrity/evm/evm_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 1d32cd20009a..bcd64baf8788 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -80,7 +80,7 @@ static struct shash_desc *init_desc(char type) if (type == EVM_XATTR_HMAC) { if (!(evm_initialized & EVM_INIT_HMAC)) { - pr_err("HMAC key is not set\n"); + pr_err_once("HMAC key is not set\n"); return ERR_PTR(-ENOKEY); } tfm = &hmac_tfm; From ebe7c0a7be92bbd34c6ff5b55810546a0ee05bee Mon Sep 17 00:00:00 2001 From: Boshi Wang Date: Fri, 20 Oct 2017 16:01:03 +0800 Subject: [PATCH 1034/1085] ima: fix hash algorithm initialization The hash_setup function always sets the hash_setup_done flag, even when the hash algorithm is invalid. This prevents the default hash algorithm defined as CONFIG_IMA_DEFAULT_HASH from being used. This patch sets the hash_setup_done flag only for valid hash algorithms. Fixes: e7a2ad7eb6f4 "ima: enable support for larger default filedata hash algorithms" Signed-off-by: Boshi Wang Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index d47f92e97f80..d6ddaad91e82 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -51,6 +51,8 @@ static int __init hash_setup(char *str) ima_hash_algo = HASH_ALGO_SHA1; else if (strncmp(str, "md5", 3) == 0) ima_hash_algo = HASH_ALGO_MD5; + else + return 1; goto out; } @@ -60,6 +62,8 @@ break; } } + if (i == HASH_ALGO__LAST) + return 1; out: hash_setup_done = 1; return 1; From fda784e50aace694ec2e4e16e2de07b91a938563 Mon Sep 17 00:00:00 2001 From: "Bruno E. O. Meneguele" Date: Tue, 24 Oct 2017 15:37:00 -0200 Subject: [PATCH 1035/1085] module: export module signature enforcement status A static variable sig_enforce is used as a status variable that reflects the effective value of CONFIG_MODULE_SIG_FORCE. Once that config option is set, the variable holds true; if the config option is not set, it holds whatever value is given in the module.sig_enforce kernel cmdline param: true when =1 and false when =0 or when the param is absent. Because the cmdline param takes precedence over the CONFIG value when the latter is not set, other places in the kernel could misbehave if they relied only on the CONFIG_MODULE_SIG_FORCE value. Exporting this status variable allows the kernel to rely on the effective value of module signature enforcement, whether it comes from the CONFIG value or from the cmdline param. Signed-off-by: Bruno E. O. Meneguele Signed-off-by: Mimi Zohar --- include/linux/module.h | 7 +++++++ kernel/module.c | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/module.h b/include/linux/module.h index fe5aa3736707..c69b49abe877 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -639,6 +639,8 @@ static inline bool is_livepatch_module(struct module *mod) } #endif /* CONFIG_LIVEPATCH */ +bool is_module_sig_enforced(void); + #else /* !CONFIG_MODULES...
*/ static inline struct module *__module_address(unsigned long addr) @@ -753,6 +755,11 @@ static inline bool module_requested_async_probing(struct module *module) return false; } +static inline bool is_module_sig_enforced(void) +{ + return false; +} + #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..d1c194b057a2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); #endif /* !CONFIG_MODULE_SIG_FORCE */ +/* + * Export sig_enforce kernel cmdline parameter to allow other subsystems rely + * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. + */ +bool is_module_sig_enforced(void) +{ + return sig_enforce; +} +EXPORT_SYMBOL(is_module_sig_enforced); + /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); From 7c9bc0983f890ed9782e755a0e070930cd979333 Mon Sep 17 00:00:00 2001 From: "Bruno E. O. Meneguele" Date: Tue, 24 Oct 2017 15:37:01 -0200 Subject: [PATCH 1036/1085] ima: check signature enforcement against cmdline param instead of CONFIG When the user requests the MODULE_CHECK policy and the kernel is compiled with CONFIG_MODULE_SIG_FORCE not set, no modules will load apart from those loaded at initramfs time. One option the user would have is to set the kernel cmdline param (module.sig_enforce) to true, but the IMA module check code doesn't rely on this value; it checks only CONFIG_MODULE_SIG_FORCE. This patch solves the problem by checking the exported value of the module.sig_enforce cmdline param instead of CONFIG_MODULE_SIG_FORCE, since the former holds the effective value (CONFIG || param). Signed-off-by: Bruno E. O. Meneguele Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index d6ddaad91e82..770654694efc 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -362,12 +362,12 @@ void ima_post_path_mknod(struct dentry *dentry) */ int ima_read_file(struct file *file, enum kernel_read_file_id read_id) { + bool sig_enforce = is_module_sig_enforced(); + if (!file && read_id == READING_MODULE) { -#ifndef CONFIG_MODULE_SIG_FORCE - if ((ima_appraise & IMA_APPRAISE_MODULES) && + if (!sig_enforce && (ima_appraise & IMA_APPRAISE_MODULES) && (ima_appraise & IMA_APPRAISE_ENFORCE)) return -EACCES; /* INTEGRITY_UNKNOWN */ -#endif return 0; /* We rely on module signature checking */ } return 0; From 39adb92598a7466e00f72bb8a197d8811017418a Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sat, 7 Oct 2017 16:02:21 +0200 Subject: [PATCH 1037/1085] ima: Fix bool initialization/comparison Bool initializations should use true and false. Bool tests don't need comparisons.
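The pattern being cleaned up is easiest to see side by side; a small sketch (do_work() is just a placeholder):

/* Before: integer idioms leak into boolean code. */
bool done = 1;
if (done == true)
	do_work();

/* After: initialize with true/false and test the bool directly. */
bool done = true;
if (done)
	do_work();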
Signed-off-by: Thomas Meyer Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_fs.c | 2 +- security/integrity/ima/ima_policy.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 4d50b982b453..fa540c0469da 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -32,7 +32,7 @@ bool ima_canonical_fmt; static int __init default_canonical_fmt_setup(char *str) { #ifdef __BIG_ENDIAN - ima_canonical_fmt = 1; + ima_canonical_fmt = true; #endif return 1; } diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 95209a5f8595..ee4613fa5840 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -196,9 +196,9 @@ static int __init policy_setup(char *str) if ((strcmp(p, "tcb") == 0) && !ima_policy) ima_policy = DEFAULT_TCB; else if (strcmp(p, "appraise_tcb") == 0) - ima_use_appraise_tcb = 1; + ima_use_appraise_tcb = true; else if (strcmp(p, "secure_boot") == 0) - ima_use_secure_boot = 1; + ima_use_secure_boot = true; } return 1; @@ -207,7 +207,7 @@ __setup("ima_policy=", policy_setup); static int __init default_appraise_policy_setup(char *str) { - ima_use_appraise_tcb = 1; + ima_use_appraise_tcb = true; return 1; } __setup("ima_appraise_tcb", default_appraise_policy_setup); From e5729f86a2987c9404f9b2fb494b9a6fc4412baf Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 17 Oct 2017 22:53:14 -0200 Subject: [PATCH 1038/1085] ima: Remove redundant conditional operator A non-zero value is converted to 1 when assigned to a bool variable, so the conditional operator in is_ima_appraise_enabled is redundant. The value of a comparison operator is either 1 or 0 so the conditional operator in ima_inode_setxattr is redundant as well. Confirmed that the patch is correct by comparing the object file from before and after the patch. They are identical. Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_appraise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 809ba70fbbbf..ec7dfa02c051 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -40,7 +40,7 @@ __setup("ima_appraise=", default_appraise_setup); */ bool is_ima_appraise_enabled(void) { - return (ima_appraise & IMA_APPRAISE_ENFORCE) ? 1 : 0; + return ima_appraise & IMA_APPRAISE_ENFORCE; } /* @@ -405,7 +405,7 @@ int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; ima_reset_appraise_flags(d_backing_inode(dentry), - (xvalue->type == EVM_IMA_XATTR_DIGSIG) ? 1 : 0); + xvalue->type == EVM_IMA_XATTR_DIGSIG); result = 0; } return result; From 399c5acd5837f6c93d784d8dd25ebae5a50362e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 6 Nov 2017 15:02:41 +0100 Subject: [PATCH 1039/1085] s390/dasd: avoid calling do_gettimeofday() do_gettimeofday() is deprecated because it's not y2038-safe on 32-bit architectures. Since it is basically a wrapper around ktime_get_real_ts64(), we can just call that function directly instead. 
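The conversion is mechanical, as the diff below shows; a minimal sketch of the pattern, where header stands for the dasd_eer trace header from the diff (any record with a 64-bit seconds field and a microseconds field works the same way):

struct timespec64 ts;

/* y2038-safe: tv_sec stays 64 bit even on 32-bit architectures */
ktime_get_real_ts64(&ts);
header.tv_sec = ts.tv_sec;
header.tv_usec = ts.tv_nsec / NSEC_PER_USEC;	/* ns -> us */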
Signed-off-by: Arnd Bergmann [sth@linux.vnet.ibm.com: fix build] Signed-off-by: Stefan Haberland Signed-off-by: Heiko Carstens --- drivers/s390/block/dasd_eer.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index 8713fefd794b..7b83ce96e034 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -295,7 +295,7 @@ static void dasd_eer_write_standard_trigger(struct dasd_device *device, { struct dasd_ccw_req *temp_cqr; int data_size; - struct timeval tv; + struct timespec64 ts; struct dasd_eer_header header; unsigned long flags; struct eerbuffer *eerb; @@ -309,9 +309,9 @@ static void dasd_eer_write_standard_trigger(struct dasd_device *device, header.total_size = sizeof(header) + data_size + 4; /* "EOR" */ header.trigger = trigger; - do_gettimeofday(&tv); - header.tv_sec = tv.tv_sec; - header.tv_usec = tv.tv_usec; + ktime_get_real_ts64(&ts); + header.tv_sec = ts.tv_sec; + header.tv_usec = ts.tv_nsec / NSEC_PER_USEC; strncpy(header.busid, dev_name(&device->cdev->dev), DASD_EER_BUSID_SIZE); @@ -339,7 +339,7 @@ static void dasd_eer_write_snss_trigger(struct dasd_device *device, { int data_size; int snss_rc; - struct timeval tv; + struct timespec64 ts; struct dasd_eer_header header; unsigned long flags; struct eerbuffer *eerb; @@ -352,9 +352,9 @@ static void dasd_eer_write_snss_trigger(struct dasd_device *device, header.total_size = sizeof(header) + data_size + 4; /* "EOR" */ header.trigger = DASD_EER_STATECHANGE; - do_gettimeofday(&tv); - header.tv_sec = tv.tv_sec; - header.tv_usec = tv.tv_usec; + ktime_get_real_ts64(&ts); + header.tv_sec = ts.tv_sec; + header.tv_usec = ts.tv_nsec / NSEC_PER_USEC; strncpy(header.busid, dev_name(&device->cdev->dev), DASD_EER_BUSID_SIZE); From dac6dc267d0e906fa9263462d4ebd78970a8b511 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 6 Nov 2017 13:28:38 +0100 Subject: [PATCH 1040/1085] s390/disassembler: remove insn_to_mnemonic() insn_to_mnemonic() was introduced ages ago for KVM debugging, but has been unused for quite some time. Therefore remove it. Acked-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/include/asm/dis.h | 1 - arch/s390/kernel/dis.c | 28 ---------------------------- 2 files changed, 29 deletions(-) diff --git a/arch/s390/include/asm/dis.h b/arch/s390/include/asm/dis.h index 37f617dfbede..793508c8fc48 100644 --- a/arch/s390/include/asm/dis.h +++ b/arch/s390/include/asm/dis.h @@ -44,7 +44,6 @@ struct pt_regs; void show_code(struct pt_regs *regs); void print_fn_code(unsigned char *code, unsigned long len); -int insn_to_mnemonic(unsigned char *instruction, char *buf, unsigned int len); struct s390_insn *find_insn(unsigned char *code); static inline int is_known_insn(unsigned char *code) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 94e96a7bd6d8..2cf4a084763e 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1961,34 +1961,6 @@ struct s390_insn *find_insn(unsigned char *code) return NULL; } -/** - * insn_to_mnemonic - decode an s390 instruction - * @instruction: instruction to decode - * @buf: buffer to fill with mnemonic - * @len: length of buffer - * - * Decode the instruction at @instruction and store the corresponding - * mnemonic into @buf of length @len. - * @buf is left unchanged if the instruction could not be decoded. - * Returns: - * %0 on success, %-ENOENT if the instruction was not found.
- */ -int insn_to_mnemonic(unsigned char *instruction, char *buf, unsigned int len) -{ - struct s390_insn *insn; - - insn = find_insn(instruction); - if (!insn) - return -ENOENT; - if (insn->name[0] == '\0') - snprintf(buf, len, "%s", - long_insn_name[(int) insn->name[1]]); - else - snprintf(buf, len, "%.5s", insn->name); - return 0; -} -EXPORT_SYMBOL_GPL(insn_to_mnemonic); - static int print_insn(char *buffer, unsigned char *code, unsigned long addr) { struct s390_insn *insn; From 8bc1e4ec79c0fcd48ff8914566fe960c34f27bc2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 6 Nov 2017 13:29:56 +0100 Subject: [PATCH 1041/1085] s390/disassembler: generate opcode tables from text file The current way of adding new instructions to the opcode tables is painful and error prone. Therefore add, similar to binutils, a text file which contains all opcodes and the corresponding mnemonics and instruction formats. A small gen_opcode_table tool then generates a header file with the required enums and opcode table initializers at the prepare step of the kernel build. This way only a simple text file has to be maintained, which can be rather easily extended. Unlike before, where there were plenty of opcode tables and a large switch statement to find the correct one, there is now only one opcode table which contains all instructions. A second opcode offset table now contains offsets within the opcode table to find instructions which have the same opcode prefix. In order to save space all 1-byte opcode instructions are grouped together at the end of the opcode table. This is also quite similar to how it was done before. In addition, move and change code and definitions within the disassembler. As a side effect this reduces the size required for the code and opcode tables by ~1.5k.
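To sketch the two-level lookup this enables (simplified; the real find_insn() in the diff below also handles 1-byte opcodes specially): the offset table maps the first opcode byte to a slice of the single opcode table, which is then scanned for a matching opcode fragment:

/*
 * Simplified sketch of the generated-table lookup. opcode_offset[] and
 * opcode[] are the generated tables; entry->byte and entry->mask pick
 * the opcode fragment inside the instruction to compare against.
 */
static struct s390_insn *lookup_insn(unsigned char *code)
{
	const struct s390_opcode_offset *entry;
	struct s390_insn *insn;
	unsigned char opfrag;
	int i, j;

	for (i = 0; i < ARRAY_SIZE(opcode_offset); i++) {
		entry = &opcode_offset[i];
		if (entry->opcode != code[0])
			continue;
		opfrag = code[entry->byte] & entry->mask;
		insn = &opcode[entry->offset];
		for (j = 0; j < entry->count; j++, insn++) {
			if (insn->opfrag == opfrag)
				return insn;
		}
		break;
	}
	return NULL;	/* unknown instruction */
}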
Signed-off-by: Heiko Carstens --- arch/s390/Makefile | 1 + arch/s390/include/asm/dis.h | 27 +- arch/s390/kernel/dis.c | 2120 ++++------------------------ arch/s390/tools/Makefile | 10 + arch/s390/tools/gen_opcode_table.c | 336 +++++ arch/s390/tools/opcodes.txt | 1183 ++++++++++++++++ 6 files changed, 1817 insertions(+), 1860 deletions(-) create mode 100644 arch/s390/tools/gen_opcode_table.c create mode 100644 arch/s390/tools/opcodes.txt diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 64c2fe9dcfbe..6b3f41985f28 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -133,6 +133,7 @@ archclean: archprepare: $(Q)$(MAKE) $(build)=$(tools) include/generated/facilities.h + $(Q)$(MAKE) $(build)=$(tools) include/generated/dis.h # Don't use tabs in echo arguments define archhelp diff --git a/arch/s390/include/asm/dis.h b/arch/s390/include/asm/dis.h index 793508c8fc48..b267287e35b6 100644 --- a/arch/s390/include/asm/dis.h +++ b/arch/s390/include/asm/dis.h @@ -8,32 +8,7 @@ #ifndef __ASM_S390_DIS_H__ #define __ASM_S390_DIS_H__ -/* Type of operand */ -#define OPERAND_GPR 0x1 /* Operand printed as %rx */ -#define OPERAND_FPR 0x2 /* Operand printed as %fx */ -#define OPERAND_AR 0x4 /* Operand printed as %ax */ -#define OPERAND_CR 0x8 /* Operand printed as %cx */ -#define OPERAND_VR 0x10 /* Operand printed as %vx */ -#define OPERAND_DISP 0x20 /* Operand printed as displacement */ -#define OPERAND_BASE 0x40 /* Operand printed as base register */ -#define OPERAND_INDEX 0x80 /* Operand printed as index register */ -#define OPERAND_PCREL 0x100 /* Operand printed as pc-relative symbol */ -#define OPERAND_SIGNED 0x200 /* Operand printed as signed value */ -#define OPERAND_LENGTH 0x400 /* Operand printed as length (+1) */ - - -struct s390_operand { - int bits; /* The number of bits in the operand. */ - int shift; /* The number of bits to shift. */ - int flags; /* One bit syntax flags. */ -}; - -struct s390_insn { - const char name[5]; - unsigned char opfrag; - unsigned char format; -}; - +#include static inline int insn_length(unsigned char code) { diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 2cf4a084763e..b811d3a8417d 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -21,52 +21,91 @@ #include #include #include - #include +#include #include #include -#include #include #include #include #include +/* Type of operand */ +#define OPERAND_GPR 0x1 /* Operand printed as %rx */ +#define OPERAND_FPR 0x2 /* Operand printed as %fx */ +#define OPERAND_AR 0x4 /* Operand printed as %ax */ +#define OPERAND_CR 0x8 /* Operand printed as %cx */ +#define OPERAND_VR 0x10 /* Operand printed as %vx */ +#define OPERAND_DISP 0x20 /* Operand printed as displacement */ +#define OPERAND_BASE 0x40 /* Operand printed as base register */ +#define OPERAND_INDEX 0x80 /* Operand printed as index register */ +#define OPERAND_PCREL 0x100 /* Operand printed as pc-relative symbol */ +#define OPERAND_SIGNED 0x200 /* Operand printed as signed value */ +#define OPERAND_LENGTH 0x400 /* Operand printed as length (+1) */ + +struct s390_operand { + unsigned char bits; /* The number of bits in the operand. */ + unsigned char shift; /* The number of bits to shift. */ + unsigned short flags; /* One bit syntax flags. 
*/ +}; + +struct s390_insn { + union { + const char name[5]; + struct { + unsigned char zero; + unsigned int offset; + } __packed; + }; + unsigned char opfrag; + unsigned char format; +}; + +struct s390_opcode_offset { + unsigned char opcode; + unsigned char mask; + unsigned char byte; + unsigned short offset; + unsigned short count; +} __packed; + enum { - UNUSED, /* Indicates the end of the operand list */ - R_8, /* GPR starting at position 8 */ - R_12, /* GPR starting at position 12 */ - R_16, /* GPR starting at position 16 */ - R_20, /* GPR starting at position 20 */ - R_24, /* GPR starting at position 24 */ - R_28, /* GPR starting at position 28 */ - R_32, /* GPR starting at position 32 */ - F_8, /* FPR starting at position 8 */ - F_12, /* FPR starting at position 12 */ - F_16, /* FPR starting at position 16 */ - F_20, /* FPR starting at position 16 */ - F_24, /* FPR starting at position 24 */ - F_28, /* FPR starting at position 28 */ - F_32, /* FPR starting at position 32 */ + UNUSED, A_8, /* Access reg. starting at position 8 */ A_12, /* Access reg. starting at position 12 */ A_24, /* Access reg. starting at position 24 */ A_28, /* Access reg. starting at position 28 */ - C_8, /* Control reg. starting at position 8 */ - C_12, /* Control reg. starting at position 12 */ - V_8, /* Vector reg. starting at position 8, extension bit at 36 */ - V_12, /* Vector reg. starting at position 12, extension bit at 37 */ - V_16, /* Vector reg. starting at position 16, extension bit at 38 */ - V_32, /* Vector reg. starting at position 32, extension bit at 39 */ - W_12, /* Vector reg. at bit 12, extension at bit 37, used as index */ B_16, /* Base register starting at position 16 */ B_32, /* Base register starting at position 32 */ - X_12, /* Index register starting at position 12 */ + C_8, /* Control reg. starting at position 8 */ + C_12, /* Control reg. 
starting at position 12 */ + D20_20, /* 20 bit displacement starting at 20 */ D_20, /* Displacement starting at position 20 */ D_36, /* Displacement starting at position 36 */ - D20_20, /* 20 bit displacement starting at 20 */ + F_8, /* FPR starting at position 8 */ + F_12, /* FPR starting at position 12 */ + F_16, /* FPR starting at position 16 */ + F_24, /* FPR starting at position 24 */ + F_28, /* FPR starting at position 28 */ + F_32, /* FPR starting at position 32 */ + I8_8, /* 8 bit signed value starting at 8 */ + I8_32, /* 8 bit signed value starting at 32 */ + I16_16, /* 16 bit signed value starting at 16 */ + I16_32, /* 16 bit signed value starting at 32 */ + I32_16, /* 32 bit signed value starting at 16 */ + J12_12, /* 12 bit PC relative offset at 12 */ + J16_16, /* 16 bit PC relative offset at 16 */ + J16_32, /* 16 bit PC relative offset at 32 */ + J24_24, /* 24 bit PC relative offset at 24 */ + J32_16, /* 32 bit PC relative offset at 16 */ L4_8, /* 4 bit length starting at position 8 */ L4_12, /* 4 bit length starting at position 12 */ L8_8, /* 8 bit length starting at position 8 */ + R_8, /* GPR starting at position 8 */ + R_12, /* GPR starting at position 12 */ + R_16, /* GPR starting at position 16 */ + R_24, /* GPR starting at position 24 */ + R_28, /* GPR starting at position 28 */ U4_8, /* 4 bit unsigned value starting at 8 */ U4_12, /* 4 bit unsigned value starting at 12 */ U4_16, /* 4 bit unsigned value starting at 16 */ @@ -80,1748 +119,224 @@ enum { U8_24, /* 8 bit unsigned value starting at 24 */ U8_28, /* 8 bit unsigned value starting at 28 */ U8_32, /* 8 bit unsigned value starting at 32 */ - I8_8, /* 8 bit signed value starting at 8 */ - I8_16, /* 8 bit signed value starting at 16 */ - I8_24, /* 8 bit signed value starting at 24 */ - I8_32, /* 8 bit signed value starting at 32 */ - J12_12, /* PC relative offset at 12 */ - I16_16, /* 16 bit signed value starting at 16 */ - I16_32, /* 32 bit signed value starting at 16 */ - U16_16, /* 16 bit unsigned value starting at 16 */ - U16_32, /* 32 bit unsigned value starting at 16 */ - J16_16, /* PC relative jump offset at 16 */ - J16_32, /* PC relative offset at 16 */ - I24_24, /* 24 bit signed value starting at 24 */ - J32_16, /* PC relative long offset at 16 */ - I32_16, /* 32 bit signed value starting at 16 */ - U32_16, /* 32 bit unsigned value starting at 16 */ - M_16, /* 4 bit optional mask starting at 16 */ - M_20, /* 4 bit optional mask starting at 20 */ - M_24, /* 4 bit optional mask starting at 24 */ - M_28, /* 4 bit optional mask starting at 28 */ - M_32, /* 4 bit optional mask starting at 32 */ - RO_28, /* optional GPR starting at position 28 */ + U12_16, /* 12 bit unsigned value starting at 16 */ + U16_16, /* 16 bit unsigned value starting at 16 */ + U16_32, /* 16 bit unsigned value starting at 32 */ + U32_16, /* 32 bit unsigned value starting at 16 */ + VX_12, /* Vector index register starting at position 12 */ + V_8, /* Vector reg. starting at position 8 */ + V_12, /* Vector reg. starting at position 12 */ + V_16, /* Vector reg. starting at position 16 */ + V_32, /* Vector reg. starting at position 32 */ + X_12, /* Index register starting at position 12 */ }; -/* - * Enumeration of the different instruction formats. - * For details consult the principles of operation. 
- */ -enum { - INSTR_INVALID, - INSTR_E, - INSTR_IE_UU, - INSTR_MII_UPI, - INSTR_RIE_R0IU, INSTR_RIE_R0UU, INSTR_RIE_RRP, INSTR_RIE_RRPU, - INSTR_RIE_RRUUU, INSTR_RIE_RUPI, INSTR_RIE_RUPU, INSTR_RIE_RRI0, - INSTR_RIE_RUI0, INSTR_RIL_RI, INSTR_RIL_RP, INSTR_RIL_RU, INSTR_RIL_UP, - INSTR_RIS_R0RDU, INSTR_RIS_R0UU, INSTR_RIS_RURDI, INSTR_RIS_RURDU, - INSTR_RI_RI, INSTR_RI_RP, INSTR_RI_RU, INSTR_RI_UP, - INSTR_RRE_00, INSTR_RRE_0R, INSTR_RRE_AA, INSTR_RRE_AR, INSTR_RRE_F0, - INSTR_RRE_FF, INSTR_RRE_FR, INSTR_RRE_R0, INSTR_RRE_RA, INSTR_RRE_RF, - INSTR_RRE_RR, INSTR_RRE_RR_OPT, - INSTR_RRF_0UFF, INSTR_RRF_F0FF, INSTR_RRF_F0FF2, INSTR_RRF_F0FR, - INSTR_RRF_FFRU, INSTR_RRF_FUFF, INSTR_RRF_FUFF2, INSTR_RRF_M0RR, - INSTR_RRF_R0RR, INSTR_RRF_R0RR2, INSTR_RRF_RMRR, INSTR_RRF_RURR, - INSTR_RRF_U0FF, INSTR_RRF_U0RF, INSTR_RRF_U0RR, INSTR_RRF_UUFF, - INSTR_RRF_UUFR, INSTR_RRF_UURF, - INSTR_RRR_F0FF, INSTR_RRS_RRRDU, - INSTR_RR_FF, INSTR_RR_R0, INSTR_RR_RR, INSTR_RR_U0, INSTR_RR_UR, - INSTR_RSE_CCRD, INSTR_RSE_RRRD, INSTR_RSE_RURD, - INSTR_RSI_RRP, - INSTR_RSL_LRDFU, INSTR_RSL_R0RD, - INSTR_RSY_AARD, INSTR_RSY_CCRD, INSTR_RSY_RRRD, INSTR_RSY_RURD, - INSTR_RSY_RURD2, INSTR_RSY_RDRM, INSTR_RSY_RMRD, - INSTR_RS_AARD, INSTR_RS_CCRD, INSTR_RS_R0RD, INSTR_RS_RRRD, - INSTR_RS_RURD, - INSTR_RXE_FRRD, INSTR_RXE_RRRD, INSTR_RXE_RRRDM, - INSTR_RXF_FRRDF, - INSTR_RXY_FRRD, INSTR_RXY_RRRD, INSTR_RXY_URRD, - INSTR_RX_FRRD, INSTR_RX_RRRD, INSTR_RX_URRD, - INSTR_SIL_RDI, INSTR_SIL_RDU, - INSTR_SIY_IRD, INSTR_SIY_URD, - INSTR_SI_URD, - INSTR_SMI_U0RDP, - INSTR_SSE_RDRD, - INSTR_SSF_RRDRD, INSTR_SSF_RRDRD2, - INSTR_SS_L0RDRD, INSTR_SS_LIRDRD, INSTR_SS_LLRDRD, INSTR_SS_RRRDRD, - INSTR_SS_RRRDRD2, INSTR_SS_RRRDRD3, - INSTR_S_00, INSTR_S_RD, - INSTR_VRI_V0IM, INSTR_VRI_V0I0, INSTR_VRI_V0IIM, INSTR_VRI_VVIM, - INSTR_VRI_VVV0IM, INSTR_VRI_VVV0I0, INSTR_VRI_VVIMM, - INSTR_VRR_VV00MMM, INSTR_VRR_VV000MM, INSTR_VRR_VV0000M, - INSTR_VRR_VV00000, INSTR_VRR_VVV0M0M, INSTR_VRR_VV00M0M, - INSTR_VRR_VVV000M, INSTR_VRR_VVV000V, INSTR_VRR_VVV0000, - INSTR_VRR_VVV0MMM, INSTR_VRR_VVV00MM, INSTR_VRR_VVVMM0V, - INSTR_VRR_VVVM0MV, INSTR_VRR_VVVM00V, INSTR_VRR_VRR0000, - INSTR_VRS_VVRDM, INSTR_VRS_VVRD0, INSTR_VRS_VRRDM, INSTR_VRS_VRRD0, - INSTR_VRS_RVRDM, - INSTR_VRV_VVRDM, INSTR_VRV_VWRDM, - INSTR_VRX_VRRDM, INSTR_VRX_VRRD0, - INSTR_VSI_URDV, INSTR_VRS_RRDV, INSTR_VRI_V0UU2, INSTR_VRR_RV0U, - INSTR_VRI_VR0UU, INSTR_VRI_VVUUU2, INSTR_VRR_0V, INSTR_VRI_VVV0UU2, - INSTR_VRR_0VV0U, INSTR_VRR_VVV, INSTR_VRR_VVVU0UV, - INSTR_VRR_VVVUU0V, INSTR_VRR_VVV0UUU, -}; - -static const struct s390_operand operands[] = -{ - [UNUSED] = { 0, 0, 0 }, - [R_8] = { 4, 8, OPERAND_GPR }, - [R_12] = { 4, 12, OPERAND_GPR }, - [R_16] = { 4, 16, OPERAND_GPR }, - [R_20] = { 4, 20, OPERAND_GPR }, - [R_24] = { 4, 24, OPERAND_GPR }, - [R_28] = { 4, 28, OPERAND_GPR }, - [R_32] = { 4, 32, OPERAND_GPR }, - [F_8] = { 4, 8, OPERAND_FPR }, - [F_12] = { 4, 12, OPERAND_FPR }, - [F_16] = { 4, 16, OPERAND_FPR }, - [F_20] = { 4, 16, OPERAND_FPR }, - [F_24] = { 4, 24, OPERAND_FPR }, - [F_28] = { 4, 28, OPERAND_FPR }, - [F_32] = { 4, 32, OPERAND_FPR }, +static const struct s390_operand operands[] = { + [UNUSED] = { 0, 0, 0 }, [A_8] = { 4, 8, OPERAND_AR }, [A_12] = { 4, 12, OPERAND_AR }, [A_24] = { 4, 24, OPERAND_AR }, [A_28] = { 4, 28, OPERAND_AR }, + [B_16] = { 4, 16, OPERAND_BASE | OPERAND_GPR }, + [B_32] = { 4, 32, OPERAND_BASE | OPERAND_GPR }, [C_8] = { 4, 8, OPERAND_CR }, [C_12] = { 4, 12, OPERAND_CR }, + [D20_20] = { 20, 20, OPERAND_DISP | OPERAND_SIGNED }, + [D_20] = { 12, 20, 
OPERAND_DISP }, + [D_36] = { 12, 36, OPERAND_DISP }, + [F_8] = { 4, 8, OPERAND_FPR }, + [F_12] = { 4, 12, OPERAND_FPR }, + [F_16] = { 4, 16, OPERAND_FPR }, + [F_24] = { 4, 24, OPERAND_FPR }, + [F_28] = { 4, 28, OPERAND_FPR }, + [F_32] = { 4, 32, OPERAND_FPR }, + [I8_8] = { 8, 8, OPERAND_SIGNED }, + [I8_32] = { 8, 32, OPERAND_SIGNED }, + [I16_16] = { 16, 16, OPERAND_SIGNED }, + [I16_32] = { 16, 32, OPERAND_SIGNED }, + [I32_16] = { 32, 16, OPERAND_SIGNED }, + [J12_12] = { 12, 12, OPERAND_PCREL }, + [J16_16] = { 16, 16, OPERAND_PCREL }, + [J16_32] = { 16, 32, OPERAND_PCREL }, + [J24_24] = { 24, 24, OPERAND_PCREL }, + [J32_16] = { 32, 16, OPERAND_PCREL }, + [L4_8] = { 4, 8, OPERAND_LENGTH }, + [L4_12] = { 4, 12, OPERAND_LENGTH }, + [L8_8] = { 8, 8, OPERAND_LENGTH }, + [R_8] = { 4, 8, OPERAND_GPR }, + [R_12] = { 4, 12, OPERAND_GPR }, + [R_16] = { 4, 16, OPERAND_GPR }, + [R_24] = { 4, 24, OPERAND_GPR }, + [R_28] = { 4, 28, OPERAND_GPR }, + [U4_8] = { 4, 8, 0 }, + [U4_12] = { 4, 12, 0 }, + [U4_16] = { 4, 16, 0 }, + [U4_20] = { 4, 20, 0 }, + [U4_24] = { 4, 24, 0 }, + [U4_28] = { 4, 28, 0 }, + [U4_32] = { 4, 32, 0 }, + [U4_36] = { 4, 36, 0 }, + [U8_8] = { 8, 8, 0 }, + [U8_16] = { 8, 16, 0 }, + [U8_24] = { 8, 24, 0 }, + [U8_28] = { 8, 28, 0 }, + [U8_32] = { 8, 32, 0 }, + [U12_16] = { 12, 16, 0 }, + [U16_16] = { 16, 16, 0 }, + [U16_32] = { 16, 32, 0 }, + [U32_16] = { 32, 16, 0 }, + [VX_12] = { 4, 12, OPERAND_INDEX | OPERAND_VR }, [V_8] = { 4, 8, OPERAND_VR }, [V_12] = { 4, 12, OPERAND_VR }, [V_16] = { 4, 16, OPERAND_VR }, [V_32] = { 4, 32, OPERAND_VR }, - [W_12] = { 4, 12, OPERAND_INDEX | OPERAND_VR }, - [B_16] = { 4, 16, OPERAND_BASE | OPERAND_GPR }, - [B_32] = { 4, 32, OPERAND_BASE | OPERAND_GPR }, [X_12] = { 4, 12, OPERAND_INDEX | OPERAND_GPR }, - [D_20] = { 12, 20, OPERAND_DISP }, - [D_36] = { 12, 36, OPERAND_DISP }, - [D20_20] = { 20, 20, OPERAND_DISP | OPERAND_SIGNED }, - [L4_8] = { 4, 8, OPERAND_LENGTH }, - [L4_12] = { 4, 12, OPERAND_LENGTH }, - [L8_8] = { 8, 8, OPERAND_LENGTH }, - [U4_8] = { 4, 8, 0 }, - [U4_12] = { 4, 12, 0 }, - [U4_16] = { 4, 16, 0 }, - [U4_20] = { 4, 20, 0 }, - [U4_24] = { 4, 24, 0 }, - [U4_28] = { 4, 28, 0 }, - [U4_32] = { 4, 32, 0 }, - [U4_36] = { 4, 36, 0 }, - [U8_8] = { 8, 8, 0 }, - [U8_16] = { 8, 16, 0 }, - [U8_24] = { 8, 24, 0 }, - [U8_28] = { 8, 28, 0 }, - [U8_32] = { 8, 32, 0 }, - [J12_12] = { 12, 12, OPERAND_PCREL }, - [I8_8] = { 8, 8, OPERAND_SIGNED }, - [I8_16] = { 8, 16, OPERAND_SIGNED }, - [I8_24] = { 8, 24, OPERAND_SIGNED }, - [I8_32] = { 8, 32, OPERAND_SIGNED }, - [I16_32] = { 16, 32, OPERAND_SIGNED }, - [I16_16] = { 16, 16, OPERAND_SIGNED }, - [U16_16] = { 16, 16, 0 }, - [U16_32] = { 16, 32, 0 }, - [J16_16] = { 16, 16, OPERAND_PCREL }, - [J16_32] = { 16, 32, OPERAND_PCREL }, - [I24_24] = { 24, 24, OPERAND_SIGNED }, - [J32_16] = { 32, 16, OPERAND_PCREL }, - [I32_16] = { 32, 16, OPERAND_SIGNED }, - [U32_16] = { 32, 16, 0 }, - [M_16] = { 4, 16, 0 }, - [M_20] = { 4, 20, 0 }, - [M_24] = { 4, 24, 0 }, - [M_28] = { 4, 28, 0 }, - [M_32] = { 4, 32, 0 }, - [RO_28] = { 4, 28, OPERAND_GPR } }; -static const unsigned char formats[][7] = { - [INSTR_E] = { 0xff, 0,0,0,0,0,0 }, - [INSTR_IE_UU] = { 0xff, U4_24,U4_28,0,0,0,0 }, - [INSTR_MII_UPI] = { 0xff, U4_8,J12_12,I24_24 }, - [INSTR_RIE_R0IU] = { 0xff, R_8,I16_16,U4_32,0,0,0 }, - [INSTR_RIE_R0UU] = { 0xff, R_8,U16_16,U4_32,0,0,0 }, - [INSTR_RIE_RRI0] = { 0xff, R_8,R_12,I16_16,0,0,0 }, - [INSTR_RIE_RRPU] = { 0xff, R_8,R_12,U4_32,J16_16,0,0 }, - [INSTR_RIE_RRP] = { 0xff, R_8,R_12,J16_16,0,0,0 }, - [INSTR_RIE_RRUUU] = { 
0xff, R_8,R_12,U8_16,U8_24,U8_32,0 }, - [INSTR_RIE_RUI0] = { 0xff, R_8,I16_16,U4_12,0,0,0 }, - [INSTR_RIE_RUPI] = { 0xff, R_8,I8_32,U4_12,J16_16,0,0 }, - [INSTR_RIE_RUPU] = { 0xff, R_8,U8_32,U4_12,J16_16,0,0 }, - [INSTR_RIL_RI] = { 0x0f, R_8,I32_16,0,0,0,0 }, - [INSTR_RIL_RP] = { 0x0f, R_8,J32_16,0,0,0,0 }, - [INSTR_RIL_RU] = { 0x0f, R_8,U32_16,0,0,0,0 }, - [INSTR_RIL_UP] = { 0x0f, U4_8,J32_16,0,0,0,0 }, - [INSTR_RIS_R0RDU] = { 0xff, R_8,U8_32,D_20,B_16,0,0 }, - [INSTR_RIS_RURDI] = { 0xff, R_8,I8_32,U4_12,D_20,B_16,0 }, - [INSTR_RIS_RURDU] = { 0xff, R_8,U8_32,U4_12,D_20,B_16,0 }, - [INSTR_RI_RI] = { 0x0f, R_8,I16_16,0,0,0,0 }, - [INSTR_RI_RP] = { 0x0f, R_8,J16_16,0,0,0,0 }, - [INSTR_RI_RU] = { 0x0f, R_8,U16_16,0,0,0,0 }, - [INSTR_RI_UP] = { 0x0f, U4_8,J16_16,0,0,0,0 }, - [INSTR_RRE_00] = { 0xff, 0,0,0,0,0,0 }, - [INSTR_RRE_0R] = { 0xff, R_28,0,0,0,0,0 }, - [INSTR_RRE_AA] = { 0xff, A_24,A_28,0,0,0,0 }, - [INSTR_RRE_AR] = { 0xff, A_24,R_28,0,0,0,0 }, - [INSTR_RRE_F0] = { 0xff, F_24,0,0,0,0,0 }, - [INSTR_RRE_FF] = { 0xff, F_24,F_28,0,0,0,0 }, - [INSTR_RRE_FR] = { 0xff, F_24,R_28,0,0,0,0 }, - [INSTR_RRE_R0] = { 0xff, R_24,0,0,0,0,0 }, - [INSTR_RRE_RA] = { 0xff, R_24,A_28,0,0,0,0 }, - [INSTR_RRE_RF] = { 0xff, R_24,F_28,0,0,0,0 }, - [INSTR_RRE_RR] = { 0xff, R_24,R_28,0,0,0,0 }, - [INSTR_RRE_RR_OPT]= { 0xff, R_24,RO_28,0,0,0,0 }, - [INSTR_RRF_0UFF] = { 0xff, F_24,F_28,U4_20,0,0,0 }, - [INSTR_RRF_F0FF2] = { 0xff, F_24,F_16,F_28,0,0,0 }, - [INSTR_RRF_F0FF] = { 0xff, F_16,F_24,F_28,0,0,0 }, - [INSTR_RRF_F0FR] = { 0xff, F_24,F_16,R_28,0,0,0 }, - [INSTR_RRF_FFRU] = { 0xff, F_24,F_16,R_28,U4_20,0,0 }, - [INSTR_RRF_FUFF] = { 0xff, F_24,F_16,F_28,U4_20,0,0 }, - [INSTR_RRF_FUFF2] = { 0xff, F_24,F_28,F_16,U4_20,0,0 }, - [INSTR_RRF_M0RR] = { 0xff, R_24,R_28,M_16,0,0,0 }, - [INSTR_RRF_R0RR] = { 0xff, R_24,R_16,R_28,0,0,0 }, - [INSTR_RRF_R0RR2] = { 0xff, R_24,R_28,R_16,0,0,0 }, - [INSTR_RRF_RMRR] = { 0xff, R_24,R_16,R_28,M_20,0,0 }, - [INSTR_RRF_RURR] = { 0xff, R_24,R_28,R_16,U4_20,0,0 }, - [INSTR_RRF_U0FF] = { 0xff, F_24,U4_16,F_28,0,0,0 }, - [INSTR_RRF_U0RF] = { 0xff, R_24,U4_16,F_28,0,0,0 }, - [INSTR_RRF_U0RR] = { 0xff, R_24,R_28,U4_16,0,0,0 }, - [INSTR_RRF_UUFF] = { 0xff, F_24,U4_16,F_28,U4_20,0,0 }, - [INSTR_RRF_UUFR] = { 0xff, F_24,U4_16,R_28,U4_20,0,0 }, - [INSTR_RRF_UURF] = { 0xff, R_24,U4_16,F_28,U4_20,0,0 }, - [INSTR_RRR_F0FF] = { 0xff, F_24,F_28,F_16,0,0,0 }, - [INSTR_RRS_RRRDU] = { 0xff, R_8,R_12,U4_32,D_20,B_16,0 }, - [INSTR_RR_FF] = { 0xff, F_8,F_12,0,0,0,0 }, - [INSTR_RR_R0] = { 0xff, R_8, 0,0,0,0,0 }, - [INSTR_RR_RR] = { 0xff, R_8,R_12,0,0,0,0 }, - [INSTR_RR_U0] = { 0xff, U8_8, 0,0,0,0,0 }, - [INSTR_RR_UR] = { 0xff, U4_8,R_12,0,0,0,0 }, - [INSTR_RSE_CCRD] = { 0xff, C_8,C_12,D_20,B_16,0,0 }, - [INSTR_RSE_RRRD] = { 0xff, R_8,R_12,D_20,B_16,0,0 }, - [INSTR_RSE_RURD] = { 0xff, R_8,U4_12,D_20,B_16,0,0 }, - [INSTR_RSI_RRP] = { 0xff, R_8,R_12,J16_16,0,0,0 }, - [INSTR_RSL_LRDFU] = { 0xff, F_32,D_20,L8_8,B_16,U4_36,0 }, - [INSTR_RSL_R0RD] = { 0xff, D_20,L4_8,B_16,0,0,0 }, - [INSTR_RSY_AARD] = { 0xff, A_8,A_12,D20_20,B_16,0,0 }, - [INSTR_RSY_CCRD] = { 0xff, C_8,C_12,D20_20,B_16,0,0 }, - [INSTR_RSY_RDRM] = { 0xff, R_8,D20_20,B_16,U4_12,0,0 }, - [INSTR_RSY_RMRD] = { 0xff, R_8,U4_12,D20_20,B_16,0,0 }, - [INSTR_RSY_RRRD] = { 0xff, R_8,R_12,D20_20,B_16,0,0 }, - [INSTR_RSY_RURD] = { 0xff, R_8,U4_12,D20_20,B_16,0,0 }, - [INSTR_RSY_RURD2] = { 0xff, R_8,D20_20,B_16,U4_12,0,0 }, - [INSTR_RS_AARD] = { 0xff, A_8,A_12,D_20,B_16,0,0 }, - [INSTR_RS_CCRD] = { 0xff, C_8,C_12,D_20,B_16,0,0 }, - [INSTR_RS_R0RD] = { 0xff, 
R_8,D_20,B_16,0,0,0 }, - [INSTR_RS_RRRD] = { 0xff, R_8,R_12,D_20,B_16,0,0 }, - [INSTR_RS_RURD] = { 0xff, R_8,U4_12,D_20,B_16,0,0 }, - [INSTR_RXE_FRRD] = { 0xff, F_8,D_20,X_12,B_16,0,0 }, - [INSTR_RXE_RRRD] = { 0xff, R_8,D_20,X_12,B_16,0,0 }, - [INSTR_RXE_RRRDM] = { 0xff, R_8,D_20,X_12,B_16,M_32,0 }, - [INSTR_RXF_FRRDF] = { 0xff, F_32,F_8,D_20,X_12,B_16,0 }, - [INSTR_RXY_FRRD] = { 0xff, F_8,D20_20,X_12,B_16,0,0 }, - [INSTR_RXY_RRRD] = { 0xff, R_8,D20_20,X_12,B_16,0,0 }, - [INSTR_RXY_URRD] = { 0xff, U4_8,D20_20,X_12,B_16,0,0 }, - [INSTR_RX_FRRD] = { 0xff, F_8,D_20,X_12,B_16,0,0 }, - [INSTR_RX_RRRD] = { 0xff, R_8,D_20,X_12,B_16,0,0 }, - [INSTR_RX_URRD] = { 0xff, U4_8,D_20,X_12,B_16,0,0 }, - [INSTR_SIL_RDI] = { 0xff, D_20,B_16,I16_32,0,0,0 }, - [INSTR_SIL_RDU] = { 0xff, D_20,B_16,U16_32,0,0,0 }, - [INSTR_SIY_IRD] = { 0xff, D20_20,B_16,I8_8,0,0,0 }, - [INSTR_SIY_URD] = { 0xff, D20_20,B_16,U8_8,0,0,0 }, - [INSTR_SI_URD] = { 0xff, D_20,B_16,U8_8,0,0,0 }, - [INSTR_SMI_U0RDP] = { 0xff, U4_8,J16_32,D_20,B_16,0,0 }, - [INSTR_SSE_RDRD] = { 0xff, D_20,B_16,D_36,B_32,0,0 }, - [INSTR_SSF_RRDRD] = { 0x0f, D_20,B_16,D_36,B_32,R_8,0 }, - [INSTR_SSF_RRDRD2]= { 0x0f, R_8,D_20,B_16,D_36,B_32,0 }, - [INSTR_SS_L0RDRD] = { 0xff, D_20,L8_8,B_16,D_36,B_32,0 }, - [INSTR_SS_LIRDRD] = { 0xff, D_20,L4_8,B_16,D_36,B_32,U4_12 }, - [INSTR_SS_LLRDRD] = { 0xff, D_20,L4_8,B_16,D_36,L4_12,B_32 }, - [INSTR_SS_RRRDRD2]= { 0xff, R_8,D_20,B_16,R_12,D_36,B_32 }, - [INSTR_SS_RRRDRD3]= { 0xff, R_8,R_12,D_20,B_16,D_36,B_32 }, - [INSTR_SS_RRRDRD] = { 0xff, D_20,R_8,B_16,D_36,B_32,R_12 }, - [INSTR_S_00] = { 0xff, 0,0,0,0,0,0 }, - [INSTR_S_RD] = { 0xff, D_20,B_16,0,0,0,0 }, - [INSTR_VRI_V0IM] = { 0xff, V_8,I16_16,M_32,0,0,0 }, - [INSTR_VRI_V0I0] = { 0xff, V_8,I16_16,0,0,0,0 }, - [INSTR_VRI_V0IIM] = { 0xff, V_8,I8_16,I8_24,M_32,0,0 }, - [INSTR_VRI_VVIM] = { 0xff, V_8,I16_16,V_12,M_32,0,0 }, - [INSTR_VRI_VVV0IM]= { 0xff, V_8,V_12,V_16,I8_24,M_32,0 }, - [INSTR_VRI_VVV0I0]= { 0xff, V_8,V_12,V_16,I8_24,0,0 }, - [INSTR_VRI_VVIMM] = { 0xff, V_8,V_12,I16_16,M_32,M_28,0 }, - [INSTR_VRR_VV00MMM]={ 0xff, V_8,V_12,M_32,M_28,M_24,0 }, - [INSTR_VRR_VV000MM]={ 0xff, V_8,V_12,M_32,M_28,0,0 }, - [INSTR_VRR_VV0000M]={ 0xff, V_8,V_12,M_32,0,0,0 }, - [INSTR_VRR_VV00000]={ 0xff, V_8,V_12,0,0,0,0 }, - [INSTR_VRR_VVV0M0M]={ 0xff, V_8,V_12,V_16,M_32,M_24,0 }, - [INSTR_VRR_VV00M0M]={ 0xff, V_8,V_12,M_32,M_24,0,0 }, - [INSTR_VRR_VVV000M]={ 0xff, V_8,V_12,V_16,M_32,0,0 }, - [INSTR_VRR_VVV000V]={ 0xff, V_8,V_12,V_16,V_32,0,0 }, - [INSTR_VRR_VVV0000]={ 0xff, V_8,V_12,V_16,0,0,0 }, - [INSTR_VRR_VVV0MMM]={ 0xff, V_8,V_12,V_16,M_32,M_28,M_24 }, - [INSTR_VRR_VVV00MM]={ 0xff, V_8,V_12,V_16,M_32,M_28,0 }, - [INSTR_VRR_VVVMM0V]={ 0xff, V_8,V_12,V_16,V_32,M_20,M_24 }, - [INSTR_VRR_VVVM0MV]={ 0xff, V_8,V_12,V_16,V_32,M_28,M_20 }, - [INSTR_VRR_VVVM00V]={ 0xff, V_8,V_12,V_16,V_32,M_20,0 }, - [INSTR_VRR_VRR0000]={ 0xff, V_8,R_12,R_16,0,0,0 }, - [INSTR_VRS_VVRDM] = { 0xff, V_8,V_12,D_20,B_16,M_32,0 }, - [INSTR_VRS_VVRD0] = { 0xff, V_8,V_12,D_20,B_16,0,0 }, - [INSTR_VRS_VRRDM] = { 0xff, V_8,R_12,D_20,B_16,M_32,0 }, - [INSTR_VRS_VRRD0] = { 0xff, V_8,R_12,D_20,B_16,0,0 }, - [INSTR_VRS_RVRDM] = { 0xff, R_8,V_12,D_20,B_16,M_32,0 }, - [INSTR_VRV_VVRDM] = { 0xff, V_8,V_12,D_20,B_16,M_32,0 }, - [INSTR_VRV_VWRDM] = { 0xff, V_8,D_20,W_12,B_16,M_32,0 }, - [INSTR_VRX_VRRDM] = { 0xff, V_8,D_20,X_12,B_16,M_32,0 }, - [INSTR_VRX_VRRD0] = { 0xff, V_8,D_20,X_12,B_16,0,0 }, - [INSTR_VRI_V0UU2] = {0xff, V_8,U16_16,U4_32,0,0,0 }, - [INSTR_VRI_VR0UU] = {0xff, V_8,V_12,U8_28,U8_16,U4_24,0 }, - 
[INSTR_VRI_VVUUU2]= {0xff, V_8,V_12,U8_28,U8_16,U4_24,0 }, - [INSTR_VRI_VVV0UU2]= {0xff, V_8,V_12,V_16,U8_28,U4_24,0 }, - [INSTR_VRR_0VV0U] = {0xff, V_12,V_16,U4_24,0,0,0 }, - [INSTR_VRR_0V] = {0xff, V_12,0,0,0,0,0 }, - [INSTR_VRR_RV0U] = {0xff, R_8,V_12,U4_24,0,0,0 }, - [INSTR_VRR_VVV0UUU]= {0xff, V_8,V_12,V_16,U4_32,U4_28,U4_24 }, - [INSTR_VRR_VVVU0UV]= {0xff, V_8,V_12,V_16,V_32,U4_28,U4_20 }, - [INSTR_VRR_VVVUU0V]= {0xff, V_8,V_12,V_16,V_32,U4_20,U4_24 }, - [INSTR_VRR_VVV] = {0xff, V_8,V_12,V_16,0,0,0 }, - [INSTR_VRS_RRDV] = {0xff, V_32,R_12,D_20,B_16,0,0 }, - [INSTR_VSI_URDV] = {0xff, V_32,D_20,B_16,U8_8,0,0 }, +static const unsigned char formats[][6] = { + [INSTR_E] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_IE_UU] = { U4_24, U4_28, 0, 0, 0, 0 }, + [INSTR_MII_UPP] = { U4_8, J12_12, J24_24 }, + [INSTR_RIE_R0IU] = { R_8, I16_16, U4_32, 0, 0, 0 }, + [INSTR_RIE_R0UU] = { R_8, U16_16, U4_32, 0, 0, 0 }, + [INSTR_RIE_RRI0] = { R_8, R_12, I16_16, 0, 0, 0 }, + [INSTR_RIE_RRP] = { R_8, R_12, J16_16, 0, 0, 0 }, + [INSTR_RIE_RRPU] = { R_8, R_12, U4_32, J16_16, 0, 0 }, + [INSTR_RIE_RRUUU] = { R_8, R_12, U8_16, U8_24, U8_32, 0 }, + [INSTR_RIE_RUI0] = { R_8, I16_16, U4_12, 0, 0, 0 }, + [INSTR_RIE_RUPI] = { R_8, I8_32, U4_12, J16_16, 0, 0 }, + [INSTR_RIE_RUPU] = { R_8, U8_32, U4_12, J16_16, 0, 0 }, + [INSTR_RIL_RI] = { R_8, I32_16, 0, 0, 0, 0 }, + [INSTR_RIL_RP] = { R_8, J32_16, 0, 0, 0, 0 }, + [INSTR_RIL_RU] = { R_8, U32_16, 0, 0, 0, 0 }, + [INSTR_RIL_UP] = { U4_8, J32_16, 0, 0, 0, 0 }, + [INSTR_RIS_RURDI] = { R_8, I8_32, U4_12, D_20, B_16, 0 }, + [INSTR_RIS_RURDU] = { R_8, U8_32, U4_12, D_20, B_16, 0 }, + [INSTR_RI_RI] = { R_8, I16_16, 0, 0, 0, 0 }, + [INSTR_RI_RP] = { R_8, J16_16, 0, 0, 0, 0 }, + [INSTR_RI_RU] = { R_8, U16_16, 0, 0, 0, 0 }, + [INSTR_RI_UP] = { U4_8, J16_16, 0, 0, 0, 0 }, + [INSTR_RRE_00] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_RRE_AA] = { A_24, A_28, 0, 0, 0, 0 }, + [INSTR_RRE_AR] = { A_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRE_F0] = { F_24, 0, 0, 0, 0, 0 }, + [INSTR_RRE_FF] = { F_24, F_28, 0, 0, 0, 0 }, + [INSTR_RRE_FR] = { F_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRE_R0] = { R_24, 0, 0, 0, 0, 0 }, + [INSTR_RRE_RA] = { R_24, A_28, 0, 0, 0, 0 }, + [INSTR_RRE_RF] = { R_24, F_28, 0, 0, 0, 0 }, + [INSTR_RRE_RR] = { R_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRF_0UFF] = { F_24, F_28, U4_20, 0, 0, 0 }, + [INSTR_RRF_0URF] = { R_24, F_28, U4_20, 0, 0, 0 }, + [INSTR_RRF_F0FF] = { F_16, F_24, F_28, 0, 0, 0 }, + [INSTR_RRF_F0FF2] = { F_24, F_16, F_28, 0, 0, 0 }, + [INSTR_RRF_F0FR] = { F_24, F_16, R_28, 0, 0, 0 }, + [INSTR_RRF_FFRU] = { F_24, F_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_FUFF] = { F_24, F_16, F_28, U4_20, 0, 0 }, + [INSTR_RRF_FUFF2] = { F_24, F_28, F_16, U4_20, 0, 0 }, + [INSTR_RRF_R0RR] = { R_24, R_16, R_28, 0, 0, 0 }, + [INSTR_RRF_R0RR2] = { R_24, R_28, R_16, 0, 0, 0 }, + [INSTR_RRF_RURR] = { R_24, R_28, R_16, U4_20, 0, 0 }, + [INSTR_RRF_RURR2] = { R_24, R_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_U0FF] = { F_24, U4_16, F_28, 0, 0, 0 }, + [INSTR_RRF_U0RF] = { R_24, U4_16, F_28, 0, 0, 0 }, + [INSTR_RRF_U0RR] = { R_24, R_28, U4_16, 0, 0, 0 }, + [INSTR_RRF_UUFF] = { F_24, U4_16, F_28, U4_20, 0, 0 }, + [INSTR_RRF_UUFR] = { F_24, U4_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_UURF] = { R_24, U4_16, F_28, U4_20, 0, 0 }, + [INSTR_RRS_RRRDU] = { R_8, R_12, U4_32, D_20, B_16 }, + [INSTR_RR_FF] = { F_8, F_12, 0, 0, 0, 0 }, + [INSTR_RR_R0] = { R_8, 0, 0, 0, 0, 0 }, + [INSTR_RR_RR] = { R_8, R_12, 0, 0, 0, 0 }, + [INSTR_RR_U0] = { U8_8, 0, 0, 0, 0, 0 }, + [INSTR_RR_UR] = { U4_8, R_12, 0, 0, 0, 0 }, + [INSTR_RSI_RRP] = { R_8, R_12, 
J16_16, 0, 0, 0 }, + [INSTR_RSL_LRDFU] = { F_32, D_20, L8_8, B_16, U4_36, 0 }, + [INSTR_RSL_R0RD] = { D_20, L4_8, B_16, 0, 0, 0 }, + [INSTR_RSY_AARD] = { A_8, A_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_CCRD] = { C_8, C_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RDRU] = { R_8, D20_20, B_16, U4_12, 0, 0 }, + [INSTR_RSY_RRRD] = { R_8, R_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RURD] = { R_8, U4_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RURD2] = { R_8, D20_20, B_16, U4_12, 0, 0 }, + [INSTR_RS_AARD] = { A_8, A_12, D_20, B_16, 0, 0 }, + [INSTR_RS_CCRD] = { C_8, C_12, D_20, B_16, 0, 0 }, + [INSTR_RS_R0RD] = { R_8, D_20, B_16, 0, 0, 0 }, + [INSTR_RS_RRRD] = { R_8, R_12, D_20, B_16, 0, 0 }, + [INSTR_RS_RURD] = { R_8, U4_12, D_20, B_16, 0, 0 }, + [INSTR_RXE_FRRD] = { F_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RXE_RRRDU] = { R_8, D_20, X_12, B_16, U4_32, 0 }, + [INSTR_RXF_FRRDF] = { F_32, F_8, D_20, X_12, B_16, 0 }, + [INSTR_RXY_FRRD] = { F_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RXY_RRRD] = { R_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RXY_URRD] = { U4_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RX_FRRD] = { F_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RX_RRRD] = { R_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RX_URRD] = { U4_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 }, + [INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 }, + [INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 }, + [INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 }, + [INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 }, + [INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 }, + [INSTR_SMI_U0RDP] = { U4_8, J16_32, D_20, B_16, 0, 0 }, + [INSTR_SSE_RDRD] = { D_20, B_16, D_36, B_32, 0, 0 }, + [INSTR_SSF_RRDRD] = { D_20, B_16, D_36, B_32, R_8, 0 }, + [INSTR_SSF_RRDRD2] = { R_8, D_20, B_16, D_36, B_32, 0 }, + [INSTR_SS_L0RDRD] = { D_20, L8_8, B_16, D_36, B_32, 0 }, + [INSTR_SS_L2RDRD] = { D_20, B_16, D_36, L8_8, B_32, 0 }, + [INSTR_SS_LIRDRD] = { D_20, L4_8, B_16, D_36, B_32, U4_12 }, + [INSTR_SS_LLRDRD] = { D_20, L4_8, B_16, D_36, L4_12, B_32 }, + [INSTR_SS_RRRDRD] = { D_20, R_8, B_16, D_36, B_32, R_12 }, + [INSTR_SS_RRRDRD2] = { R_8, D_20, B_16, R_12, D_36, B_32 }, + [INSTR_SS_RRRDRD3] = { R_8, R_12, D_20, B_16, D_36, B_32 }, + [INSTR_S_00] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_S_RD] = { D_20, B_16, 0, 0, 0, 0 }, + [INSTR_VRI_V0IU] = { V_8, I16_16, U4_32, 0, 0, 0 }, + [INSTR_VRI_V0U] = { V_8, U16_16, 0, 0, 0, 0 }, + [INSTR_VRI_V0UU2] = { V_8, U16_16, U4_32, 0, 0, 0 }, + [INSTR_VRI_V0UUU] = { V_8, U8_16, U8_24, U4_32, 0, 0 }, + [INSTR_VRI_VR0UU] = { V_8, R_12, U8_28, U4_24, 0, 0 }, + [INSTR_VRI_VVUU] = { V_8, V_12, U16_16, U4_32, 0, 0 }, + [INSTR_VRI_VVUUU] = { V_8, V_12, U12_16, U4_32, U4_28, 0 }, + [INSTR_VRI_VVUUU2] = { V_8, V_12, U8_28, U8_16, U4_24, 0 }, + [INSTR_VRI_VVV0U] = { V_8, V_12, V_16, U8_24, 0, 0 }, + [INSTR_VRI_VVV0UU] = { V_8, V_12, V_16, U8_24, U4_32, 0 }, + [INSTR_VRI_VVV0UU2] = { V_8, V_12, V_16, U8_28, U4_24, 0 }, + [INSTR_VRR_0V] = { V_12, 0, 0, 0, 0, 0 }, + [INSTR_VRR_0VV0U] = { V_12, V_16, U4_24, 0, 0, 0 }, + [INSTR_VRR_RV0U] = { R_8, V_12, U4_24, 0, 0, 0 }, + [INSTR_VRR_VRR] = { V_8, R_12, R_16, 0, 0, 0 }, + [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, + [INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 }, + [INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 }, + [INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 }, + [INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 }, + [INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 }, + [INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 }, + [INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, 
U4_24, 0 }, + [INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 }, + [INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 }, + [INSTR_VRR_VVV0V] = { V_8, V_12, V_16, V_32, 0, 0 }, + [INSTR_VRR_VVVU0UV] = { V_8, V_12, V_16, V_32, U4_28, U4_20 }, + [INSTR_VRR_VVVU0V] = { V_8, V_12, V_16, V_32, U4_20, 0 }, + [INSTR_VRR_VVVUU0V] = { V_8, V_12, V_16, V_32, U4_20, U4_24 }, + [INSTR_VRS_RRDV] = { V_32, R_12, D_20, B_16, 0, 0 }, + [INSTR_VRS_RVRDU] = { R_8, V_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRS_VRRD] = { V_8, R_12, D_20, B_16, 0, 0 }, + [INSTR_VRS_VRRDU] = { V_8, R_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRS_VVRD] = { V_8, V_12, D_20, B_16, 0, 0 }, + [INSTR_VRS_VVRDU] = { V_8, V_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRV_VVXRDU] = { V_8, D_20, VX_12, B_16, U4_32, 0 }, + [INSTR_VRX_VRRD] = { V_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_VRX_VRRDU] = { V_8, D_20, X_12, B_16, U4_32, 0 }, + [INSTR_VRX_VV] = { V_8, V_12, 0, 0, 0, 0 }, + [INSTR_VSI_URDV] = { V_32, D_20, B_16, U8_8, 0, 0 }, }; -enum { - LONG_INSN_ALGHSIK, - LONG_INSN_ALHHHR, - LONG_INSN_ALHHLR, - LONG_INSN_ALHSIK, - LONG_INSN_ALSIHN, - LONG_INSN_CDFBRA, - LONG_INSN_CDGBRA, - LONG_INSN_CDGTRA, - LONG_INSN_CDLFBR, - LONG_INSN_CDLFTR, - LONG_INSN_CDLGBR, - LONG_INSN_CDLGTR, - LONG_INSN_CEFBRA, - LONG_INSN_CEGBRA, - LONG_INSN_CELFBR, - LONG_INSN_CELGBR, - LONG_INSN_CFDBRA, - LONG_INSN_CFEBRA, - LONG_INSN_CFXBRA, - LONG_INSN_CGDBRA, - LONG_INSN_CGDTRA, - LONG_INSN_CGEBRA, - LONG_INSN_CGXBRA, - LONG_INSN_CGXTRA, - LONG_INSN_CLFDBR, - LONG_INSN_CLFDTR, - LONG_INSN_CLFEBR, - LONG_INSN_CLFHSI, - LONG_INSN_CLFXBR, - LONG_INSN_CLFXTR, - LONG_INSN_CLGDBR, - LONG_INSN_CLGDTR, - LONG_INSN_CLGEBR, - LONG_INSN_CLGFRL, - LONG_INSN_CLGHRL, - LONG_INSN_CLGHSI, - LONG_INSN_CLGXBR, - LONG_INSN_CLGXTR, - LONG_INSN_CLHHSI, - LONG_INSN_CXFBRA, - LONG_INSN_CXGBRA, - LONG_INSN_CXGTRA, - LONG_INSN_CXLFBR, - LONG_INSN_CXLFTR, - LONG_INSN_CXLGBR, - LONG_INSN_CXLGTR, - LONG_INSN_FIDBRA, - LONG_INSN_FIEBRA, - LONG_INSN_FIXBRA, - LONG_INSN_LDXBRA, - LONG_INSN_LEDBRA, - LONG_INSN_LEXBRA, - LONG_INSN_LLGFAT, - LONG_INSN_LLGFRL, - LONG_INSN_LLGHRL, - LONG_INSN_LLGTAT, - LONG_INSN_LLZRGF, - LONG_INSN_LOCFHR, - LONG_INSN_LOCGHI, - LONG_INSN_LOCHHI, - LONG_INSN_POPCNT, - LONG_INSN_RIEMIT, - LONG_INSN_RINEXT, - LONG_INSN_RISBGN, - LONG_INSN_RISBHG, - LONG_INSN_RISBLG, - LONG_INSN_SLHHHR, - LONG_INSN_SLHHLR, - LONG_INSN_STOCFH, - LONG_INSN_TABORT, - LONG_INSN_TBEGIN, - LONG_INSN_TBEGINC, - LONG_INSN_PCISTG, - LONG_INSN_MPCIFC, - LONG_INSN_STPCIFC, - LONG_INSN_PCISTB, - LONG_INSN_VPOPCT, - LONG_INSN_VERLLV, - LONG_INSN_VESRAV, - LONG_INSN_VESRLV, - LONG_INSN_VSBCBI, - LONG_INSN_STCCTM, - LONG_INSN_MSGRKC, - LONG_INSN_LLGFSG, - LONG_INSN_VSTRLR, - LONG_INSN_VBPERM, -}; - -static char *long_insn_name[] = { - [LONG_INSN_ALGHSIK] = "alghsik", - [LONG_INSN_ALHHHR] = "alhhhr", - [LONG_INSN_ALHHLR] = "alhhlr", - [LONG_INSN_ALHSIK] = "alhsik", - [LONG_INSN_ALSIHN] = "alsihn", - [LONG_INSN_CDFBRA] = "cdfbra", - [LONG_INSN_CDGBRA] = "cdgbra", - [LONG_INSN_CDGTRA] = "cdgtra", - [LONG_INSN_CDLFBR] = "cdlfbr", - [LONG_INSN_CDLFTR] = "cdlftr", - [LONG_INSN_CDLGBR] = "cdlgbr", - [LONG_INSN_CDLGTR] = "cdlgtr", - [LONG_INSN_CEFBRA] = "cefbra", - [LONG_INSN_CEGBRA] = "cegbra", - [LONG_INSN_CELFBR] = "celfbr", - [LONG_INSN_CELGBR] = "celgbr", - [LONG_INSN_CFDBRA] = "cfdbra", - [LONG_INSN_CFEBRA] = "cfebra", - [LONG_INSN_CFXBRA] = "cfxbra", - [LONG_INSN_CGDBRA] = "cgdbra", - [LONG_INSN_CGDTRA] = "cgdtra", - [LONG_INSN_CGEBRA] = "cgebra", - [LONG_INSN_CGXBRA] = "cgxbra", - 
[LONG_INSN_CGXTRA] = "cgxtra", - [LONG_INSN_CLFDBR] = "clfdbr", - [LONG_INSN_CLFDTR] = "clfdtr", - [LONG_INSN_CLFEBR] = "clfebr", - [LONG_INSN_CLFHSI] = "clfhsi", - [LONG_INSN_CLFXBR] = "clfxbr", - [LONG_INSN_CLFXTR] = "clfxtr", - [LONG_INSN_CLGDBR] = "clgdbr", - [LONG_INSN_CLGDTR] = "clgdtr", - [LONG_INSN_CLGEBR] = "clgebr", - [LONG_INSN_CLGFRL] = "clgfrl", - [LONG_INSN_CLGHRL] = "clghrl", - [LONG_INSN_CLGHSI] = "clghsi", - [LONG_INSN_CLGXBR] = "clgxbr", - [LONG_INSN_CLGXTR] = "clgxtr", - [LONG_INSN_CLHHSI] = "clhhsi", - [LONG_INSN_CXFBRA] = "cxfbra", - [LONG_INSN_CXGBRA] = "cxgbra", - [LONG_INSN_CXGTRA] = "cxgtra", - [LONG_INSN_CXLFBR] = "cxlfbr", - [LONG_INSN_CXLFTR] = "cxlftr", - [LONG_INSN_CXLGBR] = "cxlgbr", - [LONG_INSN_CXLGTR] = "cxlgtr", - [LONG_INSN_FIDBRA] = "fidbra", - [LONG_INSN_FIEBRA] = "fiebra", - [LONG_INSN_FIXBRA] = "fixbra", - [LONG_INSN_LDXBRA] = "ldxbra", - [LONG_INSN_LEDBRA] = "ledbra", - [LONG_INSN_LEXBRA] = "lexbra", - [LONG_INSN_LLGFAT] = "llgfat", - [LONG_INSN_LLGFRL] = "llgfrl", - [LONG_INSN_LLGHRL] = "llghrl", - [LONG_INSN_LLGTAT] = "llgtat", - [LONG_INSN_LLZRGF] = "llzrgf", - [LONG_INSN_POPCNT] = "popcnt", - [LONG_INSN_RIEMIT] = "riemit", - [LONG_INSN_RINEXT] = "rinext", - [LONG_INSN_RISBGN] = "risbgn", - [LONG_INSN_RISBHG] = "risbhg", - [LONG_INSN_RISBLG] = "risblg", - [LONG_INSN_SLHHHR] = "slhhhr", - [LONG_INSN_SLHHLR] = "slhhlr", - [LONG_INSN_TABORT] = "tabort", - [LONG_INSN_TBEGIN] = "tbegin", - [LONG_INSN_TBEGINC] = "tbeginc", - [LONG_INSN_PCISTG] = "pcistg", - [LONG_INSN_MPCIFC] = "mpcifc", - [LONG_INSN_STPCIFC] = "stpcifc", - [LONG_INSN_PCISTB] = "pcistb", - [LONG_INSN_VPOPCT] = "vpopct", - [LONG_INSN_VERLLV] = "verllv", - [LONG_INSN_VESRAV] = "vesrav", - [LONG_INSN_VESRLV] = "vesrlv", - [LONG_INSN_VSBCBI] = "vsbcbi", - [LONG_INSN_STCCTM] = "stcctm", - [LONG_INSN_LOCFHR] = "locfhr", - [LONG_INSN_LOCGHI] = "locghi", - [LONG_INSN_LOCHHI] = "lochhi", - [LONG_INSN_STOCFH] = "stocfh", - [LONG_INSN_MSGRKC] = "msgrkc", - [LONG_INSN_LLGFSG] = "llgfsg", - [LONG_INSN_VSTRLR] = "vstrlr", - [LONG_INSN_VBPERM] = "vbperm", -}; - -static struct s390_insn opcode[] = { - { "bprp", 0xc5, INSTR_MII_UPI }, - { "bpp", 0xc7, INSTR_SMI_U0RDP }, - { "trtr", 0xd0, INSTR_SS_L0RDRD }, - { "lmd", 0xef, INSTR_SS_RRRDRD3 }, - { "spm", 0x04, INSTR_RR_R0 }, - { "balr", 0x05, INSTR_RR_RR }, - { "bctr", 0x06, INSTR_RR_RR }, - { "bcr", 0x07, INSTR_RR_UR }, - { "svc", 0x0a, INSTR_RR_U0 }, - { "bsm", 0x0b, INSTR_RR_RR }, - { "bassm", 0x0c, INSTR_RR_RR }, - { "basr", 0x0d, INSTR_RR_RR }, - { "mvcl", 0x0e, INSTR_RR_RR }, - { "clcl", 0x0f, INSTR_RR_RR }, - { "lpr", 0x10, INSTR_RR_RR }, - { "lnr", 0x11, INSTR_RR_RR }, - { "ltr", 0x12, INSTR_RR_RR }, - { "lcr", 0x13, INSTR_RR_RR }, - { "nr", 0x14, INSTR_RR_RR }, - { "clr", 0x15, INSTR_RR_RR }, - { "or", 0x16, INSTR_RR_RR }, - { "xr", 0x17, INSTR_RR_RR }, - { "lr", 0x18, INSTR_RR_RR }, - { "cr", 0x19, INSTR_RR_RR }, - { "ar", 0x1a, INSTR_RR_RR }, - { "sr", 0x1b, INSTR_RR_RR }, - { "mr", 0x1c, INSTR_RR_RR }, - { "dr", 0x1d, INSTR_RR_RR }, - { "alr", 0x1e, INSTR_RR_RR }, - { "slr", 0x1f, INSTR_RR_RR }, - { "lpdr", 0x20, INSTR_RR_FF }, - { "lndr", 0x21, INSTR_RR_FF }, - { "ltdr", 0x22, INSTR_RR_FF }, - { "lcdr", 0x23, INSTR_RR_FF }, - { "hdr", 0x24, INSTR_RR_FF }, - { "ldxr", 0x25, INSTR_RR_FF }, - { "mxr", 0x26, INSTR_RR_FF }, - { "mxdr", 0x27, INSTR_RR_FF }, - { "ldr", 0x28, INSTR_RR_FF }, - { "cdr", 0x29, INSTR_RR_FF }, - { "adr", 0x2a, INSTR_RR_FF }, - { "sdr", 0x2b, INSTR_RR_FF }, - { "mdr", 0x2c, INSTR_RR_FF }, - { "ddr", 0x2d, INSTR_RR_FF }, 
- { "awr", 0x2e, INSTR_RR_FF }, - { "swr", 0x2f, INSTR_RR_FF }, - { "lper", 0x30, INSTR_RR_FF }, - { "lner", 0x31, INSTR_RR_FF }, - { "lter", 0x32, INSTR_RR_FF }, - { "lcer", 0x33, INSTR_RR_FF }, - { "her", 0x34, INSTR_RR_FF }, - { "ledr", 0x35, INSTR_RR_FF }, - { "axr", 0x36, INSTR_RR_FF }, - { "sxr", 0x37, INSTR_RR_FF }, - { "ler", 0x38, INSTR_RR_FF }, - { "cer", 0x39, INSTR_RR_FF }, - { "aer", 0x3a, INSTR_RR_FF }, - { "ser", 0x3b, INSTR_RR_FF }, - { "mder", 0x3c, INSTR_RR_FF }, - { "der", 0x3d, INSTR_RR_FF }, - { "aur", 0x3e, INSTR_RR_FF }, - { "sur", 0x3f, INSTR_RR_FF }, - { "sth", 0x40, INSTR_RX_RRRD }, - { "la", 0x41, INSTR_RX_RRRD }, - { "stc", 0x42, INSTR_RX_RRRD }, - { "ic", 0x43, INSTR_RX_RRRD }, - { "ex", 0x44, INSTR_RX_RRRD }, - { "bal", 0x45, INSTR_RX_RRRD }, - { "bct", 0x46, INSTR_RX_RRRD }, - { "bc", 0x47, INSTR_RX_URRD }, - { "lh", 0x48, INSTR_RX_RRRD }, - { "ch", 0x49, INSTR_RX_RRRD }, - { "ah", 0x4a, INSTR_RX_RRRD }, - { "sh", 0x4b, INSTR_RX_RRRD }, - { "mh", 0x4c, INSTR_RX_RRRD }, - { "bas", 0x4d, INSTR_RX_RRRD }, - { "cvd", 0x4e, INSTR_RX_RRRD }, - { "cvb", 0x4f, INSTR_RX_RRRD }, - { "st", 0x50, INSTR_RX_RRRD }, - { "lae", 0x51, INSTR_RX_RRRD }, - { "n", 0x54, INSTR_RX_RRRD }, - { "cl", 0x55, INSTR_RX_RRRD }, - { "o", 0x56, INSTR_RX_RRRD }, - { "x", 0x57, INSTR_RX_RRRD }, - { "l", 0x58, INSTR_RX_RRRD }, - { "c", 0x59, INSTR_RX_RRRD }, - { "a", 0x5a, INSTR_RX_RRRD }, - { "s", 0x5b, INSTR_RX_RRRD }, - { "m", 0x5c, INSTR_RX_RRRD }, - { "d", 0x5d, INSTR_RX_RRRD }, - { "al", 0x5e, INSTR_RX_RRRD }, - { "sl", 0x5f, INSTR_RX_RRRD }, - { "std", 0x60, INSTR_RX_FRRD }, - { "mxd", 0x67, INSTR_RX_FRRD }, - { "ld", 0x68, INSTR_RX_FRRD }, - { "cd", 0x69, INSTR_RX_FRRD }, - { "ad", 0x6a, INSTR_RX_FRRD }, - { "sd", 0x6b, INSTR_RX_FRRD }, - { "md", 0x6c, INSTR_RX_FRRD }, - { "dd", 0x6d, INSTR_RX_FRRD }, - { "aw", 0x6e, INSTR_RX_FRRD }, - { "sw", 0x6f, INSTR_RX_FRRD }, - { "ste", 0x70, INSTR_RX_FRRD }, - { "ms", 0x71, INSTR_RX_RRRD }, - { "le", 0x78, INSTR_RX_FRRD }, - { "ce", 0x79, INSTR_RX_FRRD }, - { "ae", 0x7a, INSTR_RX_FRRD }, - { "se", 0x7b, INSTR_RX_FRRD }, - { "mde", 0x7c, INSTR_RX_FRRD }, - { "de", 0x7d, INSTR_RX_FRRD }, - { "au", 0x7e, INSTR_RX_FRRD }, - { "su", 0x7f, INSTR_RX_FRRD }, - { "ssm", 0x80, INSTR_S_RD }, - { "lpsw", 0x82, INSTR_S_RD }, - { "diag", 0x83, INSTR_RS_RRRD }, - { "brxh", 0x84, INSTR_RSI_RRP }, - { "brxle", 0x85, INSTR_RSI_RRP }, - { "bxh", 0x86, INSTR_RS_RRRD }, - { "bxle", 0x87, INSTR_RS_RRRD }, - { "srl", 0x88, INSTR_RS_R0RD }, - { "sll", 0x89, INSTR_RS_R0RD }, - { "sra", 0x8a, INSTR_RS_R0RD }, - { "sla", 0x8b, INSTR_RS_R0RD }, - { "srdl", 0x8c, INSTR_RS_R0RD }, - { "sldl", 0x8d, INSTR_RS_R0RD }, - { "srda", 0x8e, INSTR_RS_R0RD }, - { "slda", 0x8f, INSTR_RS_R0RD }, - { "stm", 0x90, INSTR_RS_RRRD }, - { "tm", 0x91, INSTR_SI_URD }, - { "mvi", 0x92, INSTR_SI_URD }, - { "ts", 0x93, INSTR_S_RD }, - { "ni", 0x94, INSTR_SI_URD }, - { "cli", 0x95, INSTR_SI_URD }, - { "oi", 0x96, INSTR_SI_URD }, - { "xi", 0x97, INSTR_SI_URD }, - { "lm", 0x98, INSTR_RS_RRRD }, - { "trace", 0x99, INSTR_RS_RRRD }, - { "lam", 0x9a, INSTR_RS_AARD }, - { "stam", 0x9b, INSTR_RS_AARD }, - { "mvcle", 0xa8, INSTR_RS_RRRD }, - { "clcle", 0xa9, INSTR_RS_RRRD }, - { "stnsm", 0xac, INSTR_SI_URD }, - { "stosm", 0xad, INSTR_SI_URD }, - { "sigp", 0xae, INSTR_RS_RRRD }, - { "mc", 0xaf, INSTR_SI_URD }, - { "lra", 0xb1, INSTR_RX_RRRD }, - { "stctl", 0xb6, INSTR_RS_CCRD }, - { "lctl", 0xb7, INSTR_RS_CCRD }, - { "cs", 0xba, INSTR_RS_RRRD }, - { "cds", 0xbb, INSTR_RS_RRRD }, - { "clm", 0xbd, 
INSTR_RS_RURD }, - { "stcm", 0xbe, INSTR_RS_RURD }, - { "icm", 0xbf, INSTR_RS_RURD }, - { "mvn", 0xd1, INSTR_SS_L0RDRD }, - { "mvc", 0xd2, INSTR_SS_L0RDRD }, - { "mvz", 0xd3, INSTR_SS_L0RDRD }, - { "nc", 0xd4, INSTR_SS_L0RDRD }, - { "clc", 0xd5, INSTR_SS_L0RDRD }, - { "oc", 0xd6, INSTR_SS_L0RDRD }, - { "xc", 0xd7, INSTR_SS_L0RDRD }, - { "mvck", 0xd9, INSTR_SS_RRRDRD }, - { "mvcp", 0xda, INSTR_SS_RRRDRD }, - { "mvcs", 0xdb, INSTR_SS_RRRDRD }, - { "tr", 0xdc, INSTR_SS_L0RDRD }, - { "trt", 0xdd, INSTR_SS_L0RDRD }, - { "ed", 0xde, INSTR_SS_L0RDRD }, - { "edmk", 0xdf, INSTR_SS_L0RDRD }, - { "pku", 0xe1, INSTR_SS_L0RDRD }, - { "unpku", 0xe2, INSTR_SS_L0RDRD }, - { "mvcin", 0xe8, INSTR_SS_L0RDRD }, - { "pka", 0xe9, INSTR_SS_L0RDRD }, - { "unpka", 0xea, INSTR_SS_L0RDRD }, - { "plo", 0xee, INSTR_SS_RRRDRD2 }, - { "srp", 0xf0, INSTR_SS_LIRDRD }, - { "mvo", 0xf1, INSTR_SS_LLRDRD }, - { "pack", 0xf2, INSTR_SS_LLRDRD }, - { "unpk", 0xf3, INSTR_SS_LLRDRD }, - { "zap", 0xf8, INSTR_SS_LLRDRD }, - { "cp", 0xf9, INSTR_SS_LLRDRD }, - { "ap", 0xfa, INSTR_SS_LLRDRD }, - { "sp", 0xfb, INSTR_SS_LLRDRD }, - { "mp", 0xfc, INSTR_SS_LLRDRD }, - { "dp", 0xfd, INSTR_SS_LLRDRD }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_01[] = { - { "ptff", 0x04, INSTR_E }, - { "pfpo", 0x0a, INSTR_E }, - { "sam64", 0x0e, INSTR_E }, - { "pr", 0x01, INSTR_E }, - { "upt", 0x02, INSTR_E }, - { "sckpf", 0x07, INSTR_E }, - { "tam", 0x0b, INSTR_E }, - { "sam24", 0x0c, INSTR_E }, - { "sam31", 0x0d, INSTR_E }, - { "trap2", 0xff, INSTR_E }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_a5[] = { - { "iihh", 0x00, INSTR_RI_RU }, - { "iihl", 0x01, INSTR_RI_RU }, - { "iilh", 0x02, INSTR_RI_RU }, - { "iill", 0x03, INSTR_RI_RU }, - { "nihh", 0x04, INSTR_RI_RU }, - { "nihl", 0x05, INSTR_RI_RU }, - { "nilh", 0x06, INSTR_RI_RU }, - { "nill", 0x07, INSTR_RI_RU }, - { "oihh", 0x08, INSTR_RI_RU }, - { "oihl", 0x09, INSTR_RI_RU }, - { "oilh", 0x0a, INSTR_RI_RU }, - { "oill", 0x0b, INSTR_RI_RU }, - { "llihh", 0x0c, INSTR_RI_RU }, - { "llihl", 0x0d, INSTR_RI_RU }, - { "llilh", 0x0e, INSTR_RI_RU }, - { "llill", 0x0f, INSTR_RI_RU }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_a7[] = { - { "tmhh", 0x02, INSTR_RI_RU }, - { "tmhl", 0x03, INSTR_RI_RU }, - { "brctg", 0x07, INSTR_RI_RP }, - { "lghi", 0x09, INSTR_RI_RI }, - { "aghi", 0x0b, INSTR_RI_RI }, - { "mghi", 0x0d, INSTR_RI_RI }, - { "cghi", 0x0f, INSTR_RI_RI }, - { "tmlh", 0x00, INSTR_RI_RU }, - { "tmll", 0x01, INSTR_RI_RU }, - { "brc", 0x04, INSTR_RI_UP }, - { "bras", 0x05, INSTR_RI_RP }, - { "brct", 0x06, INSTR_RI_RP }, - { "lhi", 0x08, INSTR_RI_RI }, - { "ahi", 0x0a, INSTR_RI_RI }, - { "mhi", 0x0c, INSTR_RI_RI }, - { "chi", 0x0e, INSTR_RI_RI }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_aa[] = { - { { 0, LONG_INSN_RINEXT }, 0x00, INSTR_RI_RI }, - { "rion", 0x01, INSTR_RI_RI }, - { "tric", 0x02, INSTR_RI_RI }, - { "rioff", 0x03, INSTR_RI_RI }, - { { 0, LONG_INSN_RIEMIT }, 0x04, INSTR_RI_RI }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_b2[] = { - { "stckf", 0x7c, INSTR_S_RD }, - { "lpp", 0x80, INSTR_S_RD }, - { "lcctl", 0x84, INSTR_S_RD }, - { "lpctl", 0x85, INSTR_S_RD }, - { "qsi", 0x86, INSTR_S_RD }, - { "lsctl", 0x87, INSTR_S_RD }, - { "qctri", 0x8e, INSTR_S_RD }, - { "stfle", 0xb0, INSTR_S_RD }, - { "lpswe", 0xb2, INSTR_S_RD }, - { "srnmb", 0xb8, INSTR_S_RD }, - { "srnmt", 0xb9, INSTR_S_RD }, - { "lfas", 0xbd, INSTR_S_RD }, - { "scctr", 0xe0, INSTR_RRE_RR }, - { "spctr", 0xe1, INSTR_RRE_RR }, - { 
"ecctr", 0xe4, INSTR_RRE_RR }, - { "epctr", 0xe5, INSTR_RRE_RR }, - { "ppa", 0xe8, INSTR_RRF_U0RR }, - { "etnd", 0xec, INSTR_RRE_R0 }, - { "ecpga", 0xed, INSTR_RRE_RR }, - { "tend", 0xf8, INSTR_S_00 }, - { "niai", 0xfa, INSTR_IE_UU }, - { { 0, LONG_INSN_TABORT }, 0xfc, INSTR_S_RD }, - { "stidp", 0x02, INSTR_S_RD }, - { "sck", 0x04, INSTR_S_RD }, - { "stck", 0x05, INSTR_S_RD }, - { "sckc", 0x06, INSTR_S_RD }, - { "stckc", 0x07, INSTR_S_RD }, - { "spt", 0x08, INSTR_S_RD }, - { "stpt", 0x09, INSTR_S_RD }, - { "spka", 0x0a, INSTR_S_RD }, - { "ipk", 0x0b, INSTR_S_00 }, - { "ptlb", 0x0d, INSTR_S_00 }, - { "spx", 0x10, INSTR_S_RD }, - { "stpx", 0x11, INSTR_S_RD }, - { "stap", 0x12, INSTR_S_RD }, - { "sie", 0x14, INSTR_S_RD }, - { "pc", 0x18, INSTR_S_RD }, - { "sac", 0x19, INSTR_S_RD }, - { "cfc", 0x1a, INSTR_S_RD }, - { "servc", 0x20, INSTR_RRE_RR }, - { "ipte", 0x21, INSTR_RRE_RR }, - { "ipm", 0x22, INSTR_RRE_R0 }, - { "ivsk", 0x23, INSTR_RRE_RR }, - { "iac", 0x24, INSTR_RRE_R0 }, - { "ssar", 0x25, INSTR_RRE_R0 }, - { "epar", 0x26, INSTR_RRE_R0 }, - { "esar", 0x27, INSTR_RRE_R0 }, - { "pt", 0x28, INSTR_RRE_RR }, - { "iske", 0x29, INSTR_RRE_RR }, - { "rrbe", 0x2a, INSTR_RRE_RR }, - { "sske", 0x2b, INSTR_RRF_M0RR }, - { "tb", 0x2c, INSTR_RRE_0R }, - { "dxr", 0x2d, INSTR_RRE_FF }, - { "pgin", 0x2e, INSTR_RRE_RR }, - { "pgout", 0x2f, INSTR_RRE_RR }, - { "csch", 0x30, INSTR_S_00 }, - { "hsch", 0x31, INSTR_S_00 }, - { "msch", 0x32, INSTR_S_RD }, - { "ssch", 0x33, INSTR_S_RD }, - { "stsch", 0x34, INSTR_S_RD }, - { "tsch", 0x35, INSTR_S_RD }, - { "tpi", 0x36, INSTR_S_RD }, - { "sal", 0x37, INSTR_S_00 }, - { "rsch", 0x38, INSTR_S_00 }, - { "stcrw", 0x39, INSTR_S_RD }, - { "stcps", 0x3a, INSTR_S_RD }, - { "rchp", 0x3b, INSTR_S_00 }, - { "schm", 0x3c, INSTR_S_00 }, - { "bakr", 0x40, INSTR_RRE_RR }, - { "cksm", 0x41, INSTR_RRE_RR }, - { "sqdr", 0x44, INSTR_RRE_FF }, - { "sqer", 0x45, INSTR_RRE_FF }, - { "stura", 0x46, INSTR_RRE_RR }, - { "msta", 0x47, INSTR_RRE_R0 }, - { "palb", 0x48, INSTR_RRE_00 }, - { "ereg", 0x49, INSTR_RRE_RR }, - { "esta", 0x4a, INSTR_RRE_RR }, - { "lura", 0x4b, INSTR_RRE_RR }, - { "tar", 0x4c, INSTR_RRE_AR }, - { "cpya", 0x4d, INSTR_RRE_AA }, - { "sar", 0x4e, INSTR_RRE_AR }, - { "ear", 0x4f, INSTR_RRE_RA }, - { "csp", 0x50, INSTR_RRE_RR }, - { "msr", 0x52, INSTR_RRE_RR }, - { "mvpg", 0x54, INSTR_RRE_RR }, - { "mvst", 0x55, INSTR_RRE_RR }, - { "sthyi", 0x56, INSTR_RRE_RR }, - { "cuse", 0x57, INSTR_RRE_RR }, - { "bsg", 0x58, INSTR_RRE_RR }, - { "bsa", 0x5a, INSTR_RRE_RR }, - { "clst", 0x5d, INSTR_RRE_RR }, - { "srst", 0x5e, INSTR_RRE_RR }, - { "cmpsc", 0x63, INSTR_RRE_RR }, - { "siga", 0x74, INSTR_S_RD }, - { "xsch", 0x76, INSTR_S_00 }, - { "rp", 0x77, INSTR_S_RD }, - { "stcke", 0x78, INSTR_S_RD }, - { "sacf", 0x79, INSTR_S_RD }, - { "stsi", 0x7d, INSTR_S_RD }, - { "srnm", 0x99, INSTR_S_RD }, - { "stfpc", 0x9c, INSTR_S_RD }, - { "lfpc", 0x9d, INSTR_S_RD }, - { "tre", 0xa5, INSTR_RRE_RR }, - { "cuutf", 0xa6, INSTR_RRF_M0RR }, - { "cutfu", 0xa7, INSTR_RRF_M0RR }, - { "stfl", 0xb1, INSTR_S_RD }, - { "trap4", 0xff, INSTR_S_RD }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_b3[] = { - { "maylr", 0x38, INSTR_RRF_F0FF }, - { "mylr", 0x39, INSTR_RRF_F0FF }, - { "mayr", 0x3a, INSTR_RRF_F0FF }, - { "myr", 0x3b, INSTR_RRF_F0FF }, - { "mayhr", 0x3c, INSTR_RRF_F0FF }, - { "myhr", 0x3d, INSTR_RRF_F0FF }, - { "lpdfr", 0x70, INSTR_RRE_FF }, - { "lndfr", 0x71, INSTR_RRE_FF }, - { "cpsdr", 0x72, INSTR_RRF_F0FF2 }, - { "lcdfr", 0x73, INSTR_RRE_FF }, - { "sfasr", 0x85, INSTR_RRE_R0 }, 
- { { 0, LONG_INSN_CELFBR }, 0x90, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CDLFBR }, 0x91, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CXLFBR }, 0x92, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CEFBRA }, 0x94, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CDFBRA }, 0x95, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CXFBRA }, 0x96, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CFEBRA }, 0x98, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CFDBRA }, 0x99, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CFXBRA }, 0x9a, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CLFEBR }, 0x9c, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CLFDBR }, 0x9d, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CLFXBR }, 0x9e, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CELGBR }, 0xa0, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CDLGBR }, 0xa1, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CXLGBR }, 0xa2, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CEGBRA }, 0xa4, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CDGBRA }, 0xa5, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CXGBRA }, 0xa6, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CGEBRA }, 0xa8, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CGDBRA }, 0xa9, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CGXBRA }, 0xaa, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CLGEBR }, 0xac, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CLGDBR }, 0xad, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CLGXBR }, 0xae, INSTR_RRF_UUFR }, - { "ldgr", 0xc1, INSTR_RRE_FR }, - { "cegr", 0xc4, INSTR_RRE_FR }, - { "cdgr", 0xc5, INSTR_RRE_FR }, - { "cxgr", 0xc6, INSTR_RRE_FR }, - { "cger", 0xc8, INSTR_RRF_U0RF }, - { "cgdr", 0xc9, INSTR_RRF_U0RF }, - { "cgxr", 0xca, INSTR_RRF_U0RF }, - { "lgdr", 0xcd, INSTR_RRE_RF }, - { "mdtra", 0xd0, INSTR_RRF_FUFF2 }, - { "ddtra", 0xd1, INSTR_RRF_FUFF2 }, - { "adtra", 0xd2, INSTR_RRF_FUFF2 }, - { "sdtra", 0xd3, INSTR_RRF_FUFF2 }, - { "ldetr", 0xd4, INSTR_RRF_0UFF }, - { "ledtr", 0xd5, INSTR_RRF_UUFF }, - { "ltdtr", 0xd6, INSTR_RRE_FF }, - { "fidtr", 0xd7, INSTR_RRF_UUFF }, - { "mxtra", 0xd8, INSTR_RRF_FUFF2 }, - { "dxtra", 0xd9, INSTR_RRF_FUFF2 }, - { "axtra", 0xda, INSTR_RRF_FUFF2 }, - { "sxtra", 0xdb, INSTR_RRF_FUFF2 }, - { "lxdtr", 0xdc, INSTR_RRF_0UFF }, - { "ldxtr", 0xdd, INSTR_RRF_UUFF }, - { "ltxtr", 0xde, INSTR_RRE_FF }, - { "fixtr", 0xdf, INSTR_RRF_UUFF }, - { "kdtr", 0xe0, INSTR_RRE_FF }, - { { 0, LONG_INSN_CGDTRA }, 0xe1, INSTR_RRF_UURF }, - { "cudtr", 0xe2, INSTR_RRE_RF }, - { "csdtr", 0xe3, INSTR_RRE_RF }, - { "cdtr", 0xe4, INSTR_RRE_FF }, - { "eedtr", 0xe5, INSTR_RRE_RF }, - { "esdtr", 0xe7, INSTR_RRE_RF }, - { "kxtr", 0xe8, INSTR_RRE_FF }, - { { 0, LONG_INSN_CGXTRA }, 0xe9, INSTR_RRF_UUFR }, - { "cuxtr", 0xea, INSTR_RRE_RF }, - { "csxtr", 0xeb, INSTR_RRE_RF }, - { "cxtr", 0xec, INSTR_RRE_FF }, - { "eextr", 0xed, INSTR_RRE_RF }, - { "esxtr", 0xef, INSTR_RRE_RF }, - { { 0, LONG_INSN_CDGTRA }, 0xf1, INSTR_RRF_UUFR }, - { "cdutr", 0xf2, INSTR_RRE_FR }, - { "cdstr", 0xf3, INSTR_RRE_FR }, - { "cedtr", 0xf4, INSTR_RRE_FF }, - { "qadtr", 0xf5, INSTR_RRF_FUFF }, - { "iedtr", 0xf6, INSTR_RRF_F0FR }, - { "rrdtr", 0xf7, INSTR_RRF_FFRU }, - { { 0, LONG_INSN_CXGTRA }, 0xf9, INSTR_RRF_UURF }, - { "cxutr", 0xfa, INSTR_RRE_FR }, - { "cxstr", 0xfb, INSTR_RRE_FR }, - { "cextr", 0xfc, INSTR_RRE_FF }, - { "qaxtr", 0xfd, INSTR_RRF_FUFF }, - { "iextr", 0xfe, INSTR_RRF_F0FR }, - { "rrxtr", 0xff, INSTR_RRF_FFRU }, - { "lpebr", 0x00, INSTR_RRE_FF }, - { "lnebr", 0x01, INSTR_RRE_FF }, - { "ltebr", 0x02, INSTR_RRE_FF }, - { "lcebr", 0x03, INSTR_RRE_FF }, - { "ldebr", 0x04, INSTR_RRE_FF }, - { "lxdbr", 0x05, INSTR_RRE_FF }, - { "lxebr", 0x06, INSTR_RRE_FF }, - { "mxdbr", 0x07, INSTR_RRE_FF }, - { "kebr", 0x08, INSTR_RRE_FF }, - { "cebr", 0x09, INSTR_RRE_FF }, 
- { "aebr", 0x0a, INSTR_RRE_FF }, - { "sebr", 0x0b, INSTR_RRE_FF }, - { "mdebr", 0x0c, INSTR_RRE_FF }, - { "debr", 0x0d, INSTR_RRE_FF }, - { "maebr", 0x0e, INSTR_RRF_F0FF }, - { "msebr", 0x0f, INSTR_RRF_F0FF }, - { "lpdbr", 0x10, INSTR_RRE_FF }, - { "lndbr", 0x11, INSTR_RRE_FF }, - { "ltdbr", 0x12, INSTR_RRE_FF }, - { "lcdbr", 0x13, INSTR_RRE_FF }, - { "sqebr", 0x14, INSTR_RRE_FF }, - { "sqdbr", 0x15, INSTR_RRE_FF }, - { "sqxbr", 0x16, INSTR_RRE_FF }, - { "meebr", 0x17, INSTR_RRE_FF }, - { "kdbr", 0x18, INSTR_RRE_FF }, - { "cdbr", 0x19, INSTR_RRE_FF }, - { "adbr", 0x1a, INSTR_RRE_FF }, - { "sdbr", 0x1b, INSTR_RRE_FF }, - { "mdbr", 0x1c, INSTR_RRE_FF }, - { "ddbr", 0x1d, INSTR_RRE_FF }, - { "madbr", 0x1e, INSTR_RRF_F0FF }, - { "msdbr", 0x1f, INSTR_RRF_F0FF }, - { "lder", 0x24, INSTR_RRE_FF }, - { "lxdr", 0x25, INSTR_RRE_FF }, - { "lxer", 0x26, INSTR_RRE_FF }, - { "maer", 0x2e, INSTR_RRF_F0FF }, - { "mser", 0x2f, INSTR_RRF_F0FF }, - { "sqxr", 0x36, INSTR_RRE_FF }, - { "meer", 0x37, INSTR_RRE_FF }, - { "madr", 0x3e, INSTR_RRF_F0FF }, - { "msdr", 0x3f, INSTR_RRF_F0FF }, - { "lpxbr", 0x40, INSTR_RRE_FF }, - { "lnxbr", 0x41, INSTR_RRE_FF }, - { "ltxbr", 0x42, INSTR_RRE_FF }, - { "lcxbr", 0x43, INSTR_RRE_FF }, - { { 0, LONG_INSN_LEDBRA }, 0x44, INSTR_RRF_UUFF }, - { { 0, LONG_INSN_LDXBRA }, 0x45, INSTR_RRF_UUFF }, - { { 0, LONG_INSN_LEXBRA }, 0x46, INSTR_RRF_UUFF }, - { { 0, LONG_INSN_FIXBRA }, 0x47, INSTR_RRF_UUFF }, - { "kxbr", 0x48, INSTR_RRE_FF }, - { "cxbr", 0x49, INSTR_RRE_FF }, - { "axbr", 0x4a, INSTR_RRE_FF }, - { "sxbr", 0x4b, INSTR_RRE_FF }, - { "mxbr", 0x4c, INSTR_RRE_FF }, - { "dxbr", 0x4d, INSTR_RRE_FF }, - { "tbedr", 0x50, INSTR_RRF_U0FF }, - { "tbdr", 0x51, INSTR_RRF_U0FF }, - { "diebr", 0x53, INSTR_RRF_FUFF }, - { { 0, LONG_INSN_FIEBRA }, 0x57, INSTR_RRF_UUFF }, - { "thder", 0x58, INSTR_RRE_FF }, - { "thdr", 0x59, INSTR_RRE_FF }, - { "didbr", 0x5b, INSTR_RRF_FUFF }, - { { 0, LONG_INSN_FIDBRA }, 0x5f, INSTR_RRF_UUFF }, - { "lpxr", 0x60, INSTR_RRE_FF }, - { "lnxr", 0x61, INSTR_RRE_FF }, - { "ltxr", 0x62, INSTR_RRE_FF }, - { "lcxr", 0x63, INSTR_RRE_FF }, - { "lxr", 0x65, INSTR_RRE_FF }, - { "lexr", 0x66, INSTR_RRE_FF }, - { "fixr", 0x67, INSTR_RRE_FF }, - { "cxr", 0x69, INSTR_RRE_FF }, - { "lzer", 0x74, INSTR_RRE_F0 }, - { "lzdr", 0x75, INSTR_RRE_F0 }, - { "lzxr", 0x76, INSTR_RRE_F0 }, - { "fier", 0x77, INSTR_RRE_FF }, - { "fidr", 0x7f, INSTR_RRE_FF }, - { "sfpc", 0x84, INSTR_RRE_RR_OPT }, - { "efpc", 0x8c, INSTR_RRE_RR_OPT }, - { "cefr", 0xb4, INSTR_RRE_FR }, - { "cdfr", 0xb5, INSTR_RRE_FR }, - { "cxfr", 0xb6, INSTR_RRE_FR }, - { "cfer", 0xb8, INSTR_RRF_U0RF }, - { "cfdr", 0xb9, INSTR_RRF_U0RF }, - { "cfxr", 0xba, INSTR_RRF_U0RF }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_b9[] = { - { "lpgr", 0x00, INSTR_RRE_RR }, - { "lngr", 0x01, INSTR_RRE_RR }, - { "ltgr", 0x02, INSTR_RRE_RR }, - { "lcgr", 0x03, INSTR_RRE_RR }, - { "lgr", 0x04, INSTR_RRE_RR }, - { "lurag", 0x05, INSTR_RRE_RR }, - { "lgbr", 0x06, INSTR_RRE_RR }, - { "lghr", 0x07, INSTR_RRE_RR }, - { "agr", 0x08, INSTR_RRE_RR }, - { "sgr", 0x09, INSTR_RRE_RR }, - { "algr", 0x0a, INSTR_RRE_RR }, - { "slgr", 0x0b, INSTR_RRE_RR }, - { "msgr", 0x0c, INSTR_RRE_RR }, - { "dsgr", 0x0d, INSTR_RRE_RR }, - { "eregg", 0x0e, INSTR_RRE_RR }, - { "lrvgr", 0x0f, INSTR_RRE_RR }, - { "lpgfr", 0x10, INSTR_RRE_RR }, - { "lngfr", 0x11, INSTR_RRE_RR }, - { "ltgfr", 0x12, INSTR_RRE_RR }, - { "lcgfr", 0x13, INSTR_RRE_RR }, - { "lgfr", 0x14, INSTR_RRE_RR }, - { "llgfr", 0x16, INSTR_RRE_RR }, - { "llgtr", 0x17, INSTR_RRE_RR }, - { 
"agfr", 0x18, INSTR_RRE_RR }, - { "sgfr", 0x19, INSTR_RRE_RR }, - { "algfr", 0x1a, INSTR_RRE_RR }, - { "slgfr", 0x1b, INSTR_RRE_RR }, - { "msgfr", 0x1c, INSTR_RRE_RR }, - { "dsgfr", 0x1d, INSTR_RRE_RR }, - { "cgr", 0x20, INSTR_RRE_RR }, - { "clgr", 0x21, INSTR_RRE_RR }, - { "sturg", 0x25, INSTR_RRE_RR }, - { "lbr", 0x26, INSTR_RRE_RR }, - { "lhr", 0x27, INSTR_RRE_RR }, - { "kma", 0x29, INSTR_RRF_R0RR }, - { "cgfr", 0x30, INSTR_RRE_RR }, - { "clgfr", 0x31, INSTR_RRE_RR }, - { "ppno", 0x3c, INSTR_RRE_RR }, - { "cfdtr", 0x41, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CLGDTR }, 0x42, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CLFDTR }, 0x43, INSTR_RRF_UURF }, - { "bctgr", 0x46, INSTR_RRE_RR }, - { "cfxtr", 0x49, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CLGXTR }, 0x4a, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CLFXTR }, 0x4b, INSTR_RRF_UUFR }, - { "cdftr", 0x51, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CDLGTR }, 0x52, INSTR_RRF_UUFR }, - { { 0, LONG_INSN_CDLFTR }, 0x53, INSTR_RRF_UUFR }, - { "cxftr", 0x59, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CXLGTR }, 0x5a, INSTR_RRF_UURF }, - { { 0, LONG_INSN_CXLFTR }, 0x5b, INSTR_RRF_UUFR }, - { "cgrt", 0x60, INSTR_RRF_U0RR }, - { "clgrt", 0x61, INSTR_RRF_U0RR }, - { "crt", 0x72, INSTR_RRF_U0RR }, - { "clrt", 0x73, INSTR_RRF_U0RR }, - { "ngr", 0x80, INSTR_RRE_RR }, - { "ogr", 0x81, INSTR_RRE_RR }, - { "xgr", 0x82, INSTR_RRE_RR }, - { "flogr", 0x83, INSTR_RRE_RR }, - { "llgcr", 0x84, INSTR_RRE_RR }, - { "llghr", 0x85, INSTR_RRE_RR }, - { "mlgr", 0x86, INSTR_RRE_RR }, - { "dlgr", 0x87, INSTR_RRE_RR }, - { "alcgr", 0x88, INSTR_RRE_RR }, - { "slbgr", 0x89, INSTR_RRE_RR }, - { "cspg", 0x8a, INSTR_RRE_RR }, - { "idte", 0x8e, INSTR_RRF_R0RR }, - { "crdte", 0x8f, INSTR_RRF_RMRR }, - { "llcr", 0x94, INSTR_RRE_RR }, - { "llhr", 0x95, INSTR_RRE_RR }, - { "esea", 0x9d, INSTR_RRE_R0 }, - { "ptf", 0xa2, INSTR_RRE_R0 }, - { "lptea", 0xaa, INSTR_RRF_RURR }, - { "rrbm", 0xae, INSTR_RRE_RR }, - { "pfmf", 0xaf, INSTR_RRE_RR }, - { "cu14", 0xb0, INSTR_RRF_M0RR }, - { "cu24", 0xb1, INSTR_RRF_M0RR }, - { "cu41", 0xb2, INSTR_RRE_RR }, - { "cu42", 0xb3, INSTR_RRE_RR }, - { "trtre", 0xbd, INSTR_RRF_M0RR }, - { "srstu", 0xbe, INSTR_RRE_RR }, - { "trte", 0xbf, INSTR_RRF_M0RR }, - { "ahhhr", 0xc8, INSTR_RRF_R0RR2 }, - { "shhhr", 0xc9, INSTR_RRF_R0RR2 }, - { { 0, LONG_INSN_ALHHHR }, 0xca, INSTR_RRF_R0RR2 }, - { { 0, LONG_INSN_SLHHHR }, 0xcb, INSTR_RRF_R0RR2 }, - { "chhr", 0xcd, INSTR_RRE_RR }, - { "clhhr", 0xcf, INSTR_RRE_RR }, - { { 0, LONG_INSN_PCISTG }, 0xd0, INSTR_RRE_RR }, - { "pcilg", 0xd2, INSTR_RRE_RR }, - { "rpcit", 0xd3, INSTR_RRE_RR }, - { "ahhlr", 0xd8, INSTR_RRF_R0RR2 }, - { "shhlr", 0xd9, INSTR_RRF_R0RR2 }, - { { 0, LONG_INSN_ALHHLR }, 0xda, INSTR_RRF_R0RR2 }, - { { 0, LONG_INSN_SLHHLR }, 0xdb, INSTR_RRF_R0RR2 }, - { "chlr", 0xdd, INSTR_RRE_RR }, - { "clhlr", 0xdf, INSTR_RRE_RR }, - { { 0, LONG_INSN_LOCFHR }, 0xe0, INSTR_RRF_U0RR }, - { { 0, LONG_INSN_POPCNT }, 0xe1, INSTR_RRE_RR }, - { "locgr", 0xe2, INSTR_RRF_M0RR }, - { "ngrk", 0xe4, INSTR_RRF_R0RR2 }, - { "ogrk", 0xe6, INSTR_RRF_R0RR2 }, - { "xgrk", 0xe7, INSTR_RRF_R0RR2 }, - { "agrk", 0xe8, INSTR_RRF_R0RR2 }, - { "sgrk", 0xe9, INSTR_RRF_R0RR2 }, - { "algrk", 0xea, INSTR_RRF_R0RR2 }, - { "slgrk", 0xeb, INSTR_RRF_R0RR2 }, - { "mgrk", 0xec, INSTR_RRF_R0RR2 }, - { { 0, LONG_INSN_MSGRKC }, 0xed, INSTR_RRF_R0RR2 }, - { "locr", 0xf2, INSTR_RRF_M0RR }, - { "nrk", 0xf4, INSTR_RRF_R0RR2 }, - { "ork", 0xf6, INSTR_RRF_R0RR2 }, - { "xrk", 0xf7, INSTR_RRF_R0RR2 }, - { "ark", 0xf8, INSTR_RRF_R0RR2 }, - { "srk", 0xf9, INSTR_RRF_R0RR2 }, - { "alrk", 0xfa, 
INSTR_RRF_R0RR2 }, - { "slrk", 0xfb, INSTR_RRF_R0RR2 }, - { "msrkc", 0xfd, INSTR_RRF_R0RR2 }, - { "kmac", 0x1e, INSTR_RRE_RR }, - { "lrvr", 0x1f, INSTR_RRE_RR }, - { "km", 0x2e, INSTR_RRE_RR }, - { "kmc", 0x2f, INSTR_RRE_RR }, - { "kimd", 0x3e, INSTR_RRE_RR }, - { "klmd", 0x3f, INSTR_RRE_RR }, - { "epsw", 0x8d, INSTR_RRE_RR }, - { "trtt", 0x90, INSTR_RRF_M0RR }, - { "trto", 0x91, INSTR_RRF_M0RR }, - { "trot", 0x92, INSTR_RRF_M0RR }, - { "troo", 0x93, INSTR_RRF_M0RR }, - { "mlr", 0x96, INSTR_RRE_RR }, - { "dlr", 0x97, INSTR_RRE_RR }, - { "alcr", 0x98, INSTR_RRE_RR }, - { "slbr", 0x99, INSTR_RRE_RR }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_c0[] = { - { "lgfi", 0x01, INSTR_RIL_RI }, - { "xihf", 0x06, INSTR_RIL_RU }, - { "xilf", 0x07, INSTR_RIL_RU }, - { "iihf", 0x08, INSTR_RIL_RU }, - { "iilf", 0x09, INSTR_RIL_RU }, - { "nihf", 0x0a, INSTR_RIL_RU }, - { "nilf", 0x0b, INSTR_RIL_RU }, - { "oihf", 0x0c, INSTR_RIL_RU }, - { "oilf", 0x0d, INSTR_RIL_RU }, - { "llihf", 0x0e, INSTR_RIL_RU }, - { "llilf", 0x0f, INSTR_RIL_RU }, - { "larl", 0x00, INSTR_RIL_RP }, - { "brcl", 0x04, INSTR_RIL_UP }, - { "brasl", 0x05, INSTR_RIL_RP }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_c2[] = { - { "msgfi", 0x00, INSTR_RIL_RI }, - { "msfi", 0x01, INSTR_RIL_RI }, - { "slgfi", 0x04, INSTR_RIL_RU }, - { "slfi", 0x05, INSTR_RIL_RU }, - { "agfi", 0x08, INSTR_RIL_RI }, - { "afi", 0x09, INSTR_RIL_RI }, - { "algfi", 0x0a, INSTR_RIL_RU }, - { "alfi", 0x0b, INSTR_RIL_RU }, - { "cgfi", 0x0c, INSTR_RIL_RI }, - { "cfi", 0x0d, INSTR_RIL_RI }, - { "clgfi", 0x0e, INSTR_RIL_RU }, - { "clfi", 0x0f, INSTR_RIL_RU }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_c4[] = { - { "llhrl", 0x02, INSTR_RIL_RP }, - { "lghrl", 0x04, INSTR_RIL_RP }, - { "lhrl", 0x05, INSTR_RIL_RP }, - { { 0, LONG_INSN_LLGHRL }, 0x06, INSTR_RIL_RP }, - { "sthrl", 0x07, INSTR_RIL_RP }, - { "lgrl", 0x08, INSTR_RIL_RP }, - { "stgrl", 0x0b, INSTR_RIL_RP }, - { "lgfrl", 0x0c, INSTR_RIL_RP }, - { "lrl", 0x0d, INSTR_RIL_RP }, - { { 0, LONG_INSN_LLGFRL }, 0x0e, INSTR_RIL_RP }, - { "strl", 0x0f, INSTR_RIL_RP }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_c6[] = { - { "exrl", 0x00, INSTR_RIL_RP }, - { "pfdrl", 0x02, INSTR_RIL_UP }, - { "cghrl", 0x04, INSTR_RIL_RP }, - { "chrl", 0x05, INSTR_RIL_RP }, - { { 0, LONG_INSN_CLGHRL }, 0x06, INSTR_RIL_RP }, - { "clhrl", 0x07, INSTR_RIL_RP }, - { "cgrl", 0x08, INSTR_RIL_RP }, - { "clgrl", 0x0a, INSTR_RIL_RP }, - { "cgfrl", 0x0c, INSTR_RIL_RP }, - { "crl", 0x0d, INSTR_RIL_RP }, - { { 0, LONG_INSN_CLGFRL }, 0x0e, INSTR_RIL_RP }, - { "clrl", 0x0f, INSTR_RIL_RP }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_c8[] = { - { "mvcos", 0x00, INSTR_SSF_RRDRD }, - { "ectg", 0x01, INSTR_SSF_RRDRD }, - { "csst", 0x02, INSTR_SSF_RRDRD }, - { "lpd", 0x04, INSTR_SSF_RRDRD2 }, - { "lpdg", 0x05, INSTR_SSF_RRDRD2 }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_cc[] = { - { "brcth", 0x06, INSTR_RIL_RP }, - { "aih", 0x08, INSTR_RIL_RI }, - { "alsih", 0x0a, INSTR_RIL_RI }, - { { 0, LONG_INSN_ALSIHN }, 0x0b, INSTR_RIL_RI }, - { "cih", 0x0d, INSTR_RIL_RI }, - { "clih", 0x0f, INSTR_RIL_RI }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_e3[] = { - { "ltg", 0x02, INSTR_RXY_RRRD }, - { "lrag", 0x03, INSTR_RXY_RRRD }, - { "lg", 0x04, INSTR_RXY_RRRD }, - { "cvby", 0x06, INSTR_RXY_RRRD }, - { "ag", 0x08, INSTR_RXY_RRRD }, - { "sg", 0x09, INSTR_RXY_RRRD }, - { "alg", 0x0a, INSTR_RXY_RRRD }, - { "slg", 0x0b, 
INSTR_RXY_RRRD }, - { "msg", 0x0c, INSTR_RXY_RRRD }, - { "dsg", 0x0d, INSTR_RXY_RRRD }, - { "cvbg", 0x0e, INSTR_RXY_RRRD }, - { "lrvg", 0x0f, INSTR_RXY_RRRD }, - { "lt", 0x12, INSTR_RXY_RRRD }, - { "lray", 0x13, INSTR_RXY_RRRD }, - { "lgf", 0x14, INSTR_RXY_RRRD }, - { "lgh", 0x15, INSTR_RXY_RRRD }, - { "llgf", 0x16, INSTR_RXY_RRRD }, - { "llgt", 0x17, INSTR_RXY_RRRD }, - { "agf", 0x18, INSTR_RXY_RRRD }, - { "sgf", 0x19, INSTR_RXY_RRRD }, - { "algf", 0x1a, INSTR_RXY_RRRD }, - { "slgf", 0x1b, INSTR_RXY_RRRD }, - { "msgf", 0x1c, INSTR_RXY_RRRD }, - { "dsgf", 0x1d, INSTR_RXY_RRRD }, - { "cg", 0x20, INSTR_RXY_RRRD }, - { "clg", 0x21, INSTR_RXY_RRRD }, - { "stg", 0x24, INSTR_RXY_RRRD }, - { "ntstg", 0x25, INSTR_RXY_RRRD }, - { "cvdy", 0x26, INSTR_RXY_RRRD }, - { "lzrg", 0x2a, INSTR_RXY_RRRD }, - { "cvdg", 0x2e, INSTR_RXY_RRRD }, - { "strvg", 0x2f, INSTR_RXY_RRRD }, - { "cgf", 0x30, INSTR_RXY_RRRD }, - { "clgf", 0x31, INSTR_RXY_RRRD }, - { "ltgf", 0x32, INSTR_RXY_RRRD }, - { "cgh", 0x34, INSTR_RXY_RRRD }, - { "pfd", 0x36, INSTR_RXY_URRD }, - { "agh", 0x38, INSTR_RXY_RRRD }, - { "sgh", 0x39, INSTR_RXY_RRRD }, - { { 0, LONG_INSN_LLZRGF }, 0x3a, INSTR_RXY_RRRD }, - { "lzrf", 0x3b, INSTR_RXY_RRRD }, - { "mgh", 0x3c, INSTR_RXY_RRRD }, - { "strvh", 0x3f, INSTR_RXY_RRRD }, - { "bctg", 0x46, INSTR_RXY_RRRD }, - { "bic", 0x47, INSTR_RXY_URRD }, - { { 0, LONG_INSN_LLGFSG }, 0x48, INSTR_RXY_RRRD }, - { "stgsc", 0x49, INSTR_RXY_RRRD }, - { "lgg", 0x4c, INSTR_RXY_RRRD }, - { "lgsc", 0x4d, INSTR_RXY_RRRD }, - { "sty", 0x50, INSTR_RXY_RRRD }, - { "msy", 0x51, INSTR_RXY_RRRD }, - { "msc", 0x53, INSTR_RXY_RRRD }, - { "ny", 0x54, INSTR_RXY_RRRD }, - { "cly", 0x55, INSTR_RXY_RRRD }, - { "oy", 0x56, INSTR_RXY_RRRD }, - { "xy", 0x57, INSTR_RXY_RRRD }, - { "ly", 0x58, INSTR_RXY_RRRD }, - { "cy", 0x59, INSTR_RXY_RRRD }, - { "ay", 0x5a, INSTR_RXY_RRRD }, - { "sy", 0x5b, INSTR_RXY_RRRD }, - { "mfy", 0x5c, INSTR_RXY_RRRD }, - { "aly", 0x5e, INSTR_RXY_RRRD }, - { "sly", 0x5f, INSTR_RXY_RRRD }, - { "sthy", 0x70, INSTR_RXY_RRRD }, - { "lay", 0x71, INSTR_RXY_RRRD }, - { "stcy", 0x72, INSTR_RXY_RRRD }, - { "icy", 0x73, INSTR_RXY_RRRD }, - { "laey", 0x75, INSTR_RXY_RRRD }, - { "lb", 0x76, INSTR_RXY_RRRD }, - { "lgb", 0x77, INSTR_RXY_RRRD }, - { "lhy", 0x78, INSTR_RXY_RRRD }, - { "chy", 0x79, INSTR_RXY_RRRD }, - { "ahy", 0x7a, INSTR_RXY_RRRD }, - { "shy", 0x7b, INSTR_RXY_RRRD }, - { "mhy", 0x7c, INSTR_RXY_RRRD }, - { "ng", 0x80, INSTR_RXY_RRRD }, - { "og", 0x81, INSTR_RXY_RRRD }, - { "xg", 0x82, INSTR_RXY_RRRD }, - { "msgc", 0x83, INSTR_RXY_RRRD }, - { "mg", 0x84, INSTR_RXY_RRRD }, - { "lgat", 0x85, INSTR_RXY_RRRD }, - { "mlg", 0x86, INSTR_RXY_RRRD }, - { "dlg", 0x87, INSTR_RXY_RRRD }, - { "alcg", 0x88, INSTR_RXY_RRRD }, - { "slbg", 0x89, INSTR_RXY_RRRD }, - { "stpq", 0x8e, INSTR_RXY_RRRD }, - { "lpq", 0x8f, INSTR_RXY_RRRD }, - { "llgc", 0x90, INSTR_RXY_RRRD }, - { "llgh", 0x91, INSTR_RXY_RRRD }, - { "llc", 0x94, INSTR_RXY_RRRD }, - { "llh", 0x95, INSTR_RXY_RRRD }, - { { 0, LONG_INSN_LLGTAT }, 0x9c, INSTR_RXY_RRRD }, - { { 0, LONG_INSN_LLGFAT }, 0x9d, INSTR_RXY_RRRD }, - { "lat", 0x9f, INSTR_RXY_RRRD }, - { "lbh", 0xc0, INSTR_RXY_RRRD }, - { "llch", 0xc2, INSTR_RXY_RRRD }, - { "stch", 0xc3, INSTR_RXY_RRRD }, - { "lhh", 0xc4, INSTR_RXY_RRRD }, - { "llhh", 0xc6, INSTR_RXY_RRRD }, - { "sthh", 0xc7, INSTR_RXY_RRRD }, - { "lfhat", 0xc8, INSTR_RXY_RRRD }, - { "lfh", 0xca, INSTR_RXY_RRRD }, - { "stfh", 0xcb, INSTR_RXY_RRRD }, - { "chf", 0xcd, INSTR_RXY_RRRD }, - { "clhf", 0xcf, INSTR_RXY_RRRD }, - { { 0, LONG_INSN_MPCIFC }, 0xd0, 
INSTR_RXY_RRRD }, - { { 0, LONG_INSN_STPCIFC }, 0xd4, INSTR_RXY_RRRD }, - { "lrv", 0x1e, INSTR_RXY_RRRD }, - { "lrvh", 0x1f, INSTR_RXY_RRRD }, - { "strv", 0x3e, INSTR_RXY_RRRD }, - { "ml", 0x96, INSTR_RXY_RRRD }, - { "dl", 0x97, INSTR_RXY_RRRD }, - { "alc", 0x98, INSTR_RXY_RRRD }, - { "slb", 0x99, INSTR_RXY_RRRD }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_e5[] = { - { "strag", 0x02, INSTR_SSE_RDRD }, - { "mvhhi", 0x44, INSTR_SIL_RDI }, - { "mvghi", 0x48, INSTR_SIL_RDI }, - { "mvhi", 0x4c, INSTR_SIL_RDI }, - { "chhsi", 0x54, INSTR_SIL_RDI }, - { { 0, LONG_INSN_CLHHSI }, 0x55, INSTR_SIL_RDU }, - { "cghsi", 0x58, INSTR_SIL_RDI }, - { { 0, LONG_INSN_CLGHSI }, 0x59, INSTR_SIL_RDU }, - { "chsi", 0x5c, INSTR_SIL_RDI }, - { { 0, LONG_INSN_CLFHSI }, 0x5d, INSTR_SIL_RDU }, - { { 0, LONG_INSN_TBEGIN }, 0x60, INSTR_SIL_RDU }, - { { 0, LONG_INSN_TBEGINC }, 0x61, INSTR_SIL_RDU }, - { "lasp", 0x00, INSTR_SSE_RDRD }, - { "tprot", 0x01, INSTR_SSE_RDRD }, - { "mvcsk", 0x0e, INSTR_SSE_RDRD }, - { "mvcdk", 0x0f, INSTR_SSE_RDRD }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_e6[] = { - { "vpkz", 0x34, INSTR_VSI_URDV }, - { "vlrl", 0x35, INSTR_VSI_URDV }, - { "vlrlr", 0x37, INSTR_VRS_RRDV }, - { "vupkz", 0x3c, INSTR_VSI_URDV }, - { "vstrl", 0x3d, INSTR_VSI_URDV }, - { {0 , LONG_INSN_VSTRLR }, 0x3f, INSTR_VRS_RRDV }, - { "vlip", 0x49, INSTR_VRI_V0UU2 }, - { "vcvb", 0x50, INSTR_VRR_RV0U }, - { "vcvbg", 0x52, INSTR_VRR_RV0U }, - { "vcvd", 0x58, INSTR_VRI_VR0UU }, - { "vsrp", 0x59, INSTR_VRI_VVUUU2 }, - { "vcvdg", 0x5a, INSTR_VRI_VR0UU }, - { "vpsop", 0x5b, INSTR_VRI_VVUUU2 }, - { "vtp", 0x5f, INSTR_VRR_0V }, - { "vap", 0x71, INSTR_VRI_VVV0UU2 }, - { "vsp", 0x73, INSTR_VRI_VVV0UU2 }, - { "vcp", 0x77, INSTR_VRR_0VV0U }, - { "vmp", 0x78, INSTR_VRI_VVV0UU2 }, - { "vmsp", 0x79, INSTR_VRI_VVV0UU2 }, - { "vdp", 0x7a, INSTR_VRI_VVV0UU2 }, - { "vrp", 0x7b, INSTR_VRI_VVV0UU2 }, - { "vsdp", 0x7e, INSTR_VRI_VVV0UU2 }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_e7[] = { - { "lcbb", 0x27, INSTR_RXE_RRRDM }, - { "vgef", 0x13, INSTR_VRV_VVRDM }, - { "vgeg", 0x12, INSTR_VRV_VVRDM }, - { "vgbm", 0x44, INSTR_VRI_V0I0 }, - { "vgm", 0x46, INSTR_VRI_V0IIM }, - { "vl", 0x06, INSTR_VRX_VRRD0 }, - { "vlr", 0x56, INSTR_VRR_VV00000 }, - { "vlrp", 0x05, INSTR_VRX_VRRDM }, - { "vleb", 0x00, INSTR_VRX_VRRDM }, - { "vleh", 0x01, INSTR_VRX_VRRDM }, - { "vlef", 0x03, INSTR_VRX_VRRDM }, - { "vleg", 0x02, INSTR_VRX_VRRDM }, - { "vleib", 0x40, INSTR_VRI_V0IM }, - { "vleih", 0x41, INSTR_VRI_V0IM }, - { "vleif", 0x43, INSTR_VRI_V0IM }, - { "vleig", 0x42, INSTR_VRI_V0IM }, - { "vlgv", 0x21, INSTR_VRS_RVRDM }, - { "vllez", 0x04, INSTR_VRX_VRRDM }, - { "vlm", 0x36, INSTR_VRS_VVRD0 }, - { "vlbb", 0x07, INSTR_VRX_VRRDM }, - { "vlvg", 0x22, INSTR_VRS_VRRDM }, - { "vlvgp", 0x62, INSTR_VRR_VRR0000 }, - { "vll", 0x37, INSTR_VRS_VRRD0 }, - { "vmrh", 0x61, INSTR_VRR_VVV000M }, - { "vmrl", 0x60, INSTR_VRR_VVV000M }, - { "vpk", 0x94, INSTR_VRR_VVV000M }, - { "vpks", 0x97, INSTR_VRR_VVV0M0M }, - { "vpkls", 0x95, INSTR_VRR_VVV0M0M }, - { "vperm", 0x8c, INSTR_VRR_VVV000V }, - { "vpdi", 0x84, INSTR_VRR_VVV000M }, - { "vrep", 0x4d, INSTR_VRI_VVIM }, - { "vrepi", 0x45, INSTR_VRI_V0IM }, - { "vscef", 0x1b, INSTR_VRV_VWRDM }, - { "vsceg", 0x1a, INSTR_VRV_VWRDM }, - { "vsel", 0x8d, INSTR_VRR_VVV000V }, - { "vseg", 0x5f, INSTR_VRR_VV0000M }, - { "vst", 0x0e, INSTR_VRX_VRRD0 }, - { "vsteb", 0x08, INSTR_VRX_VRRDM }, - { "vsteh", 0x09, INSTR_VRX_VRRDM }, - { "vstef", 0x0b, INSTR_VRX_VRRDM }, - { "vsteg", 0x0a, 
INSTR_VRX_VRRDM }, - { "vstm", 0x3e, INSTR_VRS_VVRD0 }, - { "vstl", 0x3f, INSTR_VRS_VRRD0 }, - { "vuph", 0xd7, INSTR_VRR_VV0000M }, - { "vuplh", 0xd5, INSTR_VRR_VV0000M }, - { "vupl", 0xd6, INSTR_VRR_VV0000M }, - { "vupll", 0xd4, INSTR_VRR_VV0000M }, - { "va", 0xf3, INSTR_VRR_VVV000M }, - { "vacc", 0xf1, INSTR_VRR_VVV000M }, - { "vac", 0xbb, INSTR_VRR_VVVM00V }, - { "vaccc", 0xb9, INSTR_VRR_VVVM00V }, - { "vn", 0x68, INSTR_VRR_VVV0000 }, - { "vnc", 0x69, INSTR_VRR_VVV0000 }, - { "vavg", 0xf2, INSTR_VRR_VVV000M }, - { "vavgl", 0xf0, INSTR_VRR_VVV000M }, - { "vcksm", 0x66, INSTR_VRR_VVV0000 }, - { "vec", 0xdb, INSTR_VRR_VV0000M }, - { "vecl", 0xd9, INSTR_VRR_VV0000M }, - { "vceq", 0xf8, INSTR_VRR_VVV0M0M }, - { "vch", 0xfb, INSTR_VRR_VVV0M0M }, - { "vchl", 0xf9, INSTR_VRR_VVV0M0M }, - { "vclz", 0x53, INSTR_VRR_VV0000M }, - { "vctz", 0x52, INSTR_VRR_VV0000M }, - { "vx", 0x6d, INSTR_VRR_VVV0000 }, - { "vgfm", 0xb4, INSTR_VRR_VVV000M }, - { "vgfma", 0xbc, INSTR_VRR_VVVM00V }, - { "vlc", 0xde, INSTR_VRR_VV0000M }, - { "vlp", 0xdf, INSTR_VRR_VV0000M }, - { "vmx", 0xff, INSTR_VRR_VVV000M }, - { "vmxl", 0xfd, INSTR_VRR_VVV000M }, - { "vmn", 0xfe, INSTR_VRR_VVV000M }, - { "vmnl", 0xfc, INSTR_VRR_VVV000M }, - { "vmal", 0xaa, INSTR_VRR_VVVM00V }, - { "vmae", 0xae, INSTR_VRR_VVVM00V }, - { "vmale", 0xac, INSTR_VRR_VVVM00V }, - { "vmah", 0xab, INSTR_VRR_VVVM00V }, - { "vmalh", 0xa9, INSTR_VRR_VVVM00V }, - { "vmao", 0xaf, INSTR_VRR_VVVM00V }, - { "vmalo", 0xad, INSTR_VRR_VVVM00V }, - { "vmh", 0xa3, INSTR_VRR_VVV000M }, - { "vmlh", 0xa1, INSTR_VRR_VVV000M }, - { "vml", 0xa2, INSTR_VRR_VVV000M }, - { "vme", 0xa6, INSTR_VRR_VVV000M }, - { "vmle", 0xa4, INSTR_VRR_VVV000M }, - { "vmo", 0xa7, INSTR_VRR_VVV000M }, - { "vmlo", 0xa5, INSTR_VRR_VVV000M }, - { "vno", 0x6b, INSTR_VRR_VVV0000 }, - { "vo", 0x6a, INSTR_VRR_VVV0000 }, - { { 0, LONG_INSN_VPOPCT }, 0x50, INSTR_VRR_VV0000M }, - { { 0, LONG_INSN_VERLLV }, 0x73, INSTR_VRR_VVV000M }, - { "verll", 0x33, INSTR_VRS_VVRDM }, - { "verim", 0x72, INSTR_VRI_VVV0IM }, - { "veslv", 0x70, INSTR_VRR_VVV000M }, - { "vesl", 0x30, INSTR_VRS_VVRDM }, - { { 0, LONG_INSN_VESRAV }, 0x7a, INSTR_VRR_VVV000M }, - { "vesra", 0x3a, INSTR_VRS_VVRDM }, - { { 0, LONG_INSN_VESRLV }, 0x78, INSTR_VRR_VVV000M }, - { "vesrl", 0x38, INSTR_VRS_VVRDM }, - { "vsl", 0x74, INSTR_VRR_VVV0000 }, - { "vslb", 0x75, INSTR_VRR_VVV0000 }, - { "vsldb", 0x77, INSTR_VRI_VVV0I0 }, - { "vsra", 0x7e, INSTR_VRR_VVV0000 }, - { "vsrab", 0x7f, INSTR_VRR_VVV0000 }, - { "vsrl", 0x7c, INSTR_VRR_VVV0000 }, - { "vsrlb", 0x7d, INSTR_VRR_VVV0000 }, - { "vs", 0xf7, INSTR_VRR_VVV000M }, - { "vscb", 0xf5, INSTR_VRR_VVV000M }, - { "vsb", 0xbf, INSTR_VRR_VVVM00V }, - { { 0, LONG_INSN_VSBCBI }, 0xbd, INSTR_VRR_VVVM00V }, - { "vsumg", 0x65, INSTR_VRR_VVV000M }, - { "vsumq", 0x67, INSTR_VRR_VVV000M }, - { "vsum", 0x64, INSTR_VRR_VVV000M }, - { "vtm", 0xd8, INSTR_VRR_VV00000 }, - { "vfae", 0x82, INSTR_VRR_VVV0M0M }, - { "vfee", 0x80, INSTR_VRR_VVV0M0M }, - { "vfene", 0x81, INSTR_VRR_VVV0M0M }, - { "vistr", 0x5c, INSTR_VRR_VV00M0M }, - { "vstrc", 0x8a, INSTR_VRR_VVVMM0V }, - { "vfa", 0xe3, INSTR_VRR_VVV00MM }, - { "wfc", 0xcb, INSTR_VRR_VV000MM }, - { "wfk", 0xca, INSTR_VRR_VV000MM }, - { "vfce", 0xe8, INSTR_VRR_VVV0MMM }, - { "vfch", 0xeb, INSTR_VRR_VVV0MMM }, - { "vfche", 0xea, INSTR_VRR_VVV0MMM }, - { "vcdg", 0xc3, INSTR_VRR_VV00MMM }, - { "vcdlg", 0xc1, INSTR_VRR_VV00MMM }, - { "vcgd", 0xc2, INSTR_VRR_VV00MMM }, - { "vclgd", 0xc0, INSTR_VRR_VV00MMM }, - { "vfd", 0xe5, INSTR_VRR_VVV00MM }, - { "vfi", 0xc7, INSTR_VRR_VV00MMM }, 
- { "vlde", 0xc4, INSTR_VRR_VV000MM }, - { "vled", 0xc5, INSTR_VRR_VV00MMM }, - { "vfm", 0xe7, INSTR_VRR_VVV00MM }, - { "vfma", 0x8f, INSTR_VRR_VVVM0MV }, - { "vfms", 0x8e, INSTR_VRR_VVVM0MV }, - { "vfpso", 0xcc, INSTR_VRR_VV00MMM }, - { "vfsq", 0xce, INSTR_VRR_VV000MM }, - { "vfs", 0xe2, INSTR_VRR_VVV00MM }, - { "vftci", 0x4a, INSTR_VRI_VVIMM }, - { "vnx", 0x6c, INSTR_VRR_VVV }, - { "vnn", 0x6e, INSTR_VRR_VVV }, - { "voc", 0x6f, INSTR_VRR_VVV }, - { { 0, LONG_INSN_VBPERM }, 0x85, INSTR_VRR_VVV }, - { "vfnms", 0x9e, INSTR_VRR_VVVU0UV }, - { "vfnma", 0x9f, INSTR_VRR_VVVU0UV }, - { "vmsl", 0xb8, INSTR_VRR_VVVUU0V }, - { "vfmin", 0xee, INSTR_VRR_VVV0UUU }, - { "vfmax", 0xef, INSTR_VRR_VVV0UUU }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_eb[] = { - { "lmg", 0x04, INSTR_RSY_RRRD }, - { "srag", 0x0a, INSTR_RSY_RRRD }, - { "slag", 0x0b, INSTR_RSY_RRRD }, - { "srlg", 0x0c, INSTR_RSY_RRRD }, - { "sllg", 0x0d, INSTR_RSY_RRRD }, - { "tracg", 0x0f, INSTR_RSY_RRRD }, - { "csy", 0x14, INSTR_RSY_RRRD }, - { "rllg", 0x1c, INSTR_RSY_RRRD }, - { "clmh", 0x20, INSTR_RSY_RURD }, - { "clmy", 0x21, INSTR_RSY_RURD }, - { "clt", 0x23, INSTR_RSY_RURD }, - { "stmg", 0x24, INSTR_RSY_RRRD }, - { "stctg", 0x25, INSTR_RSY_CCRD }, - { "stmh", 0x26, INSTR_RSY_RRRD }, - { "clgt", 0x2b, INSTR_RSY_RURD }, - { "stcmh", 0x2c, INSTR_RSY_RURD }, - { "stcmy", 0x2d, INSTR_RSY_RURD }, - { "lctlg", 0x2f, INSTR_RSY_CCRD }, - { "csg", 0x30, INSTR_RSY_RRRD }, - { "cdsy", 0x31, INSTR_RSY_RRRD }, - { "cdsg", 0x3e, INSTR_RSY_RRRD }, - { "bxhg", 0x44, INSTR_RSY_RRRD }, - { "bxleg", 0x45, INSTR_RSY_RRRD }, - { "ecag", 0x4c, INSTR_RSY_RRRD }, - { "tmy", 0x51, INSTR_SIY_URD }, - { "mviy", 0x52, INSTR_SIY_URD }, - { "niy", 0x54, INSTR_SIY_URD }, - { "cliy", 0x55, INSTR_SIY_URD }, - { "oiy", 0x56, INSTR_SIY_URD }, - { "xiy", 0x57, INSTR_SIY_URD }, - { "asi", 0x6a, INSTR_SIY_IRD }, - { "alsi", 0x6e, INSTR_SIY_IRD }, - { "agsi", 0x7a, INSTR_SIY_IRD }, - { "algsi", 0x7e, INSTR_SIY_IRD }, - { "icmh", 0x80, INSTR_RSY_RURD }, - { "icmy", 0x81, INSTR_RSY_RURD }, - { "clclu", 0x8f, INSTR_RSY_RRRD }, - { "stmy", 0x90, INSTR_RSY_RRRD }, - { "lmh", 0x96, INSTR_RSY_RRRD }, - { "lmy", 0x98, INSTR_RSY_RRRD }, - { "lamy", 0x9a, INSTR_RSY_AARD }, - { "stamy", 0x9b, INSTR_RSY_AARD }, - { { 0, LONG_INSN_PCISTB }, 0xd0, INSTR_RSY_RRRD }, - { "sic", 0xd1, INSTR_RSY_RRRD }, - { "srak", 0xdc, INSTR_RSY_RRRD }, - { "slak", 0xdd, INSTR_RSY_RRRD }, - { "srlk", 0xde, INSTR_RSY_RRRD }, - { "sllk", 0xdf, INSTR_RSY_RRRD }, - { "locfh", 0xe0, INSTR_RSY_RURD2 }, - { { 0, LONG_INSN_STOCFH }, 0xe1, INSTR_RSY_RURD2 }, - { "locg", 0xe2, INSTR_RSY_RDRM }, - { "stocg", 0xe3, INSTR_RSY_RDRM }, - { "lang", 0xe4, INSTR_RSY_RRRD }, - { "laog", 0xe6, INSTR_RSY_RRRD }, - { "laxg", 0xe7, INSTR_RSY_RRRD }, - { "laag", 0xe8, INSTR_RSY_RRRD }, - { "laalg", 0xea, INSTR_RSY_RRRD }, - { "loc", 0xf2, INSTR_RSY_RDRM }, - { "stoc", 0xf3, INSTR_RSY_RDRM }, - { "lan", 0xf4, INSTR_RSY_RRRD }, - { "lao", 0xf6, INSTR_RSY_RRRD }, - { "lax", 0xf7, INSTR_RSY_RRRD }, - { "laa", 0xf8, INSTR_RSY_RRRD }, - { "laal", 0xfa, INSTR_RSY_RRRD }, - { "lric", 0x60, INSTR_RSY_RDRM }, - { "stric", 0x61, INSTR_RSY_RDRM }, - { "mric", 0x62, INSTR_RSY_RDRM }, - { { 0, LONG_INSN_STCCTM }, 0x17, INSTR_RSY_RMRD }, - { "rll", 0x1d, INSTR_RSY_RRRD }, - { "mvclu", 0x8e, INSTR_RSY_RRRD }, - { "tp", 0xc0, INSTR_RSL_R0RD }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_ec[] = { - { "lochi", 0x42, INSTR_RIE_RUI0 }, - { "brxhg", 0x44, INSTR_RIE_RRP }, - { "brxlg", 0x45, INSTR_RIE_RRP }, - 
{ { 0, LONG_INSN_LOCGHI }, 0x46, INSTR_RIE_RUI0 }, - { { 0, LONG_INSN_LOCHHI }, 0x4e, INSTR_RIE_RUI0 }, - { { 0, LONG_INSN_RISBLG }, 0x51, INSTR_RIE_RRUUU }, - { "rnsbg", 0x54, INSTR_RIE_RRUUU }, - { "risbg", 0x55, INSTR_RIE_RRUUU }, - { "rosbg", 0x56, INSTR_RIE_RRUUU }, - { "rxsbg", 0x57, INSTR_RIE_RRUUU }, - { { 0, LONG_INSN_RISBGN }, 0x59, INSTR_RIE_RRUUU }, - { { 0, LONG_INSN_RISBHG }, 0x5D, INSTR_RIE_RRUUU }, - { "cgrj", 0x64, INSTR_RIE_RRPU }, - { "clgrj", 0x65, INSTR_RIE_RRPU }, - { "cgit", 0x70, INSTR_RIE_R0IU }, - { "clgit", 0x71, INSTR_RIE_R0UU }, - { "cit", 0x72, INSTR_RIE_R0IU }, - { "clfit", 0x73, INSTR_RIE_R0UU }, - { "crj", 0x76, INSTR_RIE_RRPU }, - { "clrj", 0x77, INSTR_RIE_RRPU }, - { "cgij", 0x7c, INSTR_RIE_RUPI }, - { "clgij", 0x7d, INSTR_RIE_RUPU }, - { "cij", 0x7e, INSTR_RIE_RUPI }, - { "clij", 0x7f, INSTR_RIE_RUPU }, - { "ahik", 0xd8, INSTR_RIE_RRI0 }, - { "aghik", 0xd9, INSTR_RIE_RRI0 }, - { { 0, LONG_INSN_ALHSIK }, 0xda, INSTR_RIE_RRI0 }, - { { 0, LONG_INSN_ALGHSIK }, 0xdb, INSTR_RIE_RRI0 }, - { "cgrb", 0xe4, INSTR_RRS_RRRDU }, - { "clgrb", 0xe5, INSTR_RRS_RRRDU }, - { "crb", 0xf6, INSTR_RRS_RRRDU }, - { "clrb", 0xf7, INSTR_RRS_RRRDU }, - { "cgib", 0xfc, INSTR_RIS_RURDI }, - { "clgib", 0xfd, INSTR_RIS_RURDU }, - { "cib", 0xfe, INSTR_RIS_RURDI }, - { "clib", 0xff, INSTR_RIS_RURDU }, - { "", 0, INSTR_INVALID } -}; - -static struct s390_insn opcode_ed[] = { - { "mayl", 0x38, INSTR_RXF_FRRDF }, - { "myl", 0x39, INSTR_RXF_FRRDF }, - { "may", 0x3a, INSTR_RXF_FRRDF }, - { "my", 0x3b, INSTR_RXF_FRRDF }, - { "mayh", 0x3c, INSTR_RXF_FRRDF }, - { "myh", 0x3d, INSTR_RXF_FRRDF }, - { "sldt", 0x40, INSTR_RXF_FRRDF }, - { "srdt", 0x41, INSTR_RXF_FRRDF }, - { "slxt", 0x48, INSTR_RXF_FRRDF }, - { "srxt", 0x49, INSTR_RXF_FRRDF }, - { "tdcet", 0x50, INSTR_RXE_FRRD }, - { "tdget", 0x51, INSTR_RXE_FRRD }, - { "tdcdt", 0x54, INSTR_RXE_FRRD }, - { "tdgdt", 0x55, INSTR_RXE_FRRD }, - { "tdcxt", 0x58, INSTR_RXE_FRRD }, - { "tdgxt", 0x59, INSTR_RXE_FRRD }, - { "ley", 0x64, INSTR_RXY_FRRD }, - { "ldy", 0x65, INSTR_RXY_FRRD }, - { "stey", 0x66, INSTR_RXY_FRRD }, - { "stdy", 0x67, INSTR_RXY_FRRD }, - { "czdt", 0xa8, INSTR_RSL_LRDFU }, - { "czxt", 0xa9, INSTR_RSL_LRDFU }, - { "cdzt", 0xaa, INSTR_RSL_LRDFU }, - { "cxzt", 0xab, INSTR_RSL_LRDFU }, - { "ldeb", 0x04, INSTR_RXE_FRRD }, - { "lxdb", 0x05, INSTR_RXE_FRRD }, - { "lxeb", 0x06, INSTR_RXE_FRRD }, - { "mxdb", 0x07, INSTR_RXE_FRRD }, - { "keb", 0x08, INSTR_RXE_FRRD }, - { "ceb", 0x09, INSTR_RXE_FRRD }, - { "aeb", 0x0a, INSTR_RXE_FRRD }, - { "seb", 0x0b, INSTR_RXE_FRRD }, - { "mdeb", 0x0c, INSTR_RXE_FRRD }, - { "deb", 0x0d, INSTR_RXE_FRRD }, - { "maeb", 0x0e, INSTR_RXF_FRRDF }, - { "mseb", 0x0f, INSTR_RXF_FRRDF }, - { "tceb", 0x10, INSTR_RXE_FRRD }, - { "tcdb", 0x11, INSTR_RXE_FRRD }, - { "tcxb", 0x12, INSTR_RXE_FRRD }, - { "sqeb", 0x14, INSTR_RXE_FRRD }, - { "sqdb", 0x15, INSTR_RXE_FRRD }, - { "meeb", 0x17, INSTR_RXE_FRRD }, - { "kdb", 0x18, INSTR_RXE_FRRD }, - { "cdb", 0x19, INSTR_RXE_FRRD }, - { "adb", 0x1a, INSTR_RXE_FRRD }, - { "sdb", 0x1b, INSTR_RXE_FRRD }, - { "mdb", 0x1c, INSTR_RXE_FRRD }, - { "ddb", 0x1d, INSTR_RXE_FRRD }, - { "madb", 0x1e, INSTR_RXF_FRRDF }, - { "msdb", 0x1f, INSTR_RXF_FRRDF }, - { "lde", 0x24, INSTR_RXE_FRRD }, - { "lxd", 0x25, INSTR_RXE_FRRD }, - { "lxe", 0x26, INSTR_RXE_FRRD }, - { "mae", 0x2e, INSTR_RXF_FRRDF }, - { "mse", 0x2f, INSTR_RXF_FRRDF }, - { "sqe", 0x34, INSTR_RXE_FRRD }, - { "sqd", 0x35, INSTR_RXE_FRRD }, - { "mee", 0x37, INSTR_RXE_FRRD }, - { "mad", 0x3e, INSTR_RXF_FRRDF }, - { "msd", 0x3f, 
INSTR_RXF_FRRDF }, - { "cdpt", 0xae, INSTR_RSL_LRDFU }, - { "cxpt", 0xaf, INSTR_RSL_LRDFU }, - { "cpdt", 0xac, INSTR_RSL_LRDFU }, - { "cpxt", 0xad, INSTR_RSL_LRDFU }, - { "", 0, INSTR_INVALID } -}; +static char long_insn_name[][7] = LONG_INSN_INITIALIZER; +static struct s390_insn opcode[] = OPCODE_TABLE_INITIALIZER; +static struct s390_opcode_offset opcode_offset[] = OPCODE_OFFSET_INITIALIZER; /* Extracts an operand value from an instruction. */ static unsigned int extract_operand(unsigned char *code, @@ -1876,87 +391,24 @@ static unsigned int extract_operand(unsigned char *code, struct s390_insn *find_insn(unsigned char *code) { - unsigned char opfrag = code[1]; - unsigned char opmask; - struct s390_insn *table; + struct s390_opcode_offset *entry; + struct s390_insn *insn; + unsigned char opfrag; + int i; - switch (code[0]) { - case 0x01: - table = opcode_01; - break; - case 0xa5: - table = opcode_a5; - break; - case 0xa7: - table = opcode_a7; - break; - case 0xaa: - table = opcode_aa; - break; - case 0xb2: - table = opcode_b2; - break; - case 0xb3: - table = opcode_b3; - break; - case 0xb9: - table = opcode_b9; - break; - case 0xc0: - table = opcode_c0; - break; - case 0xc2: - table = opcode_c2; - break; - case 0xc4: - table = opcode_c4; - break; - case 0xc6: - table = opcode_c6; - break; - case 0xc8: - table = opcode_c8; - break; - case 0xcc: - table = opcode_cc; - break; - case 0xe3: - table = opcode_e3; - opfrag = code[5]; - break; - case 0xe5: - table = opcode_e5; - break; - case 0xe6: - table = opcode_e6; - opfrag = code[5]; - break; - case 0xe7: - table = opcode_e7; - opfrag = code[5]; - break; - case 0xeb: - table = opcode_eb; - opfrag = code[5]; - break; - case 0xec: - table = opcode_ec; - opfrag = code[5]; - break; - case 0xed: - table = opcode_ed; - opfrag = code[5]; - break; - default: - table = opcode; - opfrag = code[0]; - break; + for (i = 0; i < ARRAY_SIZE(opcode_offset); i++) { + entry = &opcode_offset[i]; + if (entry->opcode == code[0] || entry->opcode == 0) + break; } - while (table->format != INSTR_INVALID) { - opmask = formats[table->format][0]; - if (table->opfrag == (opfrag & opmask)) - return table; - table++; + + opfrag = *(code + entry->byte) & entry->mask; + + insn = &opcode[entry->offset]; + for (i = 0; i < entry->count; i++) { + if (insn->opfrag == opfrag) + return insn; + insn++; } return NULL; } @@ -1974,14 +426,14 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr) ptr = buffer; insn = find_insn(code); if (insn) { - if (insn->name[0] == '\0') - ptr += sprintf(ptr, "%s\t", - long_insn_name[(int) insn->name[1]]); + if (insn->zero == 0) + ptr += sprintf(ptr, "%.7s\t", + long_insn_name[insn->offset]); else ptr += sprintf(ptr, "%.5s\t", insn->name); /* Extract the operands. 
 	 */
 	separator = 0;
-	for (ops = formats[insn->format] + 1, i = 0;
+	for (ops = formats[insn->format], i = 0;
 	     *ops != 0 && i < 6; ops++, i++) {
 		operand = operands + *ops;
 		value = extract_operand(code, operand);
diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile
index 4b5e1e499527..712fc47bee6a 100644
--- a/arch/s390/tools/Makefile
+++ b/arch/s390/tools/Makefile
@@ -3,11 +3,21 @@
 #
 
 hostprogs-y += gen_facilities
+hostprogs-y += gen_opcode_table
+
 HOSTCFLAGS_gen_facilities.o += -Wall $(LINUXINCLUDE)
+HOSTCFLAGS_gen_opcode_table.o += -Wall $(LINUXINCLUDE)
 
 define filechk_facilities.h
 	$(obj)/gen_facilities
 endef
 
+define filechk_dis.h
+	( $(obj)/gen_opcode_table < $(srctree)/arch/$(ARCH)/tools/opcodes.txt )
+endef
+
 include/generated/facilities.h: $(obj)/gen_facilities FORCE
 	$(call filechk,facilities.h)
+
+include/generated/dis.h: $(obj)/gen_opcode_table FORCE
+	$(call filechk,dis.h)
diff --git a/arch/s390/tools/gen_opcode_table.c b/arch/s390/tools/gen_opcode_table.c
new file mode 100644
index 000000000000..01d4c5a4bfe9
--- /dev/null
+++ b/arch/s390/tools/gen_opcode_table.c
@@ -0,0 +1,336 @@
+/*
+ * Generate opcode table initializers for the in-kernel disassembler.
+ *
+ * Copyright IBM Corp. 2017
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#define STRING_SIZE_MAX 20
+
+struct insn_type {
+	unsigned char byte;
+	unsigned char mask;
+	char **format;
+};
+
+struct insn {
+	struct insn_type *type;
+	char opcode[STRING_SIZE_MAX];
+	char name[STRING_SIZE_MAX];
+	char upper[STRING_SIZE_MAX];
+	char format[STRING_SIZE_MAX];
+	unsigned int name_len;
+};
+
+struct insn_group {
+	struct insn_type *type;
+	int offset;
+	int count;
+	char opcode[2];
+};
+
+struct insn_format {
+	char *format;
+	int type;
+};
+
+struct gen_opcode {
+	struct insn *insn;
+	int nr;
+	struct insn_group *group;
+	int nr_groups;
+};
+
+/*
+ * Table of instruction format types. Each opcode consists of one byte
+ * (two nibbles), three nibbles, or two bytes (four nibbles).
+ * The byte member of each instruction format type entry defines
+ * within which byte of an instruction the third (and fourth) nibble
+ * of an opcode can be found. The mask member is the and-mask that
+ * needs to be applied to this byte in order to get the third (and
+ * fourth) nibble of the opcode.
+ * The format array defines all instruction formats (as defined in the
+ * Principles of Operation) which have the same position of the opcode
+ * nibbles.
+ * A special case is instruction formats with 1-byte opcodes. In this
+ * case the byte member is always zero, so that the mask is applied to
+ * the (only) byte that contains the opcode.
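+ *
+ * As an illustration (using two entries from the opcodes.txt table
+ * added by this patch): "b904 lgr" is an RRE-format instruction with
+ * a two-byte opcode, so its type entry has byte = 1 and mask = 0xff,
+ * and the second opcode byte is recovered as code[1] & 0xff == 0x04.
+ * "a7a ahi" is an RI-format instruction with a three-nibble opcode,
+ * so byte = 1 and mask = 0x0f, and only the low nibble of the second
+ * byte, code[1] & 0x0f == 0x0a, belongs to the opcode.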
+ */ +static struct insn_type insn_type_table[] = { + { + .byte = 0, + .mask = 0xff, + .format = (char *[]) { + "MII", + "RR", + "RS", + "RSI", + "RX", + "SI", + "SMI", + "SS", + NULL, + }, + }, + { + .byte = 1, + .mask = 0x0f, + .format = (char *[]) { + "RI", + "RIL", + "SSF", + NULL, + }, + }, + { + .byte = 1, + .mask = 0xff, + .format = (char *[]) { + "E", + "IE", + "RRE", + "RRF", + "RRR", + "S", + "SIL", + "SSE", + NULL, + }, + }, + { + .byte = 5, + .mask = 0xff, + .format = (char *[]) { + "RIE", + "RIS", + "RRS", + "RSE", + "RSL", + "RSY", + "RXE", + "RXF", + "RXY", + "SIY", + "VRI", + "VRR", + "VRS", + "VRV", + "VRX", + "VSI", + NULL, + }, + }, +}; + +static struct insn_type *insn_format_to_type(char *format) +{ + char tmp[STRING_SIZE_MAX]; + char *base_format, **ptr; + int i; + + strcpy(tmp, format); + base_format = tmp; + base_format = strsep(&base_format, "_"); + for (i = 0; i < sizeof(insn_type_table) / sizeof(insn_type_table[0]); i++) { + ptr = insn_type_table[i].format; + while (*ptr) { + if (!strcmp(base_format, *ptr)) + return &insn_type_table[i]; + ptr++; + } + } + exit(EXIT_FAILURE); +} + +static void read_instructions(struct gen_opcode *desc) +{ + struct insn insn; + int rc, i; + + while (1) { + rc = scanf("%s %s %s", insn.opcode, insn.name, insn.format); + if (rc == EOF) + break; + if (rc != 3) + exit(EXIT_FAILURE); + insn.type = insn_format_to_type(insn.format); + insn.name_len = strlen(insn.name); + for (i = 0; i <= insn.name_len; i++) + insn.upper[i] = toupper((unsigned char)insn.name[i]); + desc->nr++; + desc->insn = realloc(desc->insn, desc->nr * sizeof(*desc->insn)); + if (!desc->insn) + exit(EXIT_FAILURE); + desc->insn[desc->nr - 1] = insn; + } +} + +static int cmpformat(const void *a, const void *b) +{ + return strcmp(((struct insn *)a)->format, ((struct insn *)b)->format); +} + +static void print_formats(struct gen_opcode *desc) +{ + char *format; + int i, count; + + qsort(desc->insn, desc->nr, sizeof(*desc->insn), cmpformat); + format = ""; + count = 0; + printf("enum {\n"); + for (i = 0; i < desc->nr; i++) { + if (!strcmp(format, desc->insn[i].format)) + continue; + count++; + format = desc->insn[i].format; + printf("\tINSTR_%s,\n", format); + } + printf("}; /* %d */\n\n", count); +} + +static int cmp_long_insn(const void *a, const void *b) +{ + return strcmp(((struct insn *)a)->name, ((struct insn *)b)->name); +} + +static void print_long_insn(struct gen_opcode *desc) +{ + struct insn *insn; + int i, count; + + qsort(desc->insn, desc->nr, sizeof(*desc->insn), cmp_long_insn); + count = 0; + printf("enum {\n"); + for (i = 0; i < desc->nr; i++) { + insn = &desc->insn[i]; + if (insn->name_len < 6) + continue; + printf("\tLONG_INSN_%s,\n", insn->upper); + count++; + } + printf("}; /* %d */\n\n", count); + + printf("#define LONG_INSN_INITIALIZER { \\\n"); + for (i = 0; i < desc->nr; i++) { + insn = &desc->insn[i]; + if (insn->name_len < 6) + continue; + printf("\t[LONG_INSN_%s] = \"%s\", \\\n", insn->upper, insn->name); + } + printf("}\n\n"); +} + +static void print_opcode(struct insn *insn, int nr) +{ + char *opcode; + + opcode = insn->opcode; + if (insn->type->byte != 0) + opcode += 2; + printf("\t[%4d] = { .opfrag = 0x%s, .format = INSTR_%s, ", nr, opcode, insn->format); + if (insn->name_len < 6) + printf(".name = \"%s\" ", insn->name); + else + printf(".offset = LONG_INSN_%s ", insn->upper); + printf("}, \\\n"); +} + +static void add_to_group(struct gen_opcode *desc, struct insn *insn, int offset) +{ + struct insn_group *group; + + group = desc->group ? 
&desc->group[desc->nr_groups - 1] : NULL; + if (group && (!strncmp(group->opcode, insn->opcode, 2) || group->type->byte == 0)) { + group->count++; + return; + } + desc->nr_groups++; + desc->group = realloc(desc->group, desc->nr_groups * sizeof(*desc->group)); + if (!desc->group) + exit(EXIT_FAILURE); + group = &desc->group[desc->nr_groups - 1]; + strncpy(group->opcode, insn->opcode, 2); + group->type = insn->type; + group->offset = offset; + group->count = 1; +} + +static int cmpopcode(const void *a, const void *b) +{ + return strcmp(((struct insn *)a)->opcode, ((struct insn *)b)->opcode); +} + +static void print_opcode_table(struct gen_opcode *desc) +{ + char opcode[2] = ""; + struct insn *insn; + int i, offset; + + qsort(desc->insn, desc->nr, sizeof(*desc->insn), cmpopcode); + printf("#define OPCODE_TABLE_INITIALIZER { \\\n"); + offset = 0; + for (i = 0; i < desc->nr; i++) { + insn = &desc->insn[i]; + if (insn->type->byte == 0) + continue; + add_to_group(desc, insn, offset); + if (strncmp(opcode, insn->opcode, 2)) { + strncpy(opcode, insn->opcode, 2); + printf("\t/* %.2s */ \\\n", opcode); + } + print_opcode(insn, offset); + offset++; + } + printf("\t/* 1-byte opcode instructions */ \\\n"); + for (i = 0; i < desc->nr; i++) { + insn = &desc->insn[i]; + if (insn->type->byte != 0) + continue; + add_to_group(desc, insn, offset); + print_opcode(insn, offset); + offset++; + } + printf("}\n\n"); +} + +static void print_opcode_table_offsets(struct gen_opcode *desc) +{ + struct insn_group *group; + int i; + + printf("#define OPCODE_OFFSET_INITIALIZER { \\\n"); + for (i = 0; i < desc->nr_groups; i++) { + group = &desc->group[i]; + printf("\t{ .opcode = 0x%.2s, .mask = 0x%02x, .byte = %d, .offset = %d, .count = %d }, \\\n", + group->opcode, group->type->mask, group->type->byte, group->offset, group->count); + } + printf("}\n\n"); +} + +int main(int argc, char **argv) +{ + struct gen_opcode _desc = { 0 }; + struct gen_opcode *desc = &_desc; + + read_instructions(desc); + printf("#ifndef __S390_GENERATED_DIS_H__\n"); + printf("#define __S390_GENERATED_DIS_H__\n"); + printf("/*\n"); + printf(" * DO NOT MODIFY.\n"); + printf(" *\n"); + printf(" * This file was generated by %s\n", __FILE__); + printf(" */\n\n"); + print_formats(desc); + print_long_insn(desc); + print_opcode_table(desc); + print_opcode_table_offsets(desc); + printf("#endif\n"); + exit(EXIT_SUCCESS); +} diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt new file mode 100644 index 000000000000..1cbed82cd17b --- /dev/null +++ b/arch/s390/tools/opcodes.txt @@ -0,0 +1,1183 @@ +0101 pr E +0102 upt E +0104 ptff E +0107 sckpf E +010a pfpo E +010b tam E +010c sam24 E +010d sam31 E +010e sam64 E +01ff trap2 E +04 spm RR_R0 +05 balr RR_RR +06 bctr RR_RR +07 bcr RR_UR +0a svc RR_U0 +0b bsm RR_RR +0c bassm RR_RR +0d basr RR_RR +0e mvcl RR_RR +0f clcl RR_RR +10 lpr RR_RR +11 lnr RR_RR +12 ltr RR_RR +13 lcr RR_RR +14 nr RR_RR +15 clr RR_RR +16 or RR_RR +17 xr RR_RR +18 lr RR_RR +19 cr RR_RR +1a ar RR_RR +1b sr RR_RR +1c mr RR_RR +1d dr RR_RR +1e alr RR_RR +1f slr RR_RR +20 lpdr RR_FF +21 lndr RR_FF +22 ltdr RR_FF +23 lcdr RR_FF +24 hdr RR_FF +25 ldxr RR_FF +26 mxr RR_FF +27 mxdr RR_FF +28 ldr RR_FF +29 cdr RR_FF +2a adr RR_FF +2b sdr RR_FF +2c mdr RR_FF +2d ddr RR_FF +2e awr RR_FF +2f swr RR_FF +30 lper RR_FF +31 lner RR_FF +32 lter RR_FF +33 lcer RR_FF +34 her RR_FF +35 ledr RR_FF +36 axr RR_FF +37 sxr RR_FF +38 ler RR_FF +39 cer RR_FF +3a aer RR_FF +3b ser RR_FF +3c mder RR_FF +3d der RR_FF +3e aur RR_FF +3f sur RR_FF +40 sth 
RX_RRRD +41 la RX_RRRD +42 stc RX_RRRD +43 ic RX_RRRD +44 ex RX_RRRD +45 bal RX_RRRD +46 bct RX_RRRD +47 bc RX_URRD +48 lh RX_RRRD +49 ch RX_RRRD +4a ah RX_RRRD +4b sh RX_RRRD +4c mh RX_RRRD +4d bas RX_RRRD +4e cvd RX_RRRD +4f cvb RX_RRRD +50 st RX_RRRD +51 lae RX_RRRD +54 n RX_RRRD +55 cl RX_RRRD +56 o RX_RRRD +57 x RX_RRRD +58 l RX_RRRD +59 c RX_RRRD +5a a RX_RRRD +5b s RX_RRRD +5c m RX_RRRD +5d d RX_RRRD +5e al RX_RRRD +5f sl RX_RRRD +60 std RX_FRRD +67 mxd RX_FRRD +68 ld RX_FRRD +69 cd RX_FRRD +6a ad RX_FRRD +6b sd RX_FRRD +6c md RX_FRRD +6d dd RX_FRRD +6e aw RX_FRRD +6f sw RX_FRRD +70 ste RX_FRRD +71 ms RX_RRRD +78 le RX_FRRD +79 ce RX_FRRD +7a ae RX_FRRD +7b se RX_FRRD +7c mde RX_FRRD +7d de RX_FRRD +7e au RX_FRRD +7f su RX_FRRD +80 ssm SI_RD +82 lpsw SI_RD +83 diag RS_RRRD +84 brxh RSI_RRP +85 brxle RSI_RRP +86 bxh RS_RRRD +87 bxle RS_RRRD +88 srl RS_R0RD +89 sll RS_R0RD +8a sra RS_R0RD +8b sla RS_R0RD +8c srdl RS_R0RD +8d sldl RS_R0RD +8e srda RS_R0RD +8f slda RS_R0RD +90 stm RS_RRRD +91 tm SI_URD +92 mvi SI_URD +93 ts SI_RD +94 ni SI_URD +95 cli SI_URD +96 oi SI_URD +97 xi SI_URD +98 lm RS_RRRD +99 trace RS_RRRD +9a lam RS_AARD +9b stam RS_AARD +a50 iihh RI_RU +a51 iihl RI_RU +a52 iilh RI_RU +a53 iill RI_RU +a54 nihh RI_RU +a55 nihl RI_RU +a56 nilh RI_RU +a57 nill RI_RU +a58 oihh RI_RU +a59 oihl RI_RU +a5a oilh RI_RU +a5b oill RI_RU +a5c llihh RI_RU +a5d llihl RI_RU +a5e llilh RI_RU +a5f llill RI_RU +a70 tmlh RI_RU +a71 tmll RI_RU +a72 tmhh RI_RU +a73 tmhl RI_RU +a74 brc RI_UP +a75 bras RI_RP +a76 brct RI_RP +a77 brctg RI_RP +a78 lhi RI_RI +a79 lghi RI_RI +a7a ahi RI_RI +a7b aghi RI_RI +a7c mhi RI_RI +a7d mghi RI_RI +a7e chi RI_RI +a7f cghi RI_RI +a8 mvcle RS_RRRD +a9 clcle RS_RRRD +aa0 rinext RI_RI +aa1 rion RI_RI +aa2 tric RI_RI +aa3 rioff RI_RI +aa4 riemit RI_RI +ac stnsm SI_URD +ad stosm SI_URD +ae sigp RS_RRRD +af mc SI_URD +b1 lra RX_RRRD +b202 stidp S_RD +b204 sck S_RD +b205 stck S_RD +b206 sckc S_RD +b207 stckc S_RD +b208 spt S_RD +b209 stpt S_RD +b20a spka S_RD +b20b ipk S_00 +b20d ptlb S_00 +b210 spx S_RD +b211 stpx S_RD +b212 stap S_RD +b214 sie S_RD +b218 pc S_RD +b219 sac S_RD +b21a cfc S_RD +b220 servc RRE_RR +b221 ipte RRF_RURR +b222 ipm RRE_R0 +b223 ivsk RRE_RR +b224 iac RRE_R0 +b225 ssar RRE_R0 +b226 epar RRE_R0 +b227 esar RRE_R0 +b228 pt RRE_RR +b229 iske RRE_RR +b22a rrbe RRE_RR +b22b sske RRF_U0RR +b22c tb RRE_RR +b22d dxr RRE_FF +b22e pgin RRE_RR +b22f pgout RRE_RR +b230 csch S_00 +b231 hsch S_00 +b232 msch S_RD +b233 ssch S_RD +b234 stsch S_RD +b235 tsch S_RD +b236 tpi S_RD +b237 sal S_00 +b238 rsch S_00 +b239 stcrw S_RD +b23a stcps S_RD +b23b rchp S_00 +b23c schm S_00 +b240 bakr RRE_RR +b241 cksm RRE_RR +b244 sqdr RRE_FF +b245 sqer RRE_FF +b246 stura RRE_RR +b247 msta RRE_R0 +b248 palb RRE_00 +b249 ereg RRE_RR +b24a esta RRE_RR +b24b lura RRE_RR +b24c tar RRE_AR +b24d cpya RRE_AA +b24e sar RRE_AR +b24f ear RRE_RA +b250 csp RRE_RR +b252 msr RRE_RR +b254 mvpg RRE_RR +b255 mvst RRE_RR +b256 sthyi RRE_RR +b257 cuse RRE_RR +b258 bsg RRE_RR +b25a bsa RRE_RR +b25d clst RRE_RR +b25e srst RRE_RR +b263 cmpsc RRE_RR +b274 siga S_RD +b276 xsch S_00 +b277 rp S_RD +b278 stcke S_RD +b279 sacf S_RD +b27c stckf S_RD +b27d stsi S_RD +b280 lpp S_RD +b284 lcctl S_RD +b285 lpctl S_RD +b286 qsi S_RD +b287 lsctl S_RD +b28e qctri S_RD +b299 srnm S_RD +b29c stfpc S_RD +b29d lfpc S_RD +b2a5 tre RRE_RR +b2a6 cu21 RRF_U0RR +b2a7 cu12 RRF_U0RR +b2b0 stfle S_RD +b2b1 stfl S_RD +b2b2 lpswe S_RD +b2b8 srnmb S_RD +b2b9 srnmt S_RD +b2bd lfas S_RD +b2e0 scctr RRE_RR +b2e1 spctr RRE_RR +b2e4 
ecctr RRE_RR +b2e5 epctr RRE_RR +b2e8 ppa RRF_U0RR +b2ec etnd RRE_R0 +b2ed ecpga RRE_RR +b2f8 tend S_00 +b2fa niai IE_UU +b2fc tabort S_RD +b2ff trap4 S_RD +b300 lpebr RRE_FF +b301 lnebr RRE_FF +b302 ltebr RRE_FF +b303 lcebr RRE_FF +b304 ldebr RRE_FF +b305 lxdbr RRE_FF +b306 lxebr RRE_FF +b307 mxdbr RRE_FF +b308 kebr RRE_FF +b309 cebr RRE_FF +b30a aebr RRE_FF +b30b sebr RRE_FF +b30c mdebr RRE_FF +b30d debr RRE_FF +b30e maebr RRF_F0FF +b30f msebr RRF_F0FF +b310 lpdbr RRE_FF +b311 lndbr RRE_FF +b312 ltdbr RRE_FF +b313 lcdbr RRE_FF +b314 sqebr RRE_FF +b315 sqdbr RRE_FF +b316 sqxbr RRE_FF +b317 meebr RRE_FF +b318 kdbr RRE_FF +b319 cdbr RRE_FF +b31a adbr RRE_FF +b31b sdbr RRE_FF +b31c mdbr RRE_FF +b31d ddbr RRE_FF +b31e madbr RRF_F0FF +b31f msdbr RRF_F0FF +b324 lder RRE_FF +b325 lxdr RRE_FF +b326 lxer RRE_FF +b32e maer RRF_F0FF +b32f mser RRF_F0FF +b336 sqxr RRE_FF +b337 meer RRE_FF +b338 maylr RRF_F0FF +b339 mylr RRF_F0FF +b33a mayr RRF_F0FF +b33b myr RRF_F0FF +b33c mayhr RRF_F0FF +b33d myhr RRF_F0FF +b33e madr RRF_F0FF +b33f msdr RRF_F0FF +b340 lpxbr RRE_FF +b341 lnxbr RRE_FF +b342 ltxbr RRE_FF +b343 lcxbr RRE_FF +b344 ledbra RRF_UUFF +b345 ldxbra RRF_UUFF +b346 lexbra RRF_UUFF +b347 fixbra RRF_UUFF +b348 kxbr RRE_FF +b349 cxbr RRE_FF +b34a axbr RRE_FF +b34b sxbr RRE_FF +b34c mxbr RRE_FF +b34d dxbr RRE_FF +b350 tbedr RRF_U0FF +b351 tbdr RRF_U0FF +b353 diebr RRF_FUFF +b357 fiebra RRF_UUFF +b358 thder RRE_FF +b359 thdr RRE_FF +b35b didbr RRF_FUFF +b35f fidbra RRF_UUFF +b360 lpxr RRE_FF +b361 lnxr RRE_FF +b362 ltxr RRE_FF +b363 lcxr RRE_FF +b365 lxr RRE_FF +b366 lexr RRE_FF +b367 fixr RRE_FF +b369 cxr RRE_FF +b370 lpdfr RRE_FF +b371 lndfr RRE_FF +b372 cpsdr RRF_F0FF2 +b373 lcdfr RRE_FF +b374 lzer RRE_F0 +b375 lzdr RRE_F0 +b376 lzxr RRE_F0 +b377 fier RRE_FF +b37f fidr RRE_FF +b384 sfpc RRE_RR +b385 sfasr RRE_R0 +b38c efpc RRE_RR +b390 celfbr RRF_UUFR +b391 cdlfbr RRF_UUFR +b392 cxlfbr RRF_UUFR +b394 cefbra RRF_UUFR +b395 cdfbra RRF_UUFR +b396 cxfbra RRF_UUFR +b398 cfebra RRF_UURF +b399 cfdbra RRF_UURF +b39a cfxbra RRF_UURF +b39c clfebr RRF_UURF +b39d clfdbr RRF_UURF +b39e clfxbr RRF_UURF +b3a0 celgbr RRF_UUFR +b3a1 cdlgbr RRF_UUFR +b3a2 cxlgbr RRF_UUFR +b3a4 cegbra RRF_UUFR +b3a5 cdgbra RRF_UUFR +b3a6 cxgbra RRF_UUFR +b3a8 cgebra RRF_UURF +b3a9 cgdbra RRF_UURF +b3aa cgxbra RRF_UURF +b3ac clgebr RRF_UURF +b3ad clgdbr RRF_UURF +b3ae clgxbr RRF_UURF +b3b4 cefr RRE_FR +b3b5 cdfr RRE_FR +b3b6 cxfr RRE_FR +b3b8 cfer RRF_U0RF +b3b9 cfdr RRF_U0RF +b3ba cfxr RRF_U0RF +b3c1 ldgr RRE_FR +b3c4 cegr RRE_FR +b3c5 cdgr RRE_FR +b3c6 cxgr RRE_FR +b3c8 cger RRF_U0RF +b3c9 cgdr RRF_U0RF +b3ca cgxr RRF_U0RF +b3cd lgdr RRE_RF +b3d0 mdtra RRF_FUFF2 +b3d1 ddtra RRF_FUFF2 +b3d2 adtra RRF_FUFF2 +b3d3 sdtra RRF_FUFF2 +b3d4 ldetr RRF_0UFF +b3d5 ledtr RRF_UUFF +b3d6 ltdtr RRE_FF +b3d7 fidtr RRF_UUFF +b3d8 mxtra RRF_FUFF2 +b3d9 dxtra RRF_FUFF2 +b3da axtra RRF_FUFF2 +b3db sxtra RRF_FUFF2 +b3dc lxdtr RRF_0UFF +b3dd ldxtr RRF_UUFF +b3de ltxtr RRE_FF +b3df fixtr RRF_UUFF +b3e0 kdtr RRE_FF +b3e1 cgdtra RRF_UURF +b3e2 cudtr RRE_RF +b3e3 csdtr RRF_0URF +b3e4 cdtr RRE_FF +b3e5 eedtr RRE_RF +b3e7 esdtr RRE_RF +b3e8 kxtr RRE_FF +b3e9 cgxtra RRF_UURF +b3ea cuxtr RRE_RF +b3eb csxtr RRF_0URF +b3ec cxtr RRE_FF +b3ed eextr RRE_RF +b3ef esxtr RRE_RF +b3f1 cdgtra RRF_UUFR +b3f2 cdutr RRE_FR +b3f3 cdstr RRE_FR +b3f4 cedtr RRE_FF +b3f5 qadtr RRF_FUFF +b3f6 iedtr RRF_F0FR +b3f7 rrdtr RRF_FFRU +b3f9 cxgtra RRF_UUFR +b3fa cxutr RRE_FR +b3fb cxstr RRE_FR +b3fc cextr RRE_FF +b3fd qaxtr RRF_FUFF +b3fe iextr RRF_F0FR +b3ff rrxtr RRF_FFRU +b6 stctl 
RS_CCRD +b7 lctl RS_CCRD +b900 lpgr RRE_RR +b901 lngr RRE_RR +b902 ltgr RRE_RR +b903 lcgr RRE_RR +b904 lgr RRE_RR +b905 lurag RRE_RR +b906 lgbr RRE_RR +b907 lghr RRE_RR +b908 agr RRE_RR +b909 sgr RRE_RR +b90a algr RRE_RR +b90b slgr RRE_RR +b90c msgr RRE_RR +b90d dsgr RRE_RR +b90e eregg RRE_RR +b90f lrvgr RRE_RR +b910 lpgfr RRE_RR +b911 lngfr RRE_RR +b912 ltgfr RRE_RR +b913 lcgfr RRE_RR +b914 lgfr RRE_RR +b916 llgfr RRE_RR +b917 llgtr RRE_RR +b918 agfr RRE_RR +b919 sgfr RRE_RR +b91a algfr RRE_RR +b91b slgfr RRE_RR +b91c msgfr RRE_RR +b91d dsgfr RRE_RR +b91e kmac RRE_RR +b91f lrvr RRE_RR +b920 cgr RRE_RR +b921 clgr RRE_RR +b925 sturg RRE_RR +b926 lbr RRE_RR +b927 lhr RRE_RR +b928 pckmo RRE_00 +b929 kma RRF_R0RR +b92a kmf RRE_RR +b92b kmo RRE_RR +b92c pcc RRE_00 +b92d kmctr RRF_R0RR +b92e km RRE_RR +b92f kmc RRE_RR +b930 cgfr RRE_RR +b931 clgfr RRE_RR +b93c ppno RRE_RR +b93e kimd RRE_RR +b93f klmd RRE_RR +b941 cfdtr RRF_UURF +b942 clgdtr RRF_UURF +b943 clfdtr RRF_UURF +b946 bctgr RRE_RR +b949 cfxtr RRF_UURF +b94a clgxtr RRF_UURF +b94b clfxtr RRF_UURF +b951 cdftr RRF_UUFR +b952 cdlgtr RRF_UUFR +b953 cdlftr RRF_UUFR +b959 cxftr RRF_UUFR +b95a cxlgtr RRF_UUFR +b95b cxlftr RRF_UUFR +b960 cgrt RRF_U0RR +b961 clgrt RRF_U0RR +b972 crt RRF_U0RR +b973 clrt RRF_U0RR +b980 ngr RRE_RR +b981 ogr RRE_RR +b982 xgr RRE_RR +b983 flogr RRE_RR +b984 llgcr RRE_RR +b985 llghr RRE_RR +b986 mlgr RRE_RR +b987 dlgr RRE_RR +b988 alcgr RRE_RR +b989 slbgr RRE_RR +b98a cspg RRE_RR +b98d epsw RRE_RR +b98e idte RRF_RURR2 +b98f crdte RRF_RURR2 +b990 trtt RRF_U0RR +b991 trto RRF_U0RR +b992 trot RRF_U0RR +b993 troo RRF_U0RR +b994 llcr RRE_RR +b995 llhr RRE_RR +b996 mlr RRE_RR +b997 dlr RRE_RR +b998 alcr RRE_RR +b999 slbr RRE_RR +b99a epair RRE_R0 +b99b esair RRE_R0 +b99d esea RRE_R0 +b99e pti RRE_RR +b99f ssair RRE_R0 +b9a1 tpei RRE_RR +b9a2 ptf RRE_R0 +b9aa lptea RRF_RURR2 +b9ac irbm RRE_RR +b9ae rrbm RRE_RR +b9af pfmf RRE_RR +b9b0 cu14 RRF_U0RR +b9b1 cu24 RRF_U0RR +b9b2 cu41 RRE_RR +b9b3 cu42 RRE_RR +b9bd trtre RRF_U0RR +b9be srstu RRE_RR +b9bf trte RRF_U0RR +b9c8 ahhhr RRF_R0RR2 +b9c9 shhhr RRF_R0RR2 +b9ca alhhhr RRF_R0RR2 +b9cb slhhhr RRF_R0RR2 +b9cd chhr RRE_RR +b9cf clhhr RRE_RR +b9d0 pcistg RRE_RR +b9d2 pcilg RRE_RR +b9d3 rpcit RRE_RR +b9d8 ahhlr RRF_R0RR2 +b9d9 shhlr RRF_R0RR2 +b9da alhhlr RRF_R0RR2 +b9db slhhlr RRF_R0RR2 +b9dd chlr RRE_RR +b9df clhlr RRE_RR +b9e0 locfhr RRF_U0RR +b9e1 popcnt RRE_RR +b9e2 locgr RRF_U0RR +b9e4 ngrk RRF_R0RR2 +b9e6 ogrk RRF_R0RR2 +b9e7 xgrk RRF_R0RR2 +b9e8 agrk RRF_R0RR2 +b9e9 sgrk RRF_R0RR2 +b9ea algrk RRF_R0RR2 +b9eb slgrk RRF_R0RR2 +b9ec mgrk RRF_R0RR2 +b9ed msgrkc RRF_R0RR2 +b9f2 locr RRF_U0RR +b9f4 nrk RRF_R0RR2 +b9f6 ork RRF_R0RR2 +b9f7 xrk RRF_R0RR2 +b9f8 ark RRF_R0RR2 +b9f9 srk RRF_R0RR2 +b9fa alrk RRF_R0RR2 +b9fb slrk RRF_R0RR2 +b9fd msrkc RRF_R0RR2 +ba cs RS_RRRD +bb cds RS_RRRD +bd clm RS_RURD +be stcm RS_RURD +bf icm RS_RURD +c00 larl RIL_RP +c01 lgfi RIL_RI +c04 brcl RIL_UP +c05 brasl RIL_RP +c06 xihf RIL_RU +c07 xilf RIL_RU +c08 iihf RIL_RU +c09 iilf RIL_RU +c0a nihf RIL_RU +c0b nilf RIL_RU +c0c oihf RIL_RU +c0d oilf RIL_RU +c0e llihf RIL_RU +c0f llilf RIL_RU +c20 msgfi RIL_RI +c21 msfi RIL_RI +c24 slgfi RIL_RU +c25 slfi RIL_RU +c28 agfi RIL_RI +c29 afi RIL_RI +c2a algfi RIL_RU +c2b alfi RIL_RU +c2c cgfi RIL_RI +c2d cfi RIL_RI +c2e clgfi RIL_RU +c2f clfi RIL_RU +c42 llhrl RIL_RP +c44 lghrl RIL_RP +c45 lhrl RIL_RP +c46 llghrl RIL_RP +c47 sthrl RIL_RP +c48 lgrl RIL_RP +c4b stgrl RIL_RP +c4c lgfrl RIL_RP +c4d lrl RIL_RP +c4e llgfrl RIL_RP +c4f strl RIL_RP +c5 bprp MII_UPP 
+c60 exrl RIL_RP +c62 pfdrl RIL_UP +c64 cghrl RIL_RP +c65 chrl RIL_RP +c66 clghrl RIL_RP +c67 clhrl RIL_RP +c68 cgrl RIL_RP +c6a clgrl RIL_RP +c6c cgfrl RIL_RP +c6d crl RIL_RP +c6e clgfrl RIL_RP +c6f clrl RIL_RP +c7 bpp SMI_U0RDP +c80 mvcos SSF_RRDRD +c81 ectg SSF_RRDRD +c82 csst SSF_RRDRD +c84 lpd SSF_RRDRD2 +c85 lpdg SSF_RRDRD2 +cc6 brcth RIL_RP +cc8 aih RIL_RI +cca alsih RIL_RI +ccb alsihn RIL_RI +ccd cih RIL_RI +ccf clih RIL_RU +d0 trtr SS_L0RDRD +d1 mvn SS_L0RDRD +d2 mvc SS_L0RDRD +d3 mvz SS_L0RDRD +d4 nc SS_L0RDRD +d5 clc SS_L0RDRD +d6 oc SS_L0RDRD +d7 xc SS_L0RDRD +d9 mvck SS_RRRDRD +da mvcp SS_RRRDRD +db mvcs SS_RRRDRD +dc tr SS_L0RDRD +dd trt SS_L0RDRD +de ed SS_L0RDRD +df edmk SS_L0RDRD +e1 pku SS_L2RDRD +e2 unpku SS_L0RDRD +e302 ltg RXY_RRRD +e303 lrag RXY_RRRD +e304 lg RXY_RRRD +e306 cvby RXY_RRRD +e308 ag RXY_RRRD +e309 sg RXY_RRRD +e30a alg RXY_RRRD +e30b slg RXY_RRRD +e30c msg RXY_RRRD +e30d dsg RXY_RRRD +e30e cvbg RXY_RRRD +e30f lrvg RXY_RRRD +e312 lt RXY_RRRD +e313 lray RXY_RRRD +e314 lgf RXY_RRRD +e315 lgh RXY_RRRD +e316 llgf RXY_RRRD +e317 llgt RXY_RRRD +e318 agf RXY_RRRD +e319 sgf RXY_RRRD +e31a algf RXY_RRRD +e31b slgf RXY_RRRD +e31c msgf RXY_RRRD +e31d dsgf RXY_RRRD +e31e lrv RXY_RRRD +e31f lrvh RXY_RRRD +e320 cg RXY_RRRD +e321 clg RXY_RRRD +e324 stg RXY_RRRD +e325 ntstg RXY_RRRD +e326 cvdy RXY_RRRD +e32a lzrg RXY_RRRD +e32e cvdg RXY_RRRD +e32f strvg RXY_RRRD +e330 cgf RXY_RRRD +e331 clgf RXY_RRRD +e332 ltgf RXY_RRRD +e334 cgh RXY_RRRD +e336 pfd RXY_URRD +e338 agh RXY_RRRD +e339 sgh RXY_RRRD +e33a llzrgf RXY_RRRD +e33b lzrf RXY_RRRD +e33c mgh RXY_RRRD +e33e strv RXY_RRRD +e33f strvh RXY_RRRD +e346 bctg RXY_RRRD +e347 bic RXY_URRD +e348 llgfsg RXY_RRRD +e349 stgsc RXY_RRRD +e34c lgg RXY_RRRD +e34d lgsc RXY_RRRD +e350 sty RXY_RRRD +e351 msy RXY_RRRD +e353 msc RXY_RRRD +e354 ny RXY_RRRD +e355 cly RXY_RRRD +e356 oy RXY_RRRD +e357 xy RXY_RRRD +e358 ly RXY_RRRD +e359 cy RXY_RRRD +e35a ay RXY_RRRD +e35b sy RXY_RRRD +e35c mfy RXY_RRRD +e35e aly RXY_RRRD +e35f sly RXY_RRRD +e370 sthy RXY_RRRD +e371 lay RXY_RRRD +e372 stcy RXY_RRRD +e373 icy RXY_RRRD +e375 laey RXY_RRRD +e376 lb RXY_RRRD +e377 lgb RXY_RRRD +e378 lhy RXY_RRRD +e379 chy RXY_RRRD +e37a ahy RXY_RRRD +e37b shy RXY_RRRD +e37c mhy RXY_RRRD +e380 ng RXY_RRRD +e381 og RXY_RRRD +e382 xg RXY_RRRD +e383 msgc RXY_RRRD +e384 mg RXY_RRRD +e385 lgat RXY_RRRD +e386 mlg RXY_RRRD +e387 dlg RXY_RRRD +e388 alcg RXY_RRRD +e389 slbg RXY_RRRD +e38e stpq RXY_RRRD +e38f lpq RXY_RRRD +e390 llgc RXY_RRRD +e391 llgh RXY_RRRD +e394 llc RXY_RRRD +e395 llh RXY_RRRD +e396 ml RXY_RRRD +e397 dl RXY_RRRD +e398 alc RXY_RRRD +e399 slb RXY_RRRD +e39c llgtat RXY_RRRD +e39d llgfat RXY_RRRD +e39f lat RXY_RRRD +e3c0 lbh RXY_RRRD +e3c2 llch RXY_RRRD +e3c3 stch RXY_RRRD +e3c4 lhh RXY_RRRD +e3c6 llhh RXY_RRRD +e3c7 sthh RXY_RRRD +e3c8 lfhat RXY_RRRD +e3ca lfh RXY_RRRD +e3cb stfh RXY_RRRD +e3cd chf RXY_RRRD +e3cf clhf RXY_RRRD +e3d0 mpcifc RXY_RRRD +e3d4 stpcifc RXY_RRRD +e500 lasp SSE_RDRD +e501 tprot SSE_RDRD +e502 strag SSE_RDRD +e50e mvcsk SSE_RDRD +e50f mvcdk SSE_RDRD +e544 mvhhi SIL_RDI +e548 mvghi SIL_RDI +e54c mvhi SIL_RDI +e554 chhsi SIL_RDI +e555 clhhsi SIL_RDU +e558 cghsi SIL_RDI +e559 clghsi SIL_RDU +e55c chsi SIL_RDI +e55d clfhsi SIL_RDU +e560 tbegin SIL_RDU +e561 tbeginc SIL_RDU +e634 vpkz VSI_URDV +e635 vlrl VSI_URDV +e637 vlrlr VRS_RRDV +e63c vupkz VSI_URDV +e63d vstrl VSI_URDV +e63f vstrlr VRS_RRDV +e649 vlip VRI_V0UU2 +e650 vcvb VRR_RV0U +e652 vcvbg VRR_RV0U +e658 vcvd VRI_VR0UU +e659 vsrp VRI_VVUUU2 +e65a vcvdg VRI_VR0UU +e65b vpsop 
VRI_VVUUU2 +e65f vtp VRR_0V +e671 vap VRI_VVV0UU2 +e673 vsp VRI_VVV0UU2 +e677 vcp VRR_0VV0U +e678 vmp VRI_VVV0UU2 +e679 vmsp VRI_VVV0UU2 +e67a vdp VRI_VVV0UU2 +e67b vrp VRI_VVV0UU2 +e67e vsdp VRI_VVV0UU2 +e700 vleb VRX_VRRDU +e701 vleh VRX_VRRDU +e702 vleg VRX_VRRDU +e703 vlef VRX_VRRDU +e704 vllez VRX_VRRDU +e705 vlrep VRX_VRRDU +e706 vl VRX_VRRD +e707 vlbb VRX_VRRDU +e708 vsteb VRX_VRRDU +e709 vsteh VRX_VRRDU +e70a vsteg VRX_VRRDU +e70b vstef VRX_VRRDU +e70e vst VRX_VRRD +e712 vgeg VRV_VVXRDU +e713 vgef VRV_VVXRDU +e71a vsceg VRV_VVXRDU +e71b vscef VRV_VVXRDU +e721 vlgv VRS_RVRDU +e722 vlvg VRS_VRRDU +e727 lcbb RXE_RRRDU +e730 vesl VRS_VVRDU +e733 verll VRS_VVRDU +e736 vlm VRS_VVRD +e737 vll VRS_VRRD +e738 vesrl VRS_VVRDU +e73a vesra VRS_VVRDU +e73e vstm VRS_VVRD +e73f vstl VRS_VRRD +e740 vleib VRI_V0IU +e741 vleih VRI_V0IU +e742 vleig VRI_V0IU +e743 vleif VRI_V0IU +e744 vgbm VRI_V0U +e745 vrepi VRI_V0IU +e746 vgm VRI_V0UUU +e74a vftci VRI_VVUUU +e74d vrep VRI_VVUU +e750 vpopct VRR_VV0U +e752 vctz VRR_VV0U +e753 vclz VRR_VV0U +e756 vlr VRX_VV +e75c vistr VRR_VV0U0U +e75f vseg VRR_VV0U +e760 vmrl VRR_VVV0U +e761 vmrh VRR_VVV0U +e762 vlvgp VRR_VRR +e764 vsum VRR_VVV0U +e765 vsumg VRR_VVV0U +e766 vcksm VRR_VVV +e767 vsumq VRR_VVV0U +e768 vn VRR_VVV +e769 vnc VRR_VVV +e76a vo VRR_VVV +e76b vno VRR_VVV +e76c vnx VRR_VVV +e76d vx VRR_VVV +e76e vnn VRR_VVV +e76f voc VRR_VVV +e770 veslv VRR_VVV0U +e772 verim VRI_VVV0UU +e773 verllv VRR_VVV0U +e774 vsl VRR_VVV +e775 vslb VRR_VVV +e777 vsldb VRI_VVV0U +e778 vesrlv VRR_VVV0U +e77a vesrav VRR_VVV0U +e77c vsrl VRR_VVV +e77d vsrlb VRR_VVV +e77e vsra VRR_VVV +e77f vsrab VRR_VVV +e780 vfee VRR_VVV0U0U +e781 vfene VRR_VVV0U0U +e782 vfae VRR_VVV0U0U +e784 vpdi VRR_VVV0U +e785 vbperm VRR_VVV +e78a vstrc VRR_VVVUU0V +e78c vperm VRR_VVV0V +e78d vsel VRR_VVV0V +e78e vfms VRR_VVVU0UV +e78f vfma VRR_VVVU0UV +e794 vpk VRR_VVV0U +e795 vpkls VRR_VVV0U0U +e797 vpks VRR_VVV0U0U +e79e vfnms VRR_VVVU0UV +e79f vfnma VRR_VVVU0UV +e7a1 vmlh VRR_VVV0U +e7a2 vml VRR_VVV0U +e7a3 vmh VRR_VVV0U +e7a4 vmle VRR_VVV0U +e7a5 vmlo VRR_VVV0U +e7a6 vme VRR_VVV0U +e7a7 vmo VRR_VVV0U +e7a9 vmalh VRR_VVVU0V +e7aa vmal VRR_VVVU0V +e7ab vmah VRR_VVVU0V +e7ac vmale VRR_VVVU0V +e7ad vmalo VRR_VVVU0V +e7ae vmae VRR_VVVU0V +e7af vmao VRR_VVVU0V +e7b4 vgfm VRR_VVV0U +e7b8 vmsl VRR_VVVUU0V +e7b9 vaccc VRR_VVVU0V +e7bb vac VRR_VVVU0V +e7bc vgfma VRR_VVVU0V +e7bd vsbcbi VRR_VVVU0V +e7bf vsbi VRR_VVVU0V +e7c0 vclgd VRR_VV0UUU +e7c1 vcdlg VRR_VV0UUU +e7c2 vcgd VRR_VV0UUU +e7c3 vcdg VRR_VV0UUU +e7c4 vlde VRR_VV0UU2 +e7c5 vled VRR_VV0UUU +e7c7 vfi VRR_VV0UUU +e7ca wfk VRR_VV0UU2 +e7cb wfc VRR_VV0UU2 +e7cc vfpso VRR_VV0UUU +e7ce vfsq VRR_VV0UU2 +e7d4 vupll VRR_VV0U +e7d5 vuplh VRR_VV0U +e7d6 vupl VRR_VV0U +e7d7 vuph VRR_VV0U +e7d8 vtm VRR_VV +e7d9 vecl VRR_VV0U +e7db vec VRR_VV0U +e7de vlc VRR_VV0U +e7df vlp VRR_VV0U +e7e2 vfs VRR_VVV0UU +e7e3 vfa VRR_VVV0UU +e7e5 vfd VRR_VVV0UU +e7e7 vfm VRR_VVV0UU +e7e8 vfce VRR_VVV0UUU +e7ea vfche VRR_VVV0UUU +e7eb vfch VRR_VVV0UUU +e7ee vfmin VRR_VVV0UUU +e7ef vfmax VRR_VVV0UUU +e7f0 vavgl VRR_VVV0U +e7f1 vacc VRR_VVV0U +e7f2 vavg VRR_VVV0U +e7f3 va VRR_VVV0U +e7f5 vscbi VRR_VVV0U +e7f7 vs VRR_VVV0U +e7f8 vceq VRR_VVV0U0U +e7f9 vchl VRR_VVV0U0U +e7fb vch VRR_VVV0U0U +e7fc vmnl VRR_VVV0U +e7fd vmxl VRR_VVV0U +e7fe vmn VRR_VVV0U +e7ff vmx VRR_VVV0U +e8 mvcin SS_L0RDRD +e9 pka SS_L2RDRD +ea unpka SS_L0RDRD +eb04 lmg RSY_RRRD +eb0a srag RSY_RRRD +eb0b slag RSY_RRRD +eb0c srlg RSY_RRRD +eb0d sllg RSY_RRRD +eb0f tracg RSY_RRRD +eb14 csy RSY_RRRD +eb17 stcctm RSY_RURD 
+eb1c rllg RSY_RRRD +eb1d rll RSY_RRRD +eb20 clmh RSY_RURD +eb21 clmy RSY_RURD +eb23 clt RSY_RURD +eb24 stmg RSY_RRRD +eb25 stctg RSY_CCRD +eb26 stmh RSY_RRRD +eb2b clgt RSY_RURD +eb2c stcmh RSY_RURD +eb2d stcmy RSY_RURD +eb2f lctlg RSY_CCRD +eb30 csg RSY_RRRD +eb31 cdsy RSY_RRRD +eb3e cdsg RSY_RRRD +eb44 bxhg RSY_RRRD +eb45 bxleg RSY_RRRD +eb4c ecag RSY_RRRD +eb51 tmy SIY_URD +eb52 mviy SIY_URD +eb54 niy SIY_URD +eb55 cliy SIY_URD +eb56 oiy SIY_URD +eb57 xiy SIY_URD +eb60 lric RSY_RDRU +eb61 stric RSY_RDRU +eb62 mric RSY_RDRU +eb6a asi SIY_IRD +eb6e alsi SIY_IRD +eb7a agsi SIY_IRD +eb7e algsi SIY_IRD +eb80 icmh RSY_RURD +eb81 icmy RSY_RURD +eb8e mvclu RSY_RRRD +eb8f clclu RSY_RRRD +eb90 stmy RSY_RRRD +eb96 lmh RSY_RRRD +eb98 lmy RSY_RRRD +eb9a lamy RSY_AARD +eb9b stamy RSY_AARD +ebc0 tp RSL_R0RD +ebd0 pcistb RSY_RRRD +ebd1 sic RSY_RRRD +ebdc srak RSY_RRRD +ebdd slak RSY_RRRD +ebde srlk RSY_RRRD +ebdf sllk RSY_RRRD +ebe0 locfh RSY_RURD2 +ebe1 stocfh RSY_RURD2 +ebe2 locg RSY_RURD2 +ebe3 stocg RSY_RURD2 +ebe4 lang RSY_RRRD +ebe6 laog RSY_RRRD +ebe7 laxg RSY_RRRD +ebe8 laag RSY_RRRD +ebea laalg RSY_RRRD +ebf2 loc RSY_RURD2 +ebf3 stoc RSY_RURD2 +ebf4 lan RSY_RRRD +ebf6 lao RSY_RRRD +ebf7 lax RSY_RRRD +ebf8 laa RSY_RRRD +ebfa laal RSY_RRRD +ec42 lochi RIE_RUI0 +ec44 brxhg RIE_RRP +ec45 brxlg RIE_RRP +ec46 locghi RIE_RUI0 +ec4e lochhi RIE_RUI0 +ec51 risblg RIE_RRUUU +ec54 rnsbg RIE_RRUUU +ec55 risbg RIE_RRUUU +ec56 rosbg RIE_RRUUU +ec57 rxsbg RIE_RRUUU +ec59 risbgn RIE_RRUUU +ec5d risbhg RIE_RRUUU +ec64 cgrj RIE_RRPU +ec65 clgrj RIE_RRPU +ec70 cgit RIE_R0IU +ec71 clgit RIE_R0UU +ec72 cit RIE_R0IU +ec73 clfit RIE_R0UU +ec76 crj RIE_RRPU +ec77 clrj RIE_RRPU +ec7c cgij RIE_RUPI +ec7d clgij RIE_RUPU +ec7e cij RIE_RUPI +ec7f clij RIE_RUPU +ecd8 ahik RIE_RRI0 +ecd9 aghik RIE_RRI0 +ecda alhsik RIE_RRI0 +ecdb alghsik RIE_RRI0 +ece4 cgrb RRS_RRRDU +ece5 clgrb RRS_RRRDU +ecf6 crb RRS_RRRDU +ecf7 clrb RRS_RRRDU +ecfc cgib RIS_RURDI +ecfd clgib RIS_RURDU +ecfe cib RIS_RURDI +ecff clib RIS_RURDU +ed04 ldeb RXE_FRRD +ed05 lxdb RXE_FRRD +ed06 lxeb RXE_FRRD +ed07 mxdb RXE_FRRD +ed08 keb RXE_FRRD +ed09 ceb RXE_FRRD +ed0a aeb RXE_FRRD +ed0b seb RXE_FRRD +ed0c mdeb RXE_FRRD +ed0d deb RXE_FRRD +ed0e maeb RXF_FRRDF +ed0f mseb RXF_FRRDF +ed10 tceb RXE_FRRD +ed11 tcdb RXE_FRRD +ed12 tcxb RXE_FRRD +ed14 sqeb RXE_FRRD +ed15 sqdb RXE_FRRD +ed17 meeb RXE_FRRD +ed18 kdb RXE_FRRD +ed19 cdb RXE_FRRD +ed1a adb RXE_FRRD +ed1b sdb RXE_FRRD +ed1c mdb RXE_FRRD +ed1d ddb RXE_FRRD +ed1e madb RXF_FRRDF +ed1f msdb RXF_FRRDF +ed24 lde RXE_FRRD +ed25 lxd RXE_FRRD +ed26 lxe RXE_FRRD +ed2e mae RXF_FRRDF +ed2f mse RXF_FRRDF +ed34 sqe RXE_FRRD +ed35 sqd RXE_FRRD +ed37 mee RXE_FRRD +ed38 mayl RXF_FRRDF +ed39 myl RXF_FRRDF +ed3a may RXF_FRRDF +ed3b my RXF_FRRDF +ed3c mayh RXF_FRRDF +ed3d myh RXF_FRRDF +ed3e mad RXF_FRRDF +ed3f msd RXF_FRRDF +ed40 sldt RXF_FRRDF +ed41 srdt RXF_FRRDF +ed48 slxt RXF_FRRDF +ed49 srxt RXF_FRRDF +ed50 tdcet RXE_FRRD +ed51 tdget RXE_FRRD +ed54 tdcdt RXE_FRRD +ed55 tdgdt RXE_FRRD +ed58 tdcxt RXE_FRRD +ed59 tdgxt RXE_FRRD +ed64 ley RXY_FRRD +ed65 ldy RXY_FRRD +ed66 stey RXY_FRRD +ed67 stdy RXY_FRRD +eda8 czdt RSL_LRDFU +eda9 czxt RSL_LRDFU +edaa cdzt RSL_LRDFU +edab cxzt RSL_LRDFU +edac cpdt RSL_LRDFU +edad cpxt RSL_LRDFU +edae cdpt RSL_LRDFU +edaf cxpt RSL_LRDFU +ee plo SS_RRRDRD2 +ef lmd SS_RRRDRD3 +f0 srp SS_LIRDRD +f1 mvo SS_LLRDRD +f2 pack SS_LLRDRD +f3 unpk SS_LLRDRD +f8 zap SS_LLRDRD +f9 cp SS_LLRDRD +fa ap SS_LLRDRD +fb sp SS_LLRDRD +fc mp SS_LLRDRD +fd dp SS_LLRDRD From 
ead7a22e9b6eff225afb127f8835a1d3da271a89 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 8 Nov 2017 11:18:29 +0100 Subject: [PATCH 1042/1085] s390: avoid undefined behaviour At a couple of places smatch emits warnings like this: arch/s390/mm/vmem.c:409 vmem_map_init() warn: right shifting more than type allows In fact, right shifting a signed value is implementation-defined when the value is negative, and shifting by more than the type's width is undefined. Avoid the warning by adding an unsigned long cast. The shifted values are always positive anyway. Signed-off-by: Heiko Carstens --- arch/s390/mm/init.c | 4 ++-- arch/s390/mm/vmem.c | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3b567838b905..a26fb8ee4a6b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -144,8 +144,8 @@ void __init mem_init(void) void free_initmem(void) { - __set_memory((unsigned long) _sinittext, - (_einittext - _sinittext) >> PAGE_SHIFT, + __set_memory((unsigned long)_sinittext, + (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RW | SET_MEMORY_NX); free_initmem_default(POISON_FREE_INITMEM); } diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index f890f2ad951b..24671beb2def 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -402,17 +402,17 @@ void __init vmem_map_init(void) for_each_memblock(memory, reg) vmem_add_mem(reg->base, reg->size); - __set_memory((unsigned long) _stext, - (_etext - _stext) >> PAGE_SHIFT, + __set_memory((unsigned long)_stext, + (unsigned long)(_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - __set_memory((unsigned long) _etext, - (__end_rodata - _etext) >> PAGE_SHIFT, + __set_memory((unsigned long)_etext, + (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT, SET_MEMORY_RO); - __set_memory((unsigned long) _sinittext, - (_einittext - _sinittext) >> PAGE_SHIFT, + __set_memory((unsigned long)_sinittext, + (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); pr_info("Write protected kernel read-only data: %luk\n", - (__end_rodata - _stext) >> 10); + (unsigned long)(__end_rodata - _stext) >> 10); } /* From 8e5f4ba0cd5ed3879d484472657122742a749e9c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 3 Sep 2017 13:23:32 -0700 Subject: [PATCH 1043/1085] scsi: qla2xxx: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: qla2xxx-upstream@qlogic.com Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Martin K.
Petersen Tested-by: Bart Van Assche --- drivers/scsi/qla2xxx/qla_gbl.h | 6 +++--- drivers/scsi/qla2xxx/qla_init.c | 4 ++-- drivers/scsi/qla2xxx/qla_inline.h | 4 +--- drivers/scsi/qla2xxx/qla_mid.c | 2 +- drivers/scsi/qla2xxx/qla_os.c | 11 +++++------ 5 files changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index f852ca60c49f..3ad375f85b59 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -206,8 +206,8 @@ int qla24xx_async_abort_cmd(srb_t *); */ extern struct scsi_host_template qla2xxx_driver_template; extern struct scsi_transport_template *qla2xxx_transport_vport_template; -extern void qla2x00_timer(scsi_qla_host_t *); -extern void qla2x00_start_timer(scsi_qla_host_t *, void *, unsigned long); +extern void qla2x00_timer(struct timer_list *); +extern void qla2x00_start_timer(scsi_qla_host_t *, unsigned long); extern void qla24xx_deallocate_vp_id(scsi_qla_host_t *); extern int qla24xx_disable_vp (scsi_qla_host_t *); extern int qla24xx_enable_vp (scsi_qla_host_t *); @@ -753,7 +753,7 @@ extern int qla82xx_restart_isp(scsi_qla_host_t *); /* IOCB related functions */ extern int qla82xx_start_scsi(srb_t *); extern void qla2x00_sp_free(void *); -extern void qla2x00_sp_timeout(unsigned long); +extern void qla2x00_sp_timeout(struct timer_list *); extern void qla2x00_bsg_job_done(void *, int); extern void qla2x00_bsg_sp_free(void *); extern void qla2x00_start_iocbs(struct scsi_qla_host *, struct req_que *); diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index b5b48ddca962..44cf875a484a 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -45,9 +45,9 @@ static void qla24xx_handle_prli_done_event(struct scsi_qla_host *, /* SRB Extensions ---------------------------------------------------------- */ void -qla2x00_sp_timeout(unsigned long __data) +qla2x00_sp_timeout(struct timer_list *t) { - srb_t *sp = (srb_t *)__data; + srb_t *sp = from_timer(sp, t, u.iocb_cmd.timer); struct srb_iocb *iocb; scsi_qla_host_t *vha = sp->vha; struct req_que *req; diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h index 9a2c86eacf44..17d2c20f1f75 100644 --- a/drivers/scsi/qla2xxx/qla_inline.h +++ b/drivers/scsi/qla2xxx/qla_inline.h @@ -269,10 +269,8 @@ qla2x00_rel_sp(srb_t *sp) static inline void qla2x00_init_timer(srb_t *sp, unsigned long tmo) { - init_timer(&sp->u.iocb_cmd.timer); + timer_setup(&sp->u.iocb_cmd.timer, qla2x00_sp_timeout, 0); sp->u.iocb_cmd.timer.expires = jiffies + tmo * HZ; - sp->u.iocb_cmd.timer.data = (unsigned long)sp; - sp->u.iocb_cmd.timer.function = qla2x00_sp_timeout; add_timer(&sp->u.iocb_cmd.timer); sp->free = qla2x00_sp_free; if (IS_QLAFX00(sp->vha->hw) && (sp->type == SRB_FXIOCB_DCMD)) diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index c0f8f6c17b79..cbf544dbf883 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -487,7 +487,7 @@ qla24xx_create_vhost(struct fc_vport *fc_vport) atomic_set(&vha->loop_state, LOOP_DOWN); atomic_set(&vha->loop_down_timer, LOOP_DOWN_TIME); - qla2x00_start_timer(vha, qla2x00_timer, WATCH_INTERVAL); + qla2x00_start_timer(vha, WATCH_INTERVAL); vha->req = base_vha->req; host->can_queue = base_vha->req->length + 128; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 5b2437a5ea44..c3b3daa797ff 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -330,12 
+330,10 @@ struct scsi_transport_template *qla2xxx_transport_vport_template = NULL; */ __inline__ void -qla2x00_start_timer(scsi_qla_host_t *vha, void *func, unsigned long interval) +qla2x00_start_timer(scsi_qla_host_t *vha, unsigned long interval) { - init_timer(&vha->timer); + timer_setup(&vha->timer, qla2x00_timer, 0); vha->timer.expires = jiffies + interval * HZ; - vha->timer.data = (unsigned long)vha; - vha->timer.function = (void (*)(unsigned long))func; add_timer(&vha->timer); vha->timer_active = 1; } @@ -3245,7 +3243,7 @@ skip_dpc: base_vha->host->irq = ha->pdev->irq; /* Initialized the timer */ - qla2x00_start_timer(base_vha, qla2x00_timer, WATCH_INTERVAL); + qla2x00_start_timer(base_vha, WATCH_INTERVAL); ql_dbg(ql_dbg_init, base_vha, 0x00ef, "Started qla2x00_timer with " "interval=%d.\n", WATCH_INTERVAL); @@ -5994,8 +5992,9 @@ qla2x00_rst_aen(scsi_qla_host_t *vha) * Context: Interrupt ***************************************************************************/ void -qla2x00_timer(scsi_qla_host_t *vha) +qla2x00_timer(struct timer_list *t) { + scsi_qla_host_t *vha = from_timer(vha, t, timer); unsigned long cpu_flags = 0; int start_dpc = 0; int index; From 8ef81c65485b94c59b8ae43ed7d1d75c3562a835 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Oct 2017 22:24:18 -0700 Subject: [PATCH 1044/1085] netfilter: ipvs: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Wensong Zhang Cc: Simon Horman Cc: Julian Anastasov Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: "David S. Miller" Cc: netdev@vger.kernel.org Cc: lvs-devel@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Signed-off-by: Kees Cook Acked-by: Julian Anastasov Acked-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 10 +++++----- net/netfilter/ipvs/ip_vs_ctl.c | 7 +++---- net/netfilter/ipvs/ip_vs_est.c | 6 +++--- net/netfilter/ipvs/ip_vs_lblc.c | 11 ++++++----- net/netfilter/ipvs/ip_vs_lblcr.c | 11 ++++++----- 5 files changed, 23 insertions(+), 22 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 3d2ac71a83ec..3a43b3470331 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -104,7 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key) spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static void ip_vs_conn_expire(unsigned long data); +static void ip_vs_conn_expire(struct timer_list *t); /* * Returns hash value for IPVS connection entry @@ -457,7 +457,7 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) { __ip_vs_conn_put(cp); - ip_vs_conn_expire((unsigned long)cp); + ip_vs_conn_expire(&cp->timer); } /* @@ -817,9 +817,9 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) kmem_cache_free(ip_vs_conn_cachep, cp); } -static void ip_vs_conn_expire(unsigned long data) +static void ip_vs_conn_expire(struct timer_list *t) { - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct ip_vs_conn *cp = from_timer(cp, t, timer); struct netns_ipvs *ipvs = cp->ipvs; /* @@ -909,7 +909,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, } INIT_HLIST_NODE(&cp->c_list); - setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + timer_setup(&cp->timer, ip_vs_conn_expire, 0); cp->ipvs = ipvs; cp->af = p->af; 
cp->daf = dest_af; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4f940d7eb2f7..b47e266c6eca 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1146,9 +1146,9 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return 0; } -static void ip_vs_dest_trash_expire(unsigned long data) +static void ip_vs_dest_trash_expire(struct timer_list *t) { - struct netns_ipvs *ipvs = (struct netns_ipvs *)data; + struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; @@ -4019,8 +4019,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); - setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, - (unsigned long) ipvs); + timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 457c6c193e13..489055091a9b 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -97,12 +97,12 @@ static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, } -static void estimation_timer(unsigned long arg) +static void estimation_timer(struct timer_list *t) { struct ip_vs_estimator *e; struct ip_vs_stats *s; u64 rate; - struct netns_ipvs *ipvs = (struct netns_ipvs *)arg; + struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { @@ -192,7 +192,7 @@ int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->est_list); spin_lock_init(&ipvs->est_lock); - setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)ipvs); + timer_setup(&ipvs->est_timer, estimation_timer, 0); mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index b6aa4a970c6e..d625179de485 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -106,6 +106,7 @@ struct ip_vs_lblc_table { struct rcu_head rcu_head; struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ struct timer_list periodic_timer; /* collect stale entries */ + struct ip_vs_service *svc; /* pointer back to service */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ int rover; /* rover for expire check */ @@ -294,10 +295,10 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) * of the table. * The full expiration check is for this purpose now. 
*/ -static void ip_vs_lblc_check_expire(unsigned long data) +static void ip_vs_lblc_check_expire(struct timer_list *t) { - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; int i, j; @@ -369,12 +370,12 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) tbl->rover = 0; tbl->counter = 1; tbl->dead = 0; + tbl->svc = svc; /* * Hook periodic timer for garbage collection */ - setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)svc); + timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0); mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index c13ff575f9f7..84c57b62a588 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -278,6 +278,7 @@ struct ip_vs_lblcr_table { atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ + struct ip_vs_service *svc; /* pointer back to service */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ bool dead; @@ -458,10 +459,10 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) * of the table. * The full expiration check is for this purpose now. */ -static void ip_vs_lblcr_check_expire(unsigned long data) +static void ip_vs_lblcr_check_expire(struct timer_list *t) { - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; int i, j; @@ -532,12 +533,12 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) tbl->rover = 0; tbl->counter = 1; tbl->dead = 0; + tbl->svc = svc; /* * Hook periodic timer for garbage collection */ - setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)svc); + timer_setup(&tbl->periodic_timer, ip_vs_lblcr_check_expire, 0); mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; From 3653bc95bcc7daa938c0fdcd64ff199ed8f7f208 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Oct 2017 21:52:09 -0700 Subject: [PATCH 1045/1085] timer: Prepare to change all DEFINE_TIMER() callbacks Before we can globally change the function prototype of all timer callbacks, we have to change those set up by DEFINE_TIMER(). Prepare for this by casting the callbacks until the prototype changes globally. 
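In miniature, the transitional state this creates looks like the sketch below (the timer and callback names are illustrative, not taken from the patch): a callback can already be written against the new prototype while struct timer_list still stores the old function-pointer type, because DEFINE_TIMER() casts it through TIMER_FUNC_TYPE for now.

/* callback already converted to the prototype the series moves towards */
static void my_timeout(struct timer_list *unused)
{
	/* ... react to the timer firing ... */
}

/* compiles today because the initializer applies the (TIMER_FUNC_TYPE)
 * cast, i.e. void (*)(unsigned long), to the callback */
static DEFINE_TIMER(my_timer, my_timeout);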
Cc: Thomas Gleixner Signed-off-by: Kees Cook --- include/linux/timer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index a1af92bac0d5..9f8895decb82 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -63,6 +63,9 @@ struct timer_list { #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + #define __TIMER_INITIALIZER(_function, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ @@ -74,7 +77,7 @@ struct timer_list { #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ - __TIMER_INITIALIZER(_function, 0, 0) + __TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); @@ -147,9 +150,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) -#define TIMER_DATA_TYPE unsigned long -#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) - #ifndef CONFIG_LOCKDEP static inline void timer_setup(struct timer_list *timer, void (*callback)(struct timer_list *), From 7980f029d05d8a3b4634aa6952e1ec51bce9431f Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 9 Nov 2017 14:09:11 +0800 Subject: [PATCH 1046/1085] x86/build: Make the boot image generation less verbose This change suppresses the 'dd' output and adds the '-quiet' parameter to mkisofs tool. It also removes the 'Using ...' messages, as none of the messages matter to the user normally. "make V=1" can still be used for a more verbose build. The new build messages are now a streamlined set of: $ make isoimage ... Kernel: arch/x86/boot/bzImage is ready (#75) GENIMAGE arch/x86/boot/image.iso Kernel: arch/x86/boot/image.iso is ready Signed-off-by: Changbin Du Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1510207751-22166-1-git-send-email-changbin.du@intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/genimage.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 628e93662093..49f4970f693b 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -19,6 +19,13 @@ # $6 - inird image file # +# Use "make V=1" to debug this script +case "${KBUILD_VERBOSE}" in +*1*) + set -x + ;; +esac + verify () { if [ ! 
-f "$1" ]; then echo "" 1>&2 @@ -50,7 +57,7 @@ genbzdisk() { } genfdimage144() { - dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 + dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null mformat v: syslinux $FIMAGE echo "$KCMDLINE" | mcopy - v:syslinux.cfg @@ -61,7 +68,7 @@ genfdimage144() { } genfdimage288() { - dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 + dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null mformat w: syslinux $FIMAGE echo "$KCMDLINE" | mcopy - W:syslinux.cfg @@ -79,14 +86,12 @@ genisoimage() { for j in syslinux ISOLINUX ; do if [ -f /usr/$i/$j/isolinux.bin ] ; then isolinux=/usr/$i/$j/isolinux.bin - echo "Using $isolinux" cp $isolinux $tmp_dir fi done for j in syslinux syslinux/modules/bios ; do if [ -f /usr/$i/$j/ldlinux.c32 ]; then ldlinux=/usr/$i/$j/ldlinux.c32 - echo "Using $ldlinux" cp $ldlinux $tmp_dir fi done @@ -103,7 +108,7 @@ genisoimage() { if [ -f "$FDINITRD" ] ; then cp "$FDINITRD" $tmp_dir/initrd.img fi - mkisofs -J -r -input-charset=utf-8 -o $FIMAGE -b isolinux.bin \ + mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \ -c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \ $tmp_dir isohybrid $FIMAGE 2>/dev/null || true From 765cc3a4b224e22bf524fabe40284a524f37cdd0 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 8 Nov 2017 18:41:01 +0000 Subject: [PATCH 1047/1085] sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds When the kernel is compiled with !CONFIG_SCHED_DEBUG support, we expect that all SCHED_FEAT are turned into compile time constants being propagated to support compiler optimizations. Specifically, we expect that code blocks like this: if (sched_feat(FEATURE_NAME) [&& ]) { /* FEATURE CODE */ } are turned into dead-code in case FEATURE_NAME defaults to FALSE, and thus being removed by the compiler from the finale image. For this mechanism to properly work it's required for the compiler to have full access, from each translation unit, to whatever is the value defined by the sched_feat macro. This macro is defined as: #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) and thus, the compiler can optimize that code only if the value of sysctl_sched_features is visible within each translation unit. Since: 029632fbb ("sched: Make separate sched*.c translation units") the scheduler code has been split into separate translation units however the definition of sysctl_sched_features is part of kernel/sched/core.c while, for all the other scheduler modules, it is visible only via kernel/sched/sched.h as an: extern const_debug unsigned int sysctl_sched_features Unfortunately, an extern reference does not allow the compiler to apply constants propagation. Thus, on !CONFIG_SCHED_DEBUG kernel we still end up with code to load a memory reference and (eventually) doing an unconditional jump of a chunk of code. This mechanism is unavoidable when sched_features can be turned on and off at run-time. However, this is not the case for "production" kernels compiled with !CONFIG_SCHED_DEBUG. In this case, sysctl_sched_features is just a constant value which cannot be changed at run-time and thus memory loads and jumps can be avoided altogether. This patch fixes the case of !CONFIG_SCHED_DEBUG kernel by declaring a local version of the sysctl_sched_features constant for each translation unit. This will ultimately allow the compiler to perform constants propagation and dead-code pruning. 
Tests have been done, with !CONFIG_SCHED_DEBUG on a v4.14-rc8 with and without the patch, by running 30 iterations of: perf bench sched messaging --pipe --thread --group 4 --loop 50000 on a 40-core Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz using the powersave governor to rule out variations due to frequency scaling. Statistics on the reported completion time:

                 count  mean     std       min     99%       max
 v4.14-rc8        30.0  15.7831  0.176032  15.442  16.01226  16.014
 v4.14-rc8+patch  30.0  15.5033  0.189681  15.232  15.93938  15.962

... show a 1.8% speedup on average completion time and 0.5% speedup in the 99th percentile. Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath Reviewed-by: Dietmar Eggemann Reviewed-by: Brendan Jackman Acked-by: Peter Zijlstra Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20171108184101.16006-1-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++--- kernel/sched/sched.h | 25 ++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a55c842bfbc..a39a08113003 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -43,18 +43,21 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) /* * Debugging: various feature bits + * + * If SCHED_DEBUG is disabled, each compilation unit has its own copy of + * sysctl_sched_features, defined in sched.h, to allow constants propagation + * at compile time and compiler optimization based on features default. */ - #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | - const_debug unsigned int sysctl_sched_features = #include "features.h" 0; - #undef SCHED_FEAT +#endif /* * Number of tasks to iterate in a single balance run. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 58787e3631c7..45ab0bf564e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1233,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) # define const_debug const #endif -extern const_debug unsigned int sysctl_sched_features; - #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -1246,6 +1244,13 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) + +/* + * To support run-time toggling of sched features, all the translation units + * (but core.c) reference the sysctl_sched_features defined in core.c. + */ +extern const_debug unsigned int sysctl_sched_features; + #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1253,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ } #include "features.h" - #undef SCHED_FEAT extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) + #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ + +/* + * Each translation unit has its own copy of sysctl_sched_features to allow + * constants propagation at compile time and compiler optimization based on + * features default.
+ */ +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | +static const_debug __maybe_unused unsigned int sysctl_sched_features = +#include "features.h" + 0; +#undef SCHED_FEAT + #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; From 1f19aee0ec404112cec08f0c852fcd291690fbc7 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Thu, 9 Nov 2017 11:02:44 +0000 Subject: [PATCH 1048/1085] irqchip/mips-gic: Add pr_fmt and reword pr_* messages Several messages from the MIPS GIC driver include the text "GIC", but the format is not standard. Add a pr_fmt of "irq-mips-gic: " and reword the messages now that they will be prefixed with the driver name. Signed-off-by: Matt Redfearn Reviewed-by: Paul Burton Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 3ccebb020f40..9b768899f07b 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -6,6 +6,9 @@ * Copyright (C) 2008 Ralf Baechle (ralf@linux-mips.org) * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. */ + +#define pr_fmt(fmt) "irq-mips-gic: " fmt + #include #include #include @@ -685,7 +688,7 @@ static int __init gic_of_init(struct device_node *node, cpu_vec = find_first_zero_bit(&reserved, hweight_long(ST0_IM)); if (cpu_vec == hweight_long(ST0_IM)) { - pr_err("No CPU vectors available for GIC\n"); + pr_err("No CPU vectors available\n"); return -ENODEV; } @@ -699,7 +702,7 @@ static int __init gic_of_init(struct device_node *node, ~CM_GCR_GIC_BASE_GICEN; gic_len = 0x20000; } else { - pr_err("Failed to get GIC memory range\n"); + pr_err("Failed to get memory range\n"); return -ENODEV; } } else { @@ -757,7 +760,7 @@ static int __init gic_of_init(struct device_node *node, gic_shared_intrs, 0, &gic_irq_domain_ops, NULL); if (!gic_irq_domain) { - pr_err("Failed to add GIC IRQ domain"); + pr_err("Failed to add IRQ domain"); return -ENXIO; } @@ -766,7 +769,7 @@ static int __init gic_of_init(struct device_node *node, GIC_NUM_LOCAL_INTRS + gic_shared_intrs, node, &gic_ipi_domain_ops, NULL); if (!gic_ipi_domain) { - pr_err("Failed to add GIC IPI domain"); + pr_err("Failed to add IPI domain"); return -ENXIO; } From 666740fde412567aa0a8ea251ffee3004a6fa3a6 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Thu, 9 Nov 2017 11:02:45 +0000 Subject: [PATCH 1049/1085] irqchip: mips-gic: Print warning if inherited GIC base is used If the physical address of the GIC resource cannot be read from device tree, then the code falls back to reading it from the gcr_gic_base register. Hopefully this has been set to a sane value by the bootloader or some platform code, but is defined by the hardware manual to have "undefined" reset state. Using it as the address at which the GIC will be mapped into physical memory space can therefore be risky if it has not been initialised, since it may result in the GIC being mapped to an effectively random address anywhere in physical memory, where it might conflict with peripherals or RAM and lead to weird crashes. Since a "sane value" is very platform specific because it is particular to the platform's memory map, it is difficult to test for. At the very least, a warning message should be printed in the case that we trust the inherited value. 
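One detail of the warning added below: the %pa format specifier prints a phys_addr_t passed by reference, so the message stays correct whether physical addresses are 32 or 64 bits wide on the platform. A minimal sketch of the pattern (the variable's type is spelled out here for illustration):

	phys_addr_t gic_base;

	gic_base = read_gcr_gic_base() & ~CM_GCR_GIC_BASE_GICEN;
	/* note: %pa takes a pointer to the value, not the value itself */
	pr_warn("Using inherited base address %pa\n", &gic_base);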
Reported-by: Amit Kama Signed-off-by: Matt Redfearn Reviewed-by: Paul Burton Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 9b768899f07b..ef92a4d2038e 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -701,6 +701,8 @@ static int __init gic_of_init(struct device_node *node, gic_base = read_gcr_gic_base() & ~CM_GCR_GIC_BASE_GICEN; gic_len = 0x20000; + pr_warn("Using inherited base address %pa\n", + &gic_base); } else { pr_err("Failed to get memory range\n"); return -ENODEV; From a401917bc3e2d251ce5210527d8de0b0b83f3b44 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 9 Nov 2017 08:57:47 +0100 Subject: [PATCH 1050/1085] s390/virtio: remove unused header file kvm_virtio.h With commit 7fb2b2d51244 ("s390/virtio: remove the old KVM virtio transport") the pre-ccw virtio transport for s390 was removed. To complete the removal the uapi header file that contains the related data structures must also be removed. Signed-off-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/include/uapi/asm/kvm_virtio.h | 64 ------------------------- arch/s390/kernel/setup.c | 1 - 2 files changed, 65 deletions(-) delete mode 100644 arch/s390/include/uapi/asm/kvm_virtio.h diff --git a/arch/s390/include/uapi/asm/kvm_virtio.h b/arch/s390/include/uapi/asm/kvm_virtio.h deleted file mode 100644 index 44a438ca9e72..000000000000 --- a/arch/s390/include/uapi/asm/kvm_virtio.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * definition for virtio for kvm on s390 - * - * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - * - * Author(s): Christian Borntraeger - */ - -#ifndef __KVM_S390_VIRTIO_H -#define __KVM_S390_VIRTIO_H - -#include - -struct kvm_device_desc { - /* The device type: console, network, disk etc. Type 0 terminates. */ - __u8 type; - /* The number of virtqueues (first in config array) */ - __u8 num_vq; - /* - * The number of bytes of feature bits. Multiply by 2: one for host - * features and one for guest acknowledgements. - */ - __u8 feature_len; - /* The number of bytes of the config array after virtqueues. */ - __u8 config_len; - /* A status byte, written by the Guest. */ - __u8 status; - __u8 config[0]; -}; - -/* - * This is how we expect the device configuration field for a virtqueue - * to be laid out in config space. - */ -struct kvm_vqconfig { - /* The token returned with an interrupt. Set by the guest */ - __u64 token; - /* The address of the virtio ring */ - __u64 address; - /* The number of entries in the virtio_ring */ - __u16 num; - -}; - -#define KVM_S390_VIRTIO_NOTIFY 0 -#define KVM_S390_VIRTIO_RESET 1 -#define KVM_S390_VIRTIO_SET_STATUS 2 - -/* The alignment to use between consumer and producer parts of vring. - * This is pagesize for historical reasons. 
*/ -#define KVM_S390_VIRTIO_RING_ALIGN 4096 - - -/* These values are supposed to be in ext_params on an interrupt */ -#define VIRTIO_PARAM_MASK 0xff -#define VIRTIO_PARAM_VRING_INTERRUPT 0x0 -#define VIRTIO_PARAM_CONFIG_CHANGED 0x1 -#define VIRTIO_PARAM_DEV_ADD 0x2 - -#endif diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b0943ef8cc31..b5baaef11f9b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include From 56c5c6834e330caca7584445f4dc103515eb7175 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Mon, 30 Oct 2017 12:10:54 +0100 Subject: [PATCH 1051/1085] s390/zcrypt: Rework struct ap_qact_ap_info. The ap_qact_ap_info struct is easier to handle when its fields can be accessed by name while the struct as a whole can still be accessed as a single unsigned long value. This patch reworks the struct into a union and adapts the code using it accordingly. Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- drivers/s390/crypto/ap_asm.h | 40 ++++++++++++++++++------------------ drivers/s390/crypto/ap_bus.c | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/s390/crypto/ap_asm.h b/drivers/s390/crypto/ap_asm.h index 0c0c02f26f5b..f29ce8a44d3b 100644 --- a/drivers/s390/crypto/ap_asm.h +++ b/drivers/s390/crypto/ap_asm.h @@ -117,45 +117,45 @@ static inline int ap_qci(void *config) } /* - * struct ap_qact_ap_info - used together with the + * union ap_qact_ap_info - used together with the * ap_aqic() function to provide a convenient way * to handle the ap info needed by the qact function. */ -struct ap_qact_ap_info { - unsigned int _res1 : 3; - unsigned int mode : 3; - unsigned int _res2 : 26; - unsigned int cat : 8; - unsigned int _res3 : 8; - unsigned char ver[2]; +union ap_qact_ap_info { + unsigned long val; + struct { + unsigned int : 3; + unsigned int mode : 3; + unsigned int : 26; + unsigned int cat : 8; + unsigned int : 8; + unsigned char ver[2]; + }; }; /** * ap_qact(): Query AP compatibility type. * @qid: The AP queue number - * @apinfo: On input the info about the AP queue (content of GR1 - * according to the AR). On output the alternate AP queue - * info provided by the qact function in GR2 is stored in. + * @apinfo: On input the info about the AP queue. On output the + * alternate AP queue info provided by the qact function + * in GR2 is stored in. * * Returns AP queue status. Check response_code field for failures.
*/ static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, - struct ap_qact_ap_info *apinfo) + union ap_qact_ap_info *apinfo) { register unsigned long reg0 asm ("0") = qid | (5UL << 24) | ((ifbit & 0x01) << 22); - register struct ap_qact_ap_info reg1_in asm ("1") = *apinfo; + register unsigned long reg1_in asm ("1") = apinfo->val; register struct ap_queue_status reg1_out asm ("1"); - register unsigned long reg2_in asm ("2") = 0; - register struct ap_qact_ap_info reg2_out asm ("2"); + register unsigned long reg2 asm ("2") = 0; asm volatile( ".long 0xb2af0000" /* PQAP(QACT) */ - : "+d" (reg0), "+d" (reg1_in), "=d" (reg1_out), - "+d" (reg2_in), "=d" (reg2_out) - : - : "cc"); - *apinfo = reg2_out; + : "+d" (reg0), "+d" (reg1_in), "=d" (reg1_out), "+d" (reg2) + : : "cc"); + apinfo->val = reg2; return reg1_out; } diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 0c1c48c476b7..8b5658b0bec3 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1021,7 +1021,7 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) */ if (ap_qact_available()) { struct ap_queue_status status; - struct ap_qact_ap_info apinfo = {0}; + union ap_qact_ap_info apinfo = {0}; apinfo.mode = (func >> 26) & 0x07; apinfo.cat = AP_DEVICE_TYPE_CEX6; From baaf9be8d05c63c5cca9729ff05e69c6afed4fc5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 9 Nov 2017 13:20:12 +0100 Subject: [PATCH 1052/1085] s390: simplify transactional execution elf hwcap handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just use MACHINE_HAS_TE to decide if HWCAP_S390_TE needs to be added to elf_hwcap. Suggested-by: Dan Horák Reviewed-by: Christian Borntraeger Reviewed-by: Hendrik Brueckner Signed-off-by: Heiko Carstens --- arch/s390/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b5baaef11f9b..090053cf279b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -759,7 +759,7 @@ static int __init setup_hwcaps(void) /* * Transactional execution support HWCAP_S390_TE is bit 10. */ - if (test_facility(50) && test_facility(73)) + if (MACHINE_HAS_TE) elf_hwcap |= HWCAP_S390_TE; /* From 0e37a23ebdadc0e34176b816770da6f4ac64043e Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Thu, 26 Oct 2017 22:45:24 -0400 Subject: [PATCH 1053/1085] m68k/mac: More printk modernization Log message fragments used to be printed on one line but now get split up. Fix this. Also, suppress log spam that merely prints known pointer values. 
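The split fragments stem from printk() treating every call without KERN_CONT as the start of a new message, so pairs of calls like the ones removed below

	printk(KERN_INFO "VIA2 at %p is ", via2);
	printk("an RBV\n");

no longer come out on a single line. The fix is to print complete messages with one call, and to demote the pointer-value chatter to pr_debug(), which produces output only when DEBUG or dynamic debug is enabled for the file.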
Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Geert Uytterhoeven --- arch/m68k/mac/baboon.c | 2 +- arch/m68k/mac/iop.c | 4 ++-- arch/m68k/mac/psc.c | 6 ++---- arch/m68k/mac/via.c | 18 +++++------------- 4 files changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/m68k/mac/baboon.c b/arch/m68k/mac/baboon.c index 514acde3cd40..0d154c909e17 100644 --- a/arch/m68k/mac/baboon.c +++ b/arch/m68k/mac/baboon.c @@ -36,7 +36,7 @@ void __init baboon_init(void) baboon = (struct baboon *) BABOON_BASE; baboon_present = 1; - printk("Baboon detected at %p\n", baboon); + pr_debug("Baboon detected at %p\n", baboon); } /* diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index 4c1e606e7d03..a2ea52db7d18 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -273,10 +273,10 @@ void __init iop_init(void) int i; if (iop_scc_present) { - pr_info("IOP: detected SCC IOP at %p\n", iop_base[IOP_NUM_SCC]); + pr_debug("SCC IOP detected at %p\n", iop_base[IOP_NUM_SCC]); } if (iop_ism_present) { - pr_info("IOP: detected ISM IOP at %p\n", iop_base[IOP_NUM_ISM]); + pr_debug("ISM IOP detected at %p\n", iop_base[IOP_NUM_ISM]); iop_start(iop_base[IOP_NUM_ISM]); iop_alive(iop_base[IOP_NUM_ISM]); /* clears the alive flag */ } diff --git a/arch/m68k/mac/psc.c b/arch/m68k/mac/psc.c index 439a2a2e5874..8d547df4e16c 100644 --- a/arch/m68k/mac/psc.c +++ b/arch/m68k/mac/psc.c @@ -42,7 +42,7 @@ static void psc_debug_dump(void) return; for (i = 0x30 ; i < 0x70 ; i += 0x10) { - printk("PSC #%d: IFR = 0x%02X IER = 0x%02X\n", + printk(KERN_DEBUG "PSC #%d: IFR = 0x%02X IER = 0x%02X\n", i >> 4, (int) psc_read_byte(pIFRbase + i), (int) psc_read_byte(pIERbase + i)); @@ -59,14 +59,12 @@ static __init void psc_dma_die_die_die(void) { int i; - printk("Killing all PSC DMA channels..."); for (i = 0 ; i < 9 ; i++) { psc_write_word(PSC_CTL_BASE + (i << 4), 0x8800); psc_write_word(PSC_CTL_BASE + (i << 4), 0x1000); psc_write_word(PSC_CMD_BASE + (i << 5), 0x1100); psc_write_word(PSC_CMD_BASE + (i << 5) + 0x10, 0x1100); } - printk("done!\n"); } /* @@ -92,7 +90,7 @@ void __init psc_init(void) psc = (void *) PSC_BASE; - printk("PSC detected at %p\n", psc); + pr_debug("PSC detected at %p\n", psc); psc_dma_die_die_die(); diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c index 16629e91feba..05021381af0b 100644 --- a/arch/m68k/mac/via.c +++ b/arch/m68k/mac/via.c @@ -121,18 +121,21 @@ void via_debug_dump(void); void __init via_init(void) { + via1 = (void *)VIA1_BASE; + pr_debug("VIA1 detected at %p\n", via1); + switch(macintosh_config->via_type) { /* IIci, IIsi, IIvx, IIvi (P6xx), LC series */ case MAC_VIA_IICI: - via1 = (void *) VIA1_BASE; if (macintosh_config->ident == MAC_MODEL_IIFX) { via2 = NULL; rbv_present = 0; oss_present = 1; } else { via2 = (void *) RBV_BASE; + pr_debug("VIA2 (RBV) detected at %p\n", via2); rbv_present = 1; oss_present = 0; } @@ -154,8 +157,8 @@ void __init via_init(void) case MAC_VIA_QUADRA: case MAC_VIA_II: - via1 = (void *) VIA1_BASE; via2 = (void *) VIA2_BASE; + pr_debug("VIA2 detected at %p\n", via2); rbv_present = 0; oss_present = 0; rbv_clear = 0x00; @@ -168,17 +171,6 @@ void __init via_init(void) panic("UNKNOWN VIA TYPE"); } - printk(KERN_INFO "VIA1 at %p is a 6522 or clone\n", via1); - - printk(KERN_INFO "VIA2 at %p is ", via2); - if (rbv_present) { - printk("an RBV\n"); - } else if (oss_present) { - printk("an OSS\n"); - } else { - printk("a 6522 or clone\n"); - } - #ifdef DEBUG_VIA via_debug_dump(); #endif From 7a0bb4427024f426668f3ed63542328135282422 Mon Sep 17 00:00:00 2001 
From: Finn Thain Date: Thu, 26 Oct 2017 22:45:24 -0400 Subject: [PATCH 1054/1085] m68k/mac: Disentangle VIA and OSS initialization macintosh_config->via_type is meaningless on Mac IIfx (i.e. the only model with OSS chip), so skip the via_type switch statement. Call oss_init() before via_init() because it is more important and because that is the right place to initialize the oss_present flag. On this model, bringing forward oss_init() and delaying via_init() is no problem because those functions are independent. The only requirement here is that oss_register_interrupts() happens after via_init(). That is, mac_init_IRQ() happens after config_mac(). Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Geert Uytterhoeven --- arch/m68k/mac/config.c | 2 +- arch/m68k/mac/oss.c | 8 ++++---- arch/m68k/mac/via.c | 32 +++++++++++++------------------- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 22123f7e8f75..16cd5cea5207 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -898,8 +898,8 @@ static void __init mac_identify(void) mac_bi_data.id, mac_bi_data.cpuid, mac_bi_data.memsize); iop_init(); - via_init(); oss_init(); + via_init(); psc_init(); baboon_init(); diff --git a/arch/m68k/mac/oss.c b/arch/m68k/mac/oss.c index ca84dcf41fc9..61906eb67d0f 100644 --- a/arch/m68k/mac/oss.c +++ b/arch/m68k/mac/oss.c @@ -31,18 +31,18 @@ volatile struct mac_oss *oss; /* * Initialize the OSS - * - * The OSS "detection" code is actually in via_init() which is always called - * before us. Thus we can count on oss_present being valid on entry. */ void __init oss_init(void) { int i; - if (!oss_present) return; + if (macintosh_config->ident != MAC_MODEL_IIFX) + return; oss = (struct mac_oss *) OSS_BASE; + pr_debug("OSS detected at %p", oss); + oss_present = 1; /* Disable all interrupts. Unlike a VIA it looks like we */ /* do this by setting the source's interrupt level to zero. */ diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c index 05021381af0b..19ad46ba4787 100644 --- a/arch/m68k/mac/via.c +++ b/arch/m68k/mac/via.c @@ -113,10 +113,6 @@ void via_debug_dump(void); * First we figure out where they actually _are_ as well as what type of * VIA we have for VIA2 (it could be a real VIA or an RBV or even an OSS.) * Then we pretty much clear them out and disable all IRQ sources. - * - * Note: the OSS is actually "detected" here and not in oss_init(). It just - * seems more logical to do it here since via_init() needs to know - * these things anyways. 
*/ void __init via_init(void) @@ -124,21 +120,18 @@ void __init via_init(void) via1 = (void *)VIA1_BASE; pr_debug("VIA1 detected at %p\n", via1); - switch(macintosh_config->via_type) { + if (oss_present) { + via2 = NULL; + rbv_present = 0; + } else { + switch (macintosh_config->via_type) { /* IIci, IIsi, IIvx, IIvi (P6xx), LC series */ case MAC_VIA_IICI: - if (macintosh_config->ident == MAC_MODEL_IIFX) { - via2 = NULL; - rbv_present = 0; - oss_present = 1; - } else { - via2 = (void *) RBV_BASE; - pr_debug("VIA2 (RBV) detected at %p\n", via2); - rbv_present = 1; - oss_present = 0; - } + via2 = (void *)RBV_BASE; + pr_debug("VIA2 (RBV) detected at %p\n", via2); + rbv_present = 1; if (macintosh_config->ident == MAC_MODEL_LCIII) { rbv_clear = 0x00; } else { @@ -160,15 +153,16 @@ void __init via_init(void) via2 = (void *) VIA2_BASE; pr_debug("VIA2 detected at %p\n", via2); rbv_present = 0; - oss_present = 0; rbv_clear = 0x00; gIER = vIER; gIFR = vIFR; gBufA = vBufA; gBufB = vBufB; break; + default: panic("UNKNOWN VIA TYPE"); + } } #ifdef DEBUG_VIA @@ -295,9 +289,9 @@ void via_debug_dump(void) (uint) via1[vDirA], (uint) via1[vDirB], (uint) via1[vACR]); printk(KERN_DEBUG " PCR = 0x%02X IFR = 0x%02X IER = 0x%02X\n", (uint) via1[vPCR], (uint) via1[vIFR], (uint) via1[vIER]); - if (oss_present) { - printk(KERN_DEBUG "VIA2: \n"); - } else if (rbv_present) { + if (!via2) + return; + if (rbv_present) { printk(KERN_DEBUG "VIA2: IFR = 0x%02X IER = 0x%02X\n", (uint) via2[rIFR], (uint) via2[rIER]); printk(KERN_DEBUG " SIFR = 0x%02X SIER = 0x%02X\n", From 8ee90c5c3fd1af8e68133defb15cfc66c1de3f88 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Thu, 26 Oct 2017 22:45:24 -0400 Subject: [PATCH 1055/1085] m68k/mac: Disentangle VIA/RBV and NuBus initialization The Nubus subsystem should not be concerned with differences between VIA, RBV and OSS platforms. It should be portable across Macs and PowerMacs. This goal has implications for the initialization code relating to bus locking and slot interrupts. During Nubus initialization, bus transactions are "unlocked": on VIA2 and RBV machines, via_nubus_init() sets a bit in the via2[gBufB] register to allow bus-mastering Nubus cards to arbitrate for the bus. This happens upon subsys_initcall(nubus_init). But because nubus_init() has no effect on card state, this sequence is arbitrary. Moreover, when Penguin is used to boot Linux, the bus is already unlocked when Linux starts. On OSS machines there's no attempt to unlock Nubus transactions at all. (Maybe there's no benefit on that platform or maybe no-one knows how.) All of this demonstrates that there's no benefit in locking out bus-mastering cards, as yet. (If the need arises, we could lock the bus for the duration of a timing-critical operation.) NetBSD unlocks the Nubus early (at VIA initialization) and we can do the same. via_nubus_init() is also responsible for some VIA interrupt setup that should happen earlier than subsys_initcall(nubus_init). And actually, the Nubus subsystem need not be involved with slot interrupts: SLOT2IRQ works fine because Nubus slot IRQs are geographically assigned (regardless of platform). For certain platforms with PDS slots, some Nubus IRQs may be platform IRQs and this is not something that the NuBus subsystem should worry about. So let's invoke via_nubus_init() earlier and make the platform responsible for bus unlocking and interrupt setup instead of the NuBus subsystem. 
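Note that "geographically assigned" means a slot IRQ is a pure function of the slot number; a sketch of the conventional m68k mapping follows (the macro body shown here is an assumption for illustration, see asm/macints.h):

    /* Slots $9..$E map to consecutive IRQs on all models, so no
     * platform-specific lookup table is needed: */
    #define SLOT2IRQ(x)	(IRQ_NUBUS_9 + (x) - 9)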
Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Geert Uytterhoeven --- arch/m68k/mac/oss.c | 8 -------- arch/m68k/mac/via.c | 5 ++++- drivers/nubus/nubus.c | 13 ------------- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/arch/m68k/mac/oss.c b/arch/m68k/mac/oss.c index 61906eb67d0f..21b85605db58 100644 --- a/arch/m68k/mac/oss.c +++ b/arch/m68k/mac/oss.c @@ -51,14 +51,6 @@ void __init oss_init(void) oss->irq_level[i] = 0; } -/* - * Initialize OSS for Nubus access - */ - -void __init oss_nubus_init(void) -{ -} - /* * Handle miscellaneous OSS interrupts. */ diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c index 19ad46ba4787..c38fab213e9f 100644 --- a/arch/m68k/mac/via.c +++ b/arch/m68k/mac/via.c @@ -106,6 +106,7 @@ static int gIER,gIFR,gBufA,gBufB; static u8 nubus_disabled; void via_debug_dump(void); +static void via_nubus_init(void); /* * Initialize the VIAs @@ -238,6 +239,8 @@ void __init via_init(void) via2[vACR] &= ~0x03; /* disable port A & B latches */ } + via_nubus_init(); + /* Everything below this point is VIA2 only... */ if (rbv_present) @@ -359,7 +362,7 @@ int via_get_cache_disable(void) * Initialize VIA2 for Nubus access */ -void __init via_nubus_init(void) +static void __init via_nubus_init(void) { /* unlock nubus transactions */ diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c index df431e8a0631..4f50f30650cd 100644 --- a/drivers/nubus/nubus.c +++ b/drivers/nubus/nubus.c @@ -18,11 +18,6 @@ #include #include #include -#include -#include - -extern void via_nubus_init(void); -extern void oss_nubus_init(void); /* Constants */ @@ -840,14 +835,6 @@ static int __init nubus_init(void) if (!MACH_IS_MAC) return 0; - /* Initialize the NuBus interrupts */ - if (oss_present) { - oss_nubus_init(); - } else { - via_nubus_init(); - } - - /* And probe */ pr_info("NuBus: Scanning NuBus slots.\n"); nubus_devices = NULL; nubus_boards = NULL; From 92178fcabbcd39fc9ccd4e58ec4be83dd5323a46 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Thu, 26 Oct 2017 22:45:24 -0400 Subject: [PATCH 1056/1085] m68k/mac: Add mutual exclusion for IOP interrupt polling The IOP interrupt handler iop_ism_irq() is used by the adb-iop driver to poll for ADB request completion. Unfortunately, it is not re-entrant. Fix the race condition by adding an iop_ism_irq_poll() function with suitable mutual exclusion. 
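The mutual exclusion amounts to masking local interrupts around the non-reentrant handler, so a real ISM interrupt cannot preempt a poll in progress; a minimal sketch of the pattern (the actual helper added by this patch appears in the diff below):

    unsigned long flags;

    local_irq_save(flags);            /* block the real IRQ handler */
    iop_ism_irq(0, (void *)iop_num);  /* safe: cannot re-enter now */
    local_irq_restore(flags);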
Tested-by: Stan Johnson Signed-off-by: Finn Thain Cc: Benjamin Herrenschmidt Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/mac_iop.h | 1 + arch/m68k/mac/iop.c | 9 +++++++++ drivers/macintosh/adb-iop.c | 4 +--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/m68k/include/asm/mac_iop.h b/arch/m68k/include/asm/mac_iop.h index 42566fd052bc..d2a08e004e2c 100644 --- a/arch/m68k/include/asm/mac_iop.h +++ b/arch/m68k/include/asm/mac_iop.h @@ -158,6 +158,7 @@ extern void iop_complete_message(struct iop_msg *); extern void iop_upload_code(uint, __u8 *, uint, __u16); extern void iop_download_code(uint, __u8 *, uint, __u16); extern __u8 *iop_compare_code(uint, __u8 *, uint, __u16); +extern void iop_ism_irq_poll(uint); extern void iop_register_interrupts(void); diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index a2ea52db7d18..9bfa17015768 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -598,3 +598,12 @@ irqreturn_t iop_ism_irq(int irq, void *dev_id) } return IRQ_HANDLED; } + +void iop_ism_irq_poll(uint iop_num) +{ + unsigned long flags; + + local_irq_save(flags); + iop_ism_irq(0, (void *)iop_num); + local_irq_restore(flags); +} diff --git a/drivers/macintosh/adb-iop.c b/drivers/macintosh/adb-iop.c index f5f4da3d0b67..4b0ad3995497 100644 --- a/drivers/macintosh/adb-iop.c +++ b/drivers/macintosh/adb-iop.c @@ -29,8 +29,6 @@ /*#define DEBUG_ADB_IOP*/ -extern void iop_ism_irq(int, void *); - static struct adb_request *current_req; static struct adb_request *last_req; #if 0 @@ -265,7 +263,7 @@ int adb_iop_autopoll(int devs) void adb_iop_poll(void) { if (adb_iop_state == idle) adb_iop_start(); - iop_ism_irq(0, (void *) ADB_IOP); + iop_ism_irq_poll(ADB_IOP); } int adb_iop_reset_bus(void) From 5e387199c17c4d2fb8b72ada82fbf81f4ba6f317 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 30 Oct 2017 08:13:32 +0100 Subject: [PATCH 1057/1085] m68k/defconfig: Update defconfigs for v4.14-rc7 Signed-off-by: Geert Uytterhoeven --- arch/m68k/configs/amiga_defconfig | 6 ++++-- arch/m68k/configs/apollo_defconfig | 6 ++++-- arch/m68k/configs/atari_defconfig | 6 ++++-- arch/m68k/configs/bvme6000_defconfig | 6 ++++-- arch/m68k/configs/hp300_defconfig | 6 ++++-- arch/m68k/configs/mac_defconfig | 6 ++++-- arch/m68k/configs/multi_defconfig | 6 ++++-- arch/m68k/configs/mvme147_defconfig | 6 ++++-- arch/m68k/configs/mvme16x_defconfig | 6 ++++-- arch/m68k/configs/q40_defconfig | 6 ++++-- arch/m68k/configs/sun3_defconfig | 6 ++++-- arch/m68k/configs/sun3x_defconfig | 6 ++++-- 12 files changed, 48 insertions(+), 24 deletions(-) diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 54191f6fc715..5b5fa9831b4d 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -123,6 +123,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -302,6 +303,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -400,6 +402,7 @@ CONFIG_ARIADNE=y # CONFIG_NET_VENDOR_CIRRUS is not set # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_HP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not 
set @@ -451,6 +454,7 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PPS_CLIENT_PARPORT=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FB_CIRRUS=y CONFIG_FB_AMIGA=y @@ -607,12 +611,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index fb4663904428..72a7764b74ed 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -121,6 +121,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -300,6 +301,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -377,6 +379,7 @@ CONFIG_VETH=m # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -419,6 +422,7 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -566,12 +570,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 4ab393e86e52..884b43a2f0d9 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -121,6 +121,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -300,6 +301,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -387,6 +389,7 @@ CONFIG_ATARILANCE=y # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -434,6 +437,7 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PPS_CLIENT_PARPORT=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FB_ATARI=y CONFIG_FRAMEBUFFER_CONSOLE=y @@ -588,12 +592,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 
1dd8d697545b..fcfa60d31499 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -119,6 +119,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -298,6 +299,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -376,6 +378,7 @@ CONFIG_VETH=m # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set CONFIG_BVME6000_NET=y # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -417,6 +420,7 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -558,12 +562,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 02b39f50076e..9d597bbbbbfe 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -121,6 +121,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -300,6 +301,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -378,6 +380,7 @@ CONFIG_HPLANCE=y # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -422,6 +425,7 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -568,12 +572,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 044dcb2bf8fb..45da20d1286c 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -120,6 +120,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -302,6 +303,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -395,6 +397,7 @@ CONFIG_MACMACE=y # CONFIG_NET_VENDOR_BROADCOM is not set CONFIG_MAC89x0=y # CONFIG_NET_VENDOR_EZCHIP is not set +# 
CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -444,6 +447,7 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FB_VALKYRIE=y CONFIG_FB_MAC=y @@ -590,12 +594,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 3ad04682077a..fda880c10861 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -130,6 +130,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -312,6 +313,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -436,6 +438,7 @@ CONFIG_MACMACE=y CONFIG_MAC89x0=y # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_HP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set CONFIG_BVME6000_NET=y CONFIG_MVME16x_NET=y # CONFIG_NET_VENDOR_MARVELL is not set @@ -501,6 +504,7 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PPS_CLIENT_PARPORT=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FB_CIRRUS=y CONFIG_FB_AMIGA=y @@ -670,12 +674,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index dc2dd61948cd..7d5e4863efec 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -118,6 +118,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -297,6 +298,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -376,6 +378,7 @@ CONFIG_MVME147_NET=y # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -417,6 +420,7 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -558,12 +562,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git 
a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 54e7b523fc3d..7763b71a9c49 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -119,6 +119,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -298,6 +299,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -376,6 +378,7 @@ CONFIG_VETH=m # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set CONFIG_MVME16x_NET=y # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -417,6 +420,7 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -558,12 +562,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index d63d8a15f6db..17eaebfa3e19 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -119,6 +119,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -298,6 +299,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -386,6 +388,7 @@ CONFIG_VETH=m # CONFIG_NET_VENDOR_CIRRUS is not set # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_HP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -434,6 +437,7 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PPS_CLIENT_PARPORT=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -581,12 +585,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index d0924c22f52a..d1cb7a04ae1d 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -116,6 +116,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -295,6 +296,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -373,6 +375,7 @@ CONFIG_SUN3LANCE=y # CONFIG_NET_VENDOR_ARC is 
not set # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set CONFIG_SUN3_82586=y # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -416,6 +419,7 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -559,12 +563,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 3001ee1e5dc5..ea3a331c62d5 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -116,6 +116,7 @@ CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -295,6 +296,7 @@ CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m CONFIG_NET_L3_MASTER_DEV=y CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set @@ -374,6 +376,7 @@ CONFIG_SUN3LANCE=y # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MICREL is not set @@ -416,6 +419,7 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -560,12 +564,10 @@ CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_MICHAEL_MIC=m From 120fc3fbb7787fb70240190cc9c113d1f6523c42 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 8 Nov 2017 18:09:52 +0800 Subject: [PATCH 1058/1085] x86/tsc: Mark cyc2ns_init() and detect_art() __init These two functions are only called by tsc_init(), which is an __init function during boot time, so mark them __init as well. 
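For reference, __init boils down to a section attribute (a simplified sketch; the exact attribute list varies by kernel version, see include/linux/init.h):

    #define __init	__section(.init.text) __cold

    /* .init.text is discarded by free_initmem() once boot completes,
     * so __init functions may only be called from other boot-time
     * code such as tsc_init(). */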
Signed-off-by: Dou Liyang Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1510135792-17429-1-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index f1326c0422c1..416de29fe862 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -112,7 +112,7 @@ static void cyc2ns_data_init(struct cyc2ns_data *data) data->cyc2ns_offset = 0; } -static void cyc2ns_init(int cpu) +static void __init cyc2ns_init(int cpu) { struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); @@ -955,7 +955,7 @@ core_initcall(cpufreq_register_tsc_scaling); /* * If ART is present detect the numerator:denominator to convert to TSC */ -static void detect_art(void) +static void __init detect_art(void) { unsigned int unused[2]; From f72e38e8ec8869ac0ba5a75d7d2f897d98a1454e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Nov 2017 14:27:35 +0100 Subject: [PATCH 1059/1085] x86/virt, x86/platform: Merge 'struct x86_hyper' into 'struct x86_platform' and 'struct x86_init' Instead of x86_hyper being either NULL on bare metal or a pointer to a struct hypervisor_x86 in case of the kernel running as a guest, merge the struct into x86_platform and x86_init. This will remove the need for wrappers, which make it hard to find out what is actually being called. With dummy functions added for all callbacks, testing for a NULL function pointer can be removed, too. Suggested-by: Ingo Molnar Signed-off-by: Juergen Gross Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: devel@linuxdriverproject.org Cc: haiyangz@microsoft.com Cc: kvm@vger.kernel.org Cc: kys@microsoft.com Cc: pbonzini@redhat.com Cc: rkrcmar@redhat.com Cc: rusty@rustcorp.com.au Cc: sthemmin@microsoft.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20171109132739.23465-2-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hypervisor.h | 25 +++----------- arch/x86/include/asm/x86_init.h | 24 +++++++++++++ arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/hypervisor.c | 56 +++++++++++++++---------------- arch/x86/kernel/cpu/mshyperv.c | 2 +- arch/x86/kernel/cpu/vmware.c | 4 +-- arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/x86_init.c | 9 +++++ arch/x86/mm/init.c | 2 +- arch/x86/xen/enlighten_hvm.c | 8 ++--- arch/x86/xen/enlighten_pv.c | 2 +- include/linux/hypervisor.h | 8 +++-- 12 files changed, 82 insertions(+), 62 deletions(-) diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 0ead9dbb9130..0eca7239a7aa 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -23,6 +23,7 @@ #ifdef CONFIG_HYPERVISOR_GUEST #include +#include #include /* @@ -35,17 +36,11 @@ struct hypervisor_x86 { /* Detection routine */ uint32_t (*detect)(void); - /* Platform setup (run once per boot) */ - void (*init_platform)(void); + /* init time callbacks */ + struct x86_hyper_init init; - /* X2APIC detection (run once per boot) */ - bool (*x2apic_available)(void); - - /* pin current vcpu to specified physical cpu (run rarely) */ - void (*pin_vcpu)(int); - - /* called during init_mem_mapping() to setup early mappings.
*/ - void (*init_mem_mapping)(void); + /* runtime callbacks */ + struct x86_hyper_runtime runtime; }; extern const struct hypervisor_x86 *x86_hyper; @@ -58,17 +53,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_hvm; extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor_platform(void); -extern bool hypervisor_x2apic_available(void); -extern void hypervisor_pin_vcpu(int cpu); - -static inline void hypervisor_init_mem_mapping(void) -{ - if (x86_hyper && x86_hyper->init_mem_mapping) - x86_hyper->init_mem_mapping(); -} #else static inline void init_hypervisor_platform(void) { } -static inline bool hypervisor_x2apic_available(void) { return false; } -static inline void hypervisor_init_mem_mapping(void) { } #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8a1ebf9540dd..ad15a0fda917 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -114,6 +114,18 @@ struct x86_init_pci { void (*fixup_irqs)(void); }; +/** + * struct x86_hyper_init - x86 hypervisor init functions + * @init_platform: platform setup + * @x2apic_available: X2APIC detection + * @init_mem_mapping: setup early mappings during init_mem_mapping() + */ +struct x86_hyper_init { + void (*init_platform)(void); + bool (*x2apic_available)(void); + void (*init_mem_mapping)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -127,6 +139,7 @@ struct x86_init_ops { struct x86_init_timers timers; struct x86_init_iommu iommu; struct x86_init_pci pci; + struct x86_hyper_init hyper; }; /** @@ -199,6 +212,15 @@ struct x86_legacy_features { struct x86_legacy_devices devices; }; +/** + * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks + * + * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely) + */ +struct x86_hyper_runtime { + void (*pin_vcpu)(int cpu); +}; + /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_cpu: calibrate CPU @@ -218,6 +240,7 @@ struct x86_legacy_features { * possible in x86_early_init_platform_quirks() by * only using the current x86_hardware_subarch * semantics. 
+ * @hyper: x86 hypervisor specific runtime callbacks */ struct x86_platform_ops { unsigned long (*calibrate_cpu)(void); @@ -233,6 +256,7 @@ struct x86_platform_ops { void (*apic_post_init)(void); struct x86_legacy_features legacy; void (*set_legacy_features)(void); + struct x86_hyper_runtime hyper; }; struct pci_dev; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ff891772c9f8..89c7c8569e5e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1645,7 +1645,7 @@ static __init void try_to_enable_x2apic(int remap_mode) * under KVM */ if (max_physical_apicid > 255 || - !hypervisor_x2apic_available()) { + !x86_init.hyper.x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 4fa90006ac68..22226c1bf092 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -44,51 +44,49 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = const struct hypervisor_x86 *x86_hyper; EXPORT_SYMBOL(x86_hyper); -static inline void __init +static inline const struct hypervisor_x86 * __init detect_hypervisor_vendor(void) { - const struct hypervisor_x86 *h, * const *p; + const struct hypervisor_x86 *h = NULL, * const *p; uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { - h = *p; - pri = h->detect(); - if (pri != 0 && pri > max_pri) { + pri = (*p)->detect(); + if (pri > max_pri) { max_pri = pri; - x86_hyper = h; + h = *p; } } - if (max_pri) - pr_info("Hypervisor detected: %s\n", x86_hyper->name); + if (h) + pr_info("Hypervisor detected: %s\n", h->name); + + return h; +} + +static void __init copy_array(const void *src, void *target, unsigned int size) +{ + unsigned int i, n = size / sizeof(void *); + const void * const *from = (const void * const *)src; + const void **to = (const void **)target; + + for (i = 0; i < n; i++) + if (from[i]) + to[i] = from[i]; } void __init init_hypervisor_platform(void) { + const struct hypervisor_x86 *h; - detect_hypervisor_vendor(); + h = detect_hypervisor_vendor(); - if (!x86_hyper) + if (!h) return; - if (x86_hyper->init_platform) - x86_hyper->init_platform(); -} + copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); + copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); -bool __init hypervisor_x2apic_available(void) -{ - return x86_hyper && - x86_hyper->x2apic_available && - x86_hyper->x2apic_available(); -} - -void hypervisor_pin_vcpu(int cpu) -{ - if (!x86_hyper) - return; - - if (x86_hyper->pin_vcpu) - x86_hyper->pin_vcpu(cpu); - else - WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); + x86_hyper = h; + x86_init.hyper.init_platform(); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 236324e83a3a..6bb84d655e4b 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -257,6 +257,6 @@ static void __init ms_hyperv_init_platform(void) const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, - .init_platform = ms_hyperv_init_platform, + .init.init_platform = ms_hyperv_init_platform, }; EXPORT_SYMBOL(x86_hyper_ms_hyperv); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 40ed26852ebd..4804c1d063c8 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -208,7 +208,7 @@ static bool __init 
vmware_legacy_x2apic_available(void) const __refconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, - .init_platform = vmware_platform_setup, - .x2apic_available = vmware_legacy_x2apic_available, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, }; EXPORT_SYMBOL(x86_hyper_vmware); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8bb9594d0761..9dca8437c795 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -547,7 +547,7 @@ static uint32_t __init kvm_detect(void) const struct hypervisor_x86 x86_hyper_kvm __refconst = { .name = "KVM", .detect = kvm_detect, - .x2apic_available = kvm_para_available, + .init.x2apic_available = kvm_para_available, }; EXPORT_SYMBOL_GPL(x86_hyper_kvm); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a088b2c47f73..5b2d10c1973a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -28,6 +28,8 @@ void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } +bool __init bool_x86_init_noop(void) { return false; } +void x86_op_int_noop(int cpu) { } /* * The platform setup functions are preset with the default functions @@ -81,6 +83,12 @@ struct x86_init_ops x86_init __initdata = { .init_irq = x86_default_pci_init_irq, .fixup_irqs = x86_default_pci_fixup_irqs, }, + + .hyper = { + .init_platform = x86_init_noop, + .x2apic_available = bool_x86_init_noop, + .init_mem_mapping = x86_init_noop, + }, }; struct x86_cpuinit_ops x86_cpuinit = { @@ -101,6 +109,7 @@ struct x86_platform_ops x86_platform __ro_after_init = { .get_nmi_reason = default_get_nmi_reason, .save_sched_clock_state = tsc_save_sched_clock_state, .restore_sched_clock_state = tsc_restore_sched_clock_state, + .hyper.pin_vcpu = x86_op_int_noop, }; EXPORT_SYMBOL_GPL(x86_platform); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index af5c1ed21d43..a22c2b95e513 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -671,7 +671,7 @@ void __init init_mem_mapping(void) load_cr3(swapper_pg_dir); __flush_tlb_all(); - hypervisor_init_mem_mapping(); + x86_init.hyper.init_mem_mapping(); early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index de503c225ae1..7b1622089f96 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -229,9 +229,9 @@ static uint32_t __init xen_platform_hvm(void) const struct hypervisor_x86 x86_hyper_xen_hvm = { .name = "Xen HVM", .detect = xen_platform_hvm, - .init_platform = xen_hvm_guest_init, - .pin_vcpu = xen_pin_vcpu, - .x2apic_available = xen_x2apic_para_available, - .init_mem_mapping = xen_hvm_init_mem_mapping, + .init.init_platform = xen_hvm_guest_init, + .init.x2apic_available = xen_x2apic_para_available, + .init.init_mem_mapping = xen_hvm_init_mem_mapping, + .runtime.pin_vcpu = xen_pin_vcpu, }; EXPORT_SYMBOL(x86_hyper_xen_hvm); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index d4396e27b1fb..69d1a7054ddb 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1463,6 +1463,6 @@ static uint32_t __init xen_platform_pv(void) const struct hypervisor_x86 x86_hyper_xen_pv = { .name = "Xen PV", .detect = xen_platform_pv, - .pin_vcpu = xen_pin_vcpu, + .runtime.pin_vcpu = xen_pin_vcpu, }; EXPORT_SYMBOL(x86_hyper_xen_pv); diff --git a/include/linux/hypervisor.h 
b/include/linux/hypervisor.h index b4054fd5b6f6..b19563f9a8eb 100644 --- a/include/linux/hypervisor.h +++ b/include/linux/hypervisor.h @@ -7,8 +7,12 @@ * Juergen Gross */ -#ifdef CONFIG_HYPERVISOR_GUEST -#include +#ifdef CONFIG_X86 +#include +static inline void hypervisor_pin_vcpu(int cpu) +{ + x86_platform.hyper.pin_vcpu(cpu); +} #else static inline void hypervisor_pin_vcpu(int cpu) { From 03b2a320b19f1424e9ac9c21696be9c60b6d0d93 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Nov 2017 14:27:36 +0100 Subject: [PATCH 1060/1085] x86/virt: Add enum for hypervisors to replace x86_hyper The x86_hyper pointer is only used for checking whether a virtual device is supporting the hypervisor the system is running on. Use an enum for that purpose instead and drop the x86_hyper pointer. Signed-off-by: Juergen Gross Acked-by: Thomas Gleixner Acked-by: Xavier Deguillard Cc: Linus Torvalds Cc: Peter Zijlstra Cc: akataria@vmware.com Cc: arnd@arndb.de Cc: boris.ostrovsky@oracle.com Cc: devel@linuxdriverproject.org Cc: dmitry.torokhov@gmail.com Cc: gregkh@linuxfoundation.org Cc: haiyangz@microsoft.com Cc: kvm@vger.kernel.org Cc: kys@microsoft.com Cc: linux-graphics-maintainer@vmware.com Cc: linux-input@vger.kernel.org Cc: moltmann@vmware.com Cc: pbonzini@redhat.com Cc: pv-drivers@vmware.com Cc: rkrcmar@redhat.com Cc: sthemmin@microsoft.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20171109132739.23465-3-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/hv_init.c | 2 +- arch/x86/include/asm/hypervisor.h | 23 ++++++++++++++--------- arch/x86/kernel/cpu/hypervisor.c | 12 +++++++++--- arch/x86/kernel/cpu/mshyperv.c | 4 ++-- arch/x86/kernel/cpu/vmware.c | 4 ++-- arch/x86/kernel/kvm.c | 4 ++-- arch/x86/xen/enlighten_hvm.c | 4 ++-- arch/x86/xen/enlighten_pv.c | 4 ++-- drivers/hv/vmbus_drv.c | 2 +- drivers/input/mouse/vmmouse.c | 10 ++++------ drivers/misc/vmw_balloon.c | 2 +- 11 files changed, 40 insertions(+), 31 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a5db63f728a2..a0b86cf486e0 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -113,7 +113,7 @@ void hyperv_init(void) u64 guest_id; union hv_x64_msr_hypercall_contents hypercall_msr; - if (x86_hyper != &x86_hyper_ms_hyperv) + if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; /* Allocate percpu VP index */ diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 0eca7239a7aa..1b0a5abcd8ae 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -29,6 +29,16 @@ /* * x86 hypervisor information */ + +enum x86_hypervisor_type { + X86_HYPER_NATIVE = 0, + X86_HYPER_VMWARE, + X86_HYPER_MS_HYPERV, + X86_HYPER_XEN_PV, + X86_HYPER_XEN_HVM, + X86_HYPER_KVM, +}; + struct hypervisor_x86 { /* Hypervisor name */ const char *name; @@ -36,6 +46,9 @@ struct hypervisor_x86 { /* Detection routine */ uint32_t (*detect)(void); + /* Hypervisor type */ + enum x86_hypervisor_type type; + /* init time callbacks */ struct x86_hyper_init init; @@ -43,15 +56,7 @@ struct hypervisor_x86 { struct x86_hyper_runtime runtime; }; -extern const struct hypervisor_x86 *x86_hyper; - -/* Recognized hypervisors */ -extern const struct hypervisor_x86 x86_hyper_vmware; -extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -extern const struct hypervisor_x86 x86_hyper_xen_pv; -extern const struct hypervisor_x86 x86_hyper_xen_hvm; -extern const struct hypervisor_x86 x86_hyper_kvm; - 
+extern enum x86_hypervisor_type x86_hyper_type; extern void init_hypervisor_platform(void); #else static inline void init_hypervisor_platform(void) { } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 22226c1bf092..bea8d3e24f50 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -26,6 +26,12 @@ #include #include +extern const struct hypervisor_x86 x86_hyper_vmware; +extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +extern const struct hypervisor_x86 x86_hyper_xen_pv; +extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_kvm; + static const __initconst struct hypervisor_x86 * const hypervisors[] = { #ifdef CONFIG_XEN_PV @@ -41,8 +47,8 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #endif }; -const struct hypervisor_x86 *x86_hyper; -EXPORT_SYMBOL(x86_hyper); +enum x86_hypervisor_type x86_hyper_type; +EXPORT_SYMBOL(x86_hyper_type); static inline const struct hypervisor_x86 * __init detect_hypervisor_vendor(void) @@ -87,6 +93,6 @@ void __init init_hypervisor_platform(void) copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); - x86_hyper = h; + x86_hyper_type = h->type; x86_init.hyper.init_platform(); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 6bb84d655e4b..85eb5fc180c8 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -254,9 +254,9 @@ static void __init ms_hyperv_init_platform(void) #endif } -const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { +const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, + .type = X86_HYPER_MS_HYPERV, .init.init_platform = ms_hyperv_init_platform, }; -EXPORT_SYMBOL(x86_hyper_ms_hyperv); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 4804c1d063c8..8e005329648b 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void) (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; } -const __refconst struct hypervisor_x86 x86_hyper_vmware = { +const __initconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, + .type = X86_HYPER_VMWARE, .init.init_platform = vmware_platform_setup, .init.x2apic_available = vmware_legacy_x2apic_available, }; -EXPORT_SYMBOL(x86_hyper_vmware); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9dca8437c795..a94de09edbed 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -544,12 +544,12 @@ static uint32_t __init kvm_detect(void) return kvm_cpuid_base(); } -const struct hypervisor_x86 x86_hyper_kvm __refconst = { +const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, + .type = X86_HYPER_KVM, .init.x2apic_available = kvm_para_available, }; -EXPORT_SYMBOL_GPL(x86_hyper_kvm); static __init int activate_jump_labels(void) { diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 7b1622089f96..754d5391d9fa 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -226,12 +226,12 @@ static uint32_t __init xen_platform_hvm(void) return xen_cpuid_base(); } -const struct hypervisor_x86 x86_hyper_xen_hvm = { +const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { .name = "Xen HVM", .detect = 
xen_platform_hvm, + .type = X86_HYPER_XEN_HVM, .init.init_platform = xen_hvm_guest_init, .init.x2apic_available = xen_x2apic_para_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, .runtime.pin_vcpu = xen_pin_vcpu, }; -EXPORT_SYMBOL(x86_hyper_xen_hvm); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 69d1a7054ddb..168efb2534c0 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1460,9 +1460,9 @@ static uint32_t __init xen_platform_pv(void) return 0; } -const struct hypervisor_x86 x86_hyper_xen_pv = { +const __initconst struct hypervisor_x86 x86_hyper_xen_pv = { .name = "Xen PV", .detect = xen_platform_pv, + .type = X86_HYPER_XEN_PV, .runtime.pin_vcpu = xen_pin_vcpu, }; -EXPORT_SYMBOL(x86_hyper_xen_pv); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 937801ac2fe0..2cd134dd94d2 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1534,7 +1534,7 @@ static int __init hv_acpi_init(void) { int ret, t; - if (x86_hyper != &x86_hyper_ms_hyperv) + if (x86_hyper_type != X86_HYPER_MS_HYPERV) return -ENODEV; init_completion(&probe_event); diff --git a/drivers/input/mouse/vmmouse.c b/drivers/input/mouse/vmmouse.c index 0f586780ceb4..1ae5c1ef3f5b 100644 --- a/drivers/input/mouse/vmmouse.c +++ b/drivers/input/mouse/vmmouse.c @@ -316,11 +316,9 @@ static int vmmouse_enable(struct psmouse *psmouse) /* * Array of supported hypervisors. */ -static const struct hypervisor_x86 *vmmouse_supported_hypervisors[] = { - &x86_hyper_vmware, -#ifdef CONFIG_KVM_GUEST - &x86_hyper_kvm, -#endif +static enum x86_hypervisor_type vmmouse_supported_hypervisors[] = { + X86_HYPER_VMWARE, + X86_HYPER_KVM, }; /** @@ -331,7 +329,7 @@ static bool vmmouse_check_hypervisor(void) int i; for (i = 0; i < ARRAY_SIZE(vmmouse_supported_hypervisors); i++) - if (vmmouse_supported_hypervisors[i] == x86_hyper) + if (vmmouse_supported_hypervisors[i] == x86_hyper_type) return true; return false; diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 1e688bfec567..9047c0a529b2 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1271,7 +1271,7 @@ static int __init vmballoon_init(void) * Check if we are running on VMware's hypervisor and bail out * if we are not. */ - if (x86_hyper != &x86_hyper_vmware) + if (x86_hyper_type != X86_HYPER_VMWARE) return -ENODEV; for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES; From 6d7305254ea947d575a066711c3349fafbebaffa Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Nov 2017 14:27:37 +0100 Subject: [PATCH 1061/1085] x86/virt, x86/acpi: Add test for ACPI_FADT_NO_VGA Add a test for ACPI_FADT_NO_VGA when scanning the FADT and set the new flag x86_platform.legacy.no_vga accordingly. 
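A consumer would then gate VGA probing on the new flag; a hypothetical call site (illustration only, not part of this patch):

    if (x86_platform.legacy.no_vga)
        return -ENODEV;	/* firmware declared VGA probing unsafe */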
Signed-off-by: Juergen Gross Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: len.brown@intel.com Cc: linux-pm@vger.kernel.org Cc: pavel@ucw.cz Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/20171109132739.23465-4-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x86_init.h | 1 + arch/x86/kernel/acpi/boot.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ad15a0fda917..260e9a85fecb 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -208,6 +208,7 @@ enum x86_legacy_i8042_state { struct x86_legacy_features { enum x86_legacy_i8042_state i8042; int rtc; + int no_vga; int reserve_bios_regions; struct x86_legacy_devices devices; }; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 079535e53e2a..ef9e02e614d0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -961,6 +961,11 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) x86_platform.legacy.rtc = 0; } + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) { + pr_debug("ACPI: probing for VGA not safe\n"); + x86_platform.legacy.no_vga = 1; + } + #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) { From f3614646005a1b59f836ff54351c9bd2224b6005 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Nov 2017 14:27:38 +0100 Subject: [PATCH 1062/1085] x86/virt, x86/platform: Add ->guest_late_init() callback to hypervisor_x86 structure Add a new guest_late_init callback to the hypervisor_x86 structure. It will replace the current kvm_guest_init() call which is changed to make use of the new callback. Signed-off-by: Juergen Gross Acked-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Linus Torvalds Cc: Peter Zijlstra Cc: kvm@vger.kernel.org Cc: rkrcmar@redhat.com Link: http://lkml.kernel.org/r/20171109132739.23465-5-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kvm_para.h | 2 -- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/kvm.c | 3 ++- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/x86_init.c | 1 + 5 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c373e44049b1..7b407dda2bd7 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,7 +88,6 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, #ifdef CONFIG_KVM_GUEST bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); -void __init kvm_guest_init(void); void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); @@ -103,7 +102,6 @@ static inline void kvm_spinlock_init(void) #endif /* CONFIG_PARAVIRT_SPINLOCKS */ #else /* CONFIG_KVM_GUEST */ -#define kvm_guest_init() do {} while (0) #define kvm_async_pf_task_wait(T, I) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 260e9a85fecb..5dd011a8b560 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -117,11 +117,13 @@ struct x86_init_pci { /** * struct x86_hyper_init - x86 hypervisor init functions * @init_platform: platform setup + * @guest_late_init: guest late init * @x2apic_available: X2APIC detection * @init_mem_mapping: setup early mappings during 
init_mem_mapping() */ struct x86_hyper_init { void (*init_platform)(void); + void (*guest_late_init)(void); bool (*x2apic_available)(void); void (*init_mem_mapping)(void); }; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a94de09edbed..4b1f6f56b3e1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -465,7 +465,7 @@ static void __init kvm_apf_trap_init(void) update_intr_gate(X86_TRAP_PF, async_page_fault); } -void __init kvm_guest_init(void) +static void __init kvm_guest_init(void) { int i; @@ -548,6 +548,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0957dd73d127..372da5299334 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1294,7 +1294,7 @@ void __init setup_arch(char **cmdline_p) io_apic_init_mappings(); - kvm_guest_init(); + x86_init.hyper.guest_late_init(); e820__reserve_resources(); e820__register_nosave_regions(max_low_pfn); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 5b2d10c1973a..c8fa4cd31903 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -86,6 +86,7 @@ struct x86_init_ops x86_init __initdata = { .hyper = { .init_platform = x86_init_noop, + .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, }, From 418492ba40b2c2bbdaf1a169aac5b1673bde8189 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Nov 2017 14:27:39 +0100 Subject: [PATCH 1063/1085] x86/virt/xen: Use guest_late_init to detect Xen PVH guest In case we are booted via the default boot entry by a generic loader like grub or OVMF, it is necessary to distinguish between a HVM guest with a device model supporting legacy devices and a PVH guest without a device model. PVH guests will always have x86_platform.legacy.no_vga set and x86_platform.legacy.rtc cleared, while these will not both be true for HVM guests. Test for both conditions in the guest_late_init hook and set xen_pvh to true if they are met. Move some of the early PVH initializations to the new hook in order to avoid duplicated code. Signed-off-by: Juergen Gross Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20171109132739.23465-6-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten_hvm.c | 24 ++++++++++++++++++++++-- arch/x86/xen/enlighten_pvh.c | 9 --------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 754d5391d9fa..826898701045 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -188,8 +189,6 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); - if (xen_pvh_domain()) - machine_ops.emergency_restart = xen_emergency_restart; #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; machine_ops.crash_shutdown = xen_hvm_crash_shutdown; @@ -226,6 +225,26 @@ static uint32_t __init xen_platform_hvm(void) return xen_cpuid_base(); } +static __init void xen_hvm_guest_late_init(void) +{ +#ifdef CONFIG_XEN_PVH + /* Test for PVH domain (PVH boot path taken overrides ACPI flags).
*/ + if (!xen_pvh && + (x86_platform.legacy.rtc || !x86_platform.legacy.no_vga)) + return; + + /* PVH detected. */ + xen_pvh = true; + + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ + if (!nr_ioapics && acpi_irq_model == ACPI_IRQ_MODEL_PIC) + acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; + + machine_ops.emergency_restart = xen_emergency_restart; + pv_info.name = "Xen PVH"; +#endif +} + const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { .name = "Xen HVM", .detect = xen_platform_hvm, @@ -233,5 +252,6 @@ const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { .init.init_platform = xen_hvm_guest_init, .init.x2apic_available = xen_x2apic_para_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, + .init.guest_late_init = xen_hvm_guest_late_init, .runtime.pin_vcpu = xen_pin_vcpu, }; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 7bd3ee08393e..436c4f003e17 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -25,13 +25,6 @@ struct boot_params pvh_bootparams __attribute__((section(".data"))); struct hvm_start_info pvh_start_info; unsigned int pvh_start_info_sz = sizeof(pvh_start_info); -static void xen_pvh_arch_setup(void) -{ - /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ - if (nr_ioapics == 0) - acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; -} - static void __init init_pvh_bootparams(void) { struct xen_memory_map memmap; @@ -102,6 +95,4 @@ void __init xen_prepare_pvh(void) wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); init_pvh_bootparams(); - - x86_init.oem.arch_setup = xen_pvh_arch_setup; } From efeb88c68bd0dbd317c4d24a1b5b58c5e5bd68aa Mon Sep 17 00:00:00 2001 From: Keerthy Date: Fri, 10 Nov 2017 17:22:43 +0530 Subject: [PATCH 1064/1085] regulator: tps65218: Fix strobe assignment Currently the tps_info structure is no longer used, so use the strobes parameter in the tps65218 structure to capture the info. Fixes: 2dc4940360d4c0c (regulator: tps65218: Remove all the compatibles) Signed-off-by: Keerthy Signed-off-by: Mark Brown --- drivers/regulator/tps65218-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/tps65218-regulator.c b/drivers/regulator/tps65218-regulator.c index 9aafbb03482d..bc489958fed7 100644 --- a/drivers/regulator/tps65218-regulator.c +++ b/drivers/regulator/tps65218-regulator.c @@ -154,7 +154,7 @@ static int tps65218_pmic_set_suspend_disable(struct regulator_dev *dev) if (!tps->strobes[rid]) { if (rid == TPS65218_DCDC_3) - tps->info[rid]->strobe = 3; + tps->strobes[rid] = 3; else return -EINVAL; } From 125b1192bd48544d9429e240c8f8e64d1533b2c1 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Fri, 10 Nov 2017 17:22:44 +0530 Subject: [PATCH 1065/1085] regulator: tps65218: remove unused tps_info structure Remove the unused tps_info structure.
Signed-off-by: Keerthy Signed-off-by: Mark Brown --- include/linux/mfd/tps65218.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/include/linux/mfd/tps65218.h b/include/linux/mfd/tps65218.h index bccd2d68b1e3..f069c518c0ed 100644 --- a/include/linux/mfd/tps65218.h +++ b/include/linux/mfd/tps65218.h @@ -245,24 +245,6 @@ enum tps65218_irqs { TPS65218_INVALID4_IRQ, }; -/** - * struct tps_info - packages regulator constraints - * @id: Id of the regulator - * @name: Voltage regulator name - * @min_uV: minimum micro volts - * @max_uV: minimum micro volts - * @strobe: sequencing strobe value for the regulator - * - * This data is used to check the regualtor voltage limits while setting. - */ -struct tps_info { - int id; - const char *name; - int min_uV; - int max_uV; - int strobe; -}; - /** * struct tps65218 - tps65218 sub-driver chip access routines * @@ -280,7 +262,6 @@ struct tps65218 { u32 irq_mask; struct regmap_irq_chip_data *irq_data; struct regulator_desc desc[TPS65218_NUM_REGULATOR]; - struct tps_info *info[TPS65218_NUM_REGULATOR]; struct regmap *regmap; u8 *strobes; }; From 450cbdd0125cfa5d7bbf9e2a6b6961cc48d29730 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 27 Oct 2017 19:14:31 +0300 Subject: [PATCH 1066/1085] locking/x86: Use LOCK ADD for smp_mb() instead of MFENCE MFENCE appears to be way slower than a locked instruction - let's use LOCK ADD unconditionally, as we always did on old 32-bit. Performance testing results: perf stat -r 10 -- ./virtio_ring_0_9 --sleep --host-affinity 0 --guest-affinity 0 Before: 0.922565990 seconds time elapsed ( +- 1.15% ) After: 0.578667024 seconds time elapsed ( +- 1.21% ) i.e. about ~60% faster. Just poking at SP would be the most natural, but if we then read the value from SP, we get a false dependency which will slow us down. This was noted in this article: http://shipilev.net/blog/2014/on-the-fence-with-dependencies/ And is easy to reproduce by sticking a barrier in a small non-inline function. So let's use a negative offset - which avoids this problem since we build with the red zone disabled. For userspace, use an address just below the redzone. The one difference between LOCK ADD and MFENCE is that LOCK ADD does not affect CLFLUSH, previous patches converted all uses of CLFLUSH to call mb(), such that changes to smp_mb() won't affect it. Update mb/rmb/wmb() on 32-bit to use the negative offset, too, for consistency. As a follow-up, it might be worth considering switching users of CLFLUSH to another API (e.g. clflush_mb()?) - we will then be able to convert mb() to smp_mb() again. Also arguably, GCC should switch to use LOCK ADD for __sync_synchronize(). This might be worth pursuing separately. Suggested-by: Andy Lutomirski Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: qemu-devel@nongnu.org Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/1509118355-4890-1-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 12 ++++++++---- tools/virtio/ringtest/main.h | 4 ++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 01727dbc294a..7fb336210e1b 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -12,11 +12,11 @@ */ #ifdef CONFIG_X86_32 -#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \ +#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \ +#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \ +#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else #define mb() asm volatile("mfence":::"memory") @@ -31,7 +31,11 @@ #endif #define dma_wmb() barrier() -#define __smp_mb() mb() +#ifdef CONFIG_X86_32 +#define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc") +#else +#define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc") +#endif #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h index 90b0133004e1..5706e075adf2 100644 --- a/tools/virtio/ringtest/main.h +++ b/tools/virtio/ringtest/main.h @@ -110,11 +110,15 @@ static inline void busy_wait(void) barrier(); } +#if defined(__x86_64__) || defined(__i386__) +#define smp_mb() asm volatile("lock; addl $0,-128(%%rsp)" ::: "memory", "cc") +#else /* * Not using __ATOMIC_SEQ_CST since gcc docs say they are only synchronized * with other __ATOMIC_SEQ_CST calls. */ #define smp_mb() __sync_synchronize() +#endif /* * This abuses the atomic builtins for thread fences, and From 78372709bf95c2ef5c886780efc268f7f052168a Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Tue, 7 Nov 2017 19:16:25 +0100 Subject: [PATCH 1067/1085] s390/bpf: take advantage of stack_depth tracking Make use of the "stack_depth" tracking feature introduced with commit 8726679a0fa31 ("bpf: teach verifier to track stack depth") for the s390 JIT, so that stack usage can be reduced. Signed-off-by: Michael Holzheu Signed-off-by: Heiko Carstens --- arch/s390/net/bpf_jit.h | 7 +++++-- arch/s390/net/bpf_jit_comp.c | 26 +++++++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h index fda605dbc1b4..5057728b300e 100644 --- a/arch/s390/net/bpf_jit.h +++ b/arch/s390/net/bpf_jit.h @@ -52,10 +52,13 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; * * We get 160 bytes stack space from calling function, but only use * 12 * 8 byte for old backchain, r15..r6, and tail_call_cnt. + * + * The stack size used by the BPF program ("BPF stack" above) is passed + * via "aux->stack_depth". 
*/ -#define STK_SPACE (MAX_BPF_STACK + 8 + 8 + 4 + 4 + 160) +#define STK_SPACE_ADD (8 + 8 + 4 + 4 + 160) #define STK_160_UNUSED (160 - 12 * 8) -#define STK_OFF (STK_SPACE - STK_160_UNUSED) +#define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED) #define STK_OFF_TMP 160 /* Offset of tmp buffer on stack */ #define STK_OFF_HLEN 168 /* Offset of SKB header length on stack */ #define STK_OFF_SKBP 176 /* Offset of SKB pointer on stack */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 8ec88497a28d..6730652b1a75 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -319,12 +319,12 @@ static void save_regs(struct bpf_jit *jit, u32 rs, u32 re) /* * Restore registers from "rs" (register start) to "re" (register end) on stack */ -static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re) +static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re, u32 stack_depth) { u32 off = STK_OFF_R6 + (rs - 6) * 8; if (jit->seen & SEEN_STACK) - off += STK_OFF; + off += STK_OFF + stack_depth; if (rs == re) /* lg %rs,off(%r15) */ @@ -368,7 +368,7 @@ static int get_end(struct bpf_jit *jit, int start) * Save and restore clobbered registers (6-15) on stack. * We save/restore registers in chunks with gap >= 2 registers. */ -static void save_restore_regs(struct bpf_jit *jit, int op) +static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) { int re = 6, rs; @@ -381,7 +381,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op) if (op == REGS_SAVE) save_regs(jit, rs, re); else - restore_regs(jit, rs, re); + restore_regs(jit, rs, re, stack_depth); re++; } while (re <= 15); } @@ -413,7 +413,7 @@ static void emit_load_skb_data_hlen(struct bpf_jit *jit) * Save registers and create stack frame if necessary. * See stack frame layout desription in "bpf_jit.h"! 
*/ -static void bpf_jit_prologue(struct bpf_jit *jit) +static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) { if (jit->seen & SEEN_TAIL_CALL) { /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */ @@ -426,7 +426,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit) /* Tail calls have to skip above initialization */ jit->tail_call_start = jit->prg; /* Save registers */ - save_restore_regs(jit, REGS_SAVE); + save_restore_regs(jit, REGS_SAVE, stack_depth); /* Setup literal pool */ if (jit->seen & SEEN_LITERAL) { /* basr %r13,0 */ @@ -441,7 +441,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit) /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */ EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED); /* aghi %r15,-STK_OFF */ - EMIT4_IMM(0xa70b0000, REG_15, -STK_OFF); + EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth)); if (jit->seen & SEEN_FUNC) /* stg %w1,152(%r15) (backchain) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, @@ -458,7 +458,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit) /* * Function epilogue */ -static void bpf_jit_epilogue(struct bpf_jit *jit) +static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) { /* Return 0 */ if (jit->seen & SEEN_RET0) { @@ -470,7 +470,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit) /* Load exit code: lgr %r2,%b0 */ EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ - save_restore_regs(jit, REGS_RESTORE); + save_restore_regs(jit, REGS_RESTORE, stack_depth); /* br %r14 */ _EMIT2(0x07fe); } @@ -1018,7 +1018,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i */ if (jit->seen & SEEN_STACK) - off = STK_OFF_TCCNT + STK_OFF; + off = STK_OFF_TCCNT + STK_OFF + fp->aux->stack_depth; else off = STK_OFF_TCCNT; /* lhi %w0,1 */ @@ -1046,7 +1046,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i /* * Restore registers before calling function */ - save_restore_regs(jit, REGS_RESTORE); + save_restore_regs(jit, REGS_RESTORE, fp->aux->stack_depth); /* * goto *(prog->bpf_func + tail_call_start); @@ -1272,7 +1272,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) jit->lit = jit->lit_start; jit->prg = 0; - bpf_jit_prologue(jit); + bpf_jit_prologue(jit, fp->aux->stack_depth); for (i = 0; i < fp->len; i += insn_count) { insn_count = bpf_jit_insn(jit, fp, i); if (insn_count < 0) @@ -1280,7 +1280,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) /* Next instruction address */ jit->addrs[i + insn_count] = jit->prg; } - bpf_jit_epilogue(jit); + bpf_jit_epilogue(jit, fp->aux->stack_depth); jit->lit_start = jit->prg; jit->size = jit->lit; From a1c5befc1c24eb9c1ee83f711e0f21ee79cbb556 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 9 Nov 2017 12:29:34 +0100 Subject: [PATCH 1068/1085] s390: fix transactional execution control register handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dan Horák reported the following crash related to transactional execution: User process fault: interruption code 0013 ilc:3 in libpthread-2.26.so[3ff93c00000+1b000] CPU: 2 PID: 1 Comm: /init Not tainted 4.13.4-300.fc27.s390x #1 Hardware name: IBM 2827 H43 400 (z/VM 6.4.0) task: 00000000fafc8000 task.stack: 00000000fafc4000 User PSW : 0705200180000000 000003ff93c14e70 R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:1 AS:0 CC:2 PM:0 RI:0 EA:3 User GPRS: 0000000000000077 000003ff00000000 000003ff93144d48 000003ff93144d5e 0000000000000000 0000000000000002 0000000000000000 
000003ff00000000 0000000000000000 0000000000000418 0000000000000000 000003ffcc9fe770 000003ff93d28f50 000003ff9310acf0 000003ff92b0319a 000003ffcc9fe6d0 User Code: 000003ff93c14e62: 60e0b030 std %f14,48(%r11) 000003ff93c14e66: 60f0b038 std %f15,56(%r11) #000003ff93c14e6a: e5600000ff0e tbegin 0,65294 >000003ff93c14e70: a7740006 brc 7,3ff93c14e7c 000003ff93c14e74: a7080000 lhi %r0,0 000003ff93c14e78: a7f40023 brc 15,3ff93c14ebe 000003ff93c14e7c: b2220000 ipm %r0 000003ff93c14e80: 8800001c srl %r0,28 There are several bugs with control register handling with respect to transactional execution: - on task switch update_per_regs() is only called if the next task has an mm (is not a kernel thread). This however is incorrect. This breaks e.g. for user mode helper handling, where the kernel creates a kernel thread and then execve's a user space program. Control register contents related to transactional execution won't be updated on execve. If the previous task ran with transactional execution disabled then the new task will also run with transactional execution disabled, which is incorrect. Therefore call update_per_regs() unconditionally within switch_to(). - on startup the transactional execution facility is not enabled for the idle thread. This is not really a bug, but an inconsistency to other facilities. Therefore enable the facility if it is available. - on fork the new thread's per_flags field is not cleared. This means that a child process inherits the PER_FLAG_NO_TE flag. This flag can be set with a ptrace request to disable transactional execution for the current process. It should not be inherited by new child processes in order to be consistent with the handling of all other PER related debugging options. Therefore clear the per_flags field in copy_thread_tls(). 
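For context, PER_FLAG_NO_TE is normally set from user space via the s390-specific ptrace requests; a minimal tracer-side sketch (assuming the PTRACE_DISABLE_TE/PTRACE_ENABLE_TE requests from the s390 asm/ptrace.h; pid is a stopped tracee, error handling omitted):

	#include <sys/ptrace.h>
	#include <asm/ptrace.h>	/* s390: PTRACE_DISABLE_TE, PTRACE_ENABLE_TE */

	/* Disable transactional execution for the tracee. */
	ptrace(PTRACE_DISABLE_TE, pid, 0, 0);
	/* Before this fix, a child later forked by the tracee silently
	 * inherited the disable via the copied per_flags field. */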
Reported-and-tested-by: Dan Horák Fixes: d35339a42dd1 ("s390: add support for transactional memory") Cc: # v3.7+ Cc: Martin Schwidefsky Reviewed-by: Christian Borntraeger Reviewed-by: Hendrik Brueckner Signed-off-by: Heiko Carstens --- arch/s390/include/asm/switch_to.h | 2 +- arch/s390/kernel/early.c | 4 +++- arch/s390/kernel/process.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h index f6c2b5814ab0..8e6b07609ff4 100644 --- a/arch/s390/include/asm/switch_to.h +++ b/arch/s390/include/asm/switch_to.h @@ -36,8 +36,8 @@ static inline void restore_access_regs(unsigned int *acrs) save_ri_cb(prev->thread.ri_cb); \ save_gs_cb(prev->thread.gs_cb); \ } \ + update_cr_regs(next); \ if (next->mm) { \ - update_cr_regs(next); \ set_cpu_flag(CIF_FPU); \ restore_access_regs(&next->thread.acrs[0]); \ restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 389e9f61c76f..5096875f7822 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -238,8 +238,10 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; if (test_facility(40)) S390_lowcore.machine_flags |= MACHINE_FLAG_LPP; - if (test_facility(50) && test_facility(73)) + if (test_facility(50) && test_facility(73)) { S390_lowcore.machine_flags |= MACHINE_FLAG_TE; + __ctl_set_bit(0, 55); + } if (test_facility(51)) S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; if (test_facility(129)) { diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 080c851dd9a5..cee658e27732 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -86,6 +86,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, memset(&p->thread.per_user, 0, sizeof(p->thread.per_user)); memset(&p->thread.per_event, 0, sizeof(p->thread.per_event)); clear_tsk_thread_flag(p, TIF_SINGLE_STEP); + p->thread.per_flags = 0; /* Initialize per thread user and system timer values */ p->thread.user_timer = 0; p->thread.guest_timer = 0; From d0e810eeb3d326978f248b8f0233a2f30f58c72d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 9 Nov 2017 23:00:14 +0100 Subject: [PATCH 1069/1085] s390/noexec: execute kexec datamover without DAT Rebooting into a new kernel with kexec fails (system dies) if tried on a machine that has no-execute support. Reason for this is that the so called datamover code gets executed with DAT on (MMU is active) and the page that contains the datamover is marked as non-executable. Therefore when branching into the datamover an unexpected program check happens and afterwards the machine is dead. This can be simply avoided by disabling DAT, which also disables any no-execute checks, just before the datamover gets executed. In fact the first thing done by the datamover is to disable DAT. The code in the datamover that disables DAT can be removed as well. Thanks to Michael Holzheu and Gerald Schaefer for tracking this down. 
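For reference, the mask in the one-line fix below works like this (an annotated copy of the added call, not new code):

	/*
	 * stnsm stores the current PSW system mask (PSW bits 0-7) and ANDs
	 * it with the given mask: 0xfb == 0b11111011 clears bit 5, the DAT
	 * bit, so the datamover runs in real mode and no no-execute
	 * protection is checked.
	 */
	__arch_local_irq_stnsm(0xfb);	/* disable DAT - avoid no-execute */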
Reviewed-by: Michael Holzheu Reviewed-by: Philipp Rudo Cc: Gerald Schaefer Cc: Martin Schwidefsky Fixes: 57d7f939e7bd ("s390: add no-execute support") Cc: # v4.11+ Signed-off-by: Heiko Carstens --- arch/s390/kernel/machine_kexec.c | 1 + arch/s390/kernel/relocate_kernel.S | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index c72e551e5951..4d91761f1177 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -265,6 +265,7 @@ static void __do_machine_kexec(void *data) s390_reset_system(); data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page); + __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */ /* Call the moving routine */ (*data_mover)(&image->head, image->start); diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index 4bdc65636603..a613581fea67 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -28,7 +28,6 @@ ENTRY(relocate_kernel) basr %r13,0 # base address .base: - stnsm sys_msk-.base(%r13),0xfb # disable DAT stctg %c0,%c15,ctlregs-.base(%r13) stmg %r0,%r15,gprregs-.base(%r13) lghi %r0,3 @@ -102,8 +101,6 @@ ENTRY(relocate_kernel) .align 8 load_psw: .long 0x00080000,0x80000000 - sys_msk: - .quad 0 ctlregs: .rept 16 .quad 0 From 364a5607d6988b05b54cc0138cf8229203b9a33e Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 10 Nov 2017 15:56:11 +0100 Subject: [PATCH 1070/1085] MAINTAINERS: add virtio-ccw.h to virtio/s390 section The file arch/s390/include/uapi/asm/virtio-ccw.h belongs to the s390 virtio drivers as well. Signed-off-by: Cornelia Huck Signed-off-by: Heiko Carstens --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6671f375f7fc..48c6f1179b9b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14312,6 +14312,7 @@ L: virtualization@lists.linux-foundation.org L: kvm@vger.kernel.org S: Supported F: drivers/s390/virtio/ +F: arch/s390/include/uapi/asm/virtio-ccw.h VIRTIO GPU DRIVER M: David Airlie From 881a0b993e9f065cbb3673c94c395fa1de24bdcc Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Tue, 31 Oct 2017 12:49:04 -0700 Subject: [PATCH 1071/1085] spi: imx: GPIO based chip selects should not be required The driver will fail to load if no gpio chip selects are specified, this patch changes this so that it no longer fails. It's possible to use all native chip selects, in which case there is no reason to have a gpio chip select array. This is what happens if the *optional* device tree property "cs-gpios" is omitted. The spi core already checks for the absence of gpio chip selects in the master and assigns any slaves the gpio_cs value of -ENOENT. Also have the driver respect the standard SPI device tree property "num-cs" to allow setting the number of chip selects without using cs-gpios. 
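As an illustration, a board that only uses native chip selects could now describe the controller like this (hypothetical device tree fragment; the ecspi2 label and the values are made up):

	&ecspi2 {
		num-cs = <2>;	/* two native chip selects, no cs-gpios */
		status = "okay";
	};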
CC: Mark Brown CC: Shawn Guo CC: Sascha Hauer CC: Fabio Estevam CC: Oleksij Rempel Signed-off-by: Trent Piepho Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 5ddd32ba2521..70dcc8ee1f6b 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1523,6 +1523,7 @@ static int spi_imx_probe(struct platform_device *pdev) spi_imx->devtype_data = devtype_data; + /* Get number of chip selects, either platform data or OF */ if (mxc_platform_info) { master->num_chipselect = mxc_platform_info->num_chipselect; master->cs_gpios = devm_kzalloc(&master->dev, @@ -1532,7 +1533,13 @@ static int spi_imx_probe(struct platform_device *pdev) for (i = 0; i < master->num_chipselect; i++) master->cs_gpios[i] = mxc_platform_info->chipselect[i]; - } + } else { + u32 num_cs; + + if (!of_property_read_u32(np, "num-cs", &num_cs)) + master->num_chipselect = num_cs; + /* If not preset, default value of 1 is used */ + } spi_imx->bitbang.chipselect = spi_imx_chipselect; spi_imx->bitbang.setup_transfer = spi_imx_setupxfer; @@ -1614,13 +1621,8 @@ static int spi_imx_probe(struct platform_device *pdev) master->dev.of_node = pdev->dev.of_node; - if (!spi_imx->slave_mode) { - if (!master->cs_gpios) { - dev_err(&pdev->dev, "No CS GPIOs available\n"); - ret = -EINVAL; - goto out_clk_put; - } - + /* Request GPIO CS lines, if any */ + if (!spi_imx->slave_mode && master->cs_gpios) { for (i = 0; i < master->num_chipselect; i++) { if (!gpio_is_valid(master->cs_gpios[i])) continue; From 4e21791e741c7c1b962c4a8327529f52310b9aac Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Tue, 31 Oct 2017 12:49:05 -0700 Subject: [PATCH 1072/1085] spi: imx: Fix failure path leak on GPIO request error If the code that requests any chip select GPIOs fails, the cleanup of spi_bitbang_start() by calling spi_bitbang_stop() is not done. Add this to the failure path. Note that spi_bitbang_start() has to be called before requesting GPIOs because the GPIO data in the spi master is populated when the master is registered, and that doesn't happen until spi_bitbang_start() is called. CC: Shawn Guo CC: Sascha Hauer CC: Fabio Estevam CC: Mark Brown CC: Oleksij Rempel Signed-off-by: Trent Piepho Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 70dcc8ee1f6b..fdbbdac66839 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1633,7 +1633,7 @@ static int spi_imx_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "Can't get CS GPIO %i\n", master->cs_gpios[i]); - goto out_clk_put; + goto out_spi_bitbang; } } } @@ -1650,6 +1650,8 @@ static int spi_imx_probe(struct platform_device *pdev) clk_disable(spi_imx->clk_per); return ret; +out_spi_bitbang: + spi_bitbang_stop(&spi_imx->bitbang); out_clk_put: clk_disable_unprepare(spi_imx->clk_ipg); out_put_per: From ffd4db9e10fdb431b559af052c17015682bbe2aa Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Tue, 31 Oct 2017 12:49:06 -0700 Subject: [PATCH 1073/1085] spi: imx: Don't require platform data chipselect array If the array is not present, assume all chip selects are native. This is the standard behavior for SPI masters configured via the device tree and the behavior of this driver as well when it is configured via device tree.
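To make the effect concrete, a platform-data sketch (assuming the struct spi_imx_master type from include/linux/platform_data/spi-imx.h; the values are made up and the board registration call is omitted):

	/* All chip selects are native: no GPIO array needed any more. */
	static struct spi_imx_master board_spi_pdata = {
		.chipselect	= NULL,	/* was mandatory before this patch */
		.num_chipselect	= 2,
	};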
This reduces platform data vs DT differences and allows most of the platform data based boards to remove their chip select arrays. CC: Shawn Guo CC: Sascha Hauer CC: Fabio Estevam CC: Mark Brown Signed-off-by: Trent Piepho Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index fdbbdac66839..79ddefe4180d 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1526,13 +1526,15 @@ static int spi_imx_probe(struct platform_device *pdev) /* Get number of chip selects, either platform data or OF */ if (mxc_platform_info) { master->num_chipselect = mxc_platform_info->num_chipselect; - master->cs_gpios = devm_kzalloc(&master->dev, - sizeof(int) * master->num_chipselect, GFP_KERNEL); - if (!master->cs_gpios) - return -ENOMEM; + if (mxc_platform_info->chipselect) { + master->cs_gpios = devm_kzalloc(&master->dev, + sizeof(int) * master->num_chipselect, GFP_KERNEL); + if (!master->cs_gpios) + return -ENOMEM; - for (i = 0; i < master->num_chipselect; i++) - master->cs_gpios[i] = mxc_platform_info->chipselect[i]; + for (i = 0; i < master->num_chipselect; i++) + master->cs_gpios[i] = mxc_platform_info->chipselect[i]; + } } else { u32 num_cs; From 2244645ab194fe45ffcbaa08f235c8f0c7fb54fc Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Fri, 10 Nov 2017 11:16:24 -0800 Subject: [PATCH 1074/1085] x86/intel_rdt: Fix a silent failure when writing zero value schemata Writing an invalid schemata with no domain values (e.g., "(L3|MB):") results in a silent failure, i.e. the last_cmd_status returns OK. Check for an empty value, set the result string with a proper error message and return -EINVAL. Before the fix: # mkdir /sys/fs/resctrl/p1 # echo "L3:" > /sys/fs/resctrl/p1/schemata (silent failure) # cat /sys/fs/resctrl/info/last_cmd_status ok # echo "MB:" > /sys/fs/resctrl/p1/schemata (silent failure) # cat /sys/fs/resctrl/info/last_cmd_status ok After the fix: # mkdir /sys/fs/resctrl/p1 # echo "L3:" > /sys/fs/resctrl/p1/schemata -bash: echo: write error: Invalid argument # cat /sys/fs/resctrl/info/last_cmd_status Missing 'L3' value # echo "MB:" > /sys/fs/resctrl/p1/schemata -bash: echo: write error: Invalid argument # cat /sys/fs/resctrl/info/last_cmd_status Missing 'MB' value [ Tony: This is an unintended side effect of the patch earlier to allow the user to just write the value they want to change. While allowing the user to specify less than all of the values, it also allows an empty value.
] Fixes: c4026b7b95a4 ("x86/intel_rdt: Implement "update" mode when writing schemata file") Signed-off-by: Xiaochen Shen Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Vikas Shivappa Cc: Fenghua Yu Link: https://lkml.kernel.org/r/20171110191624.20280-1-tony.luck@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 30aeb267cbd2..23e1d5c249c6 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -257,6 +257,11 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, ret = -EINVAL; goto out; } + if (tok[0] == '\0') { + rdt_last_cmd_printf("Missing '%s' value\n", resname); + ret = -EINVAL; + goto out; + } ret = rdtgroup_parse_resource(resname, tok, closid); if (ret) goto out; From d00a08cf9ee986ad6689ce8c6fd176aff679c106 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 12 Nov 2017 13:02:51 +0100 Subject: [PATCH 1075/1085] irq/work: Use llist_for_each_entry_safe The llist_for_each_entry() loop in irq_work_run_list() is unsafe because once the work's PENDING bit is cleared it can be requeued on another CPU. Use llist_for_each_entry_safe() instead. Fixes: 16c0890dc66d ("irq/work: Don't reinvent the wheel but use existing llist API") Reported-by: Chris Wilson Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Cc: Byungchul Park Cc: Peter Zijlstra Cc: Petri Latvala Link: http://lkml.kernel.org/r/151027307351.14762.4611888896020658384@mail.alporthouse.com --- kernel/irq_work.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e2ebe8c71e8f..6647b33f7eb0 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -128,9 +128,9 @@ bool irq_work_needs_cpu(void) static void irq_work_run_list(struct llist_head *list) { - unsigned long flags; - struct irq_work *work; + struct irq_work *work, *tmp; struct llist_node *llnode; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -138,7 +138,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry(work, llnode, llnode) { + llist_for_each_entry_safe(work, tmp, llnode, llnode) { /* * Clear the PENDING bit, after this point the @work * can be re-used. From df27067e6040b51188184876253d93da002433aa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 10 Nov 2017 16:25:04 +0100 Subject: [PATCH 1076/1085] pstore: Use ktime_get_real_fast_ns() instead of __getnstimeofday() __getnstimeofday() is a rather odd interface, with a number of quirks: - The caller may come from NMI context, but the implementation is not NMI safe; one way to get there from NMI is NMI handler: something bad panic() kmsg_dump() pstore_dump() pstore_record_init() __getnstimeofday() - The calling conventions are different from any other timekeeping functions, to deal with returning an error code during suspended timekeeping. Address the above issues by using a completely different method to get the time: ktime_get_real_fast_ns() is NMI safe and has a reasonable behavior when timekeeping is suspended: it returns the time at which it got suspended. As Thomas Gleixner explained, this is safe, as ktime_get_real_fast_ns() does not call into the clocksource driver that might be suspended. The result can easily be transformed into a timespec structure. Since ktime_get_real_fast_ns() was not exported to modules, add the export.
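In code terms the conversion is small; both forms below are taken from the patch itself:

	/* before: can fail in NMI context or while timekeeping is suspended */
	if (__getnstimeofday(&record->time)) {
		record->time.tv_sec = 0;
		record->time.tv_nsec = 0;
	}

	/* after: NMI-safe; while suspended it reports the time of suspend */
	record->time = ns_to_timespec(ktime_get_real_fast_ns());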
The pstore behavior for the suspended case changes slightly, as it now stores the timestamp at which timekeeping was suspended instead of storing a zero timestamp. This change is not addressing y2038-safety, that's subject to a more complex follow up patch. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Cc: Tony Luck Cc: Anton Vorontsov Cc: Stephen Boyd Cc: John Stultz Cc: Colin Cross Link: https://lkml.kernel.org/r/20171110152530.1926955-1-arnd@arndb.de --- fs/pstore/platform.c | 5 +---- kernel/time/timekeeping.c | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index ec7199e859d2..086e491faf04 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -482,10 +482,7 @@ void pstore_record_init(struct pstore_record *record, record->psi = psinfo; /* Report zeroed timestamp if called before timekeeping has resumed. */ - if (__getnstimeofday(&record->time)) { - record->time.tv_sec = 0; - record->time.tv_nsec = 0; - } + record->time = ns_to_timespec(ktime_get_real_fast_ns()); } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 353f7bd1eeb0..198afa78bf69 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -528,6 +528,7 @@ u64 ktime_get_real_fast_ns(void) { return __ktime_get_real_fast_ns(&tk_fast_mono); } +EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. From b24591e2fcf852ad7ad2ccf745c8220bf378d312 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 9 Nov 2017 12:35:07 +0000 Subject: [PATCH 1077/1085] timers: Add a function to start/reduce a timer Add a function, similar to mod_timer(), that will start a timer if it isn't running and will modify it if it is running and has an expiry time longer than the new time. If the timer is running with an expiry time that's the same or sooner, no change is made. The function looks like: int timer_reduce(struct timer_list *timer, unsigned long expires); This can be used by code such as networking code to make it easier to share a timer for multiple timeouts. For instance, in upcoming AF_RXRPC code, the rxrpc_call struct will maintain a number of timeouts: unsigned long ack_at; unsigned long resend_at; unsigned long ping_at; unsigned long expect_rx_by; unsigned long expect_req_by; unsigned long expect_term_by; each of which is set independently of the others. With timer reduction available, when the code needs to set one of the timeouts, it only needs to look at that timeout and then call timer_reduce() to modify the timer, starting it or bringing it forward if necessary. There is no need to refer to the other timeouts to see which is earliest and no need to take any lock other than, potentially, the timer lock inside timer_reduce(). Note, that this does not protect against concurrent invocations of any of the timer functions. 
As an example, the expect_rx_by timeout above, which terminates a call if we don't get a packet from the server within a certain time window, would be set something like this: unsigned long now = jiffies; unsigned long expect_rx_by = now + packet_receive_timeout; WRITE_ONCE(call->expect_rx_by, expect_rx_by); timer_reduce(&call->timer, expect_rx_by); The timer service code (which might, say, be in a work function) would then check all the timeouts to see which, if any, had triggered, deal with those: t = READ_ONCE(call->ack_at); if (time_after_eq(now, t)) { cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET); set_bit(RXRPC_CALL_EV_ACK, &call->events); } and then restart the timer if necessary by finding the soonest timeout that hasn't yet passed and then calling timer_reduce(). The disadvantage of doing things this way rather than comparing the timers each time and calling mod_timer() is that you *will* take timer events unless you can finish what you're doing and delete the timer in time. The advantage of doing things this way is that you don't need to use a lock to work out when the next timer should be set, other than the timer's own lock - which you might not have to take. [ tglx: Fixed weird formatting and adopted it to pending changes ] Signed-off-by: David Howells Signed-off-by: Thomas Gleixner Cc: keyrings@vger.kernel.org Cc: linux-afs@lists.infradead.org Link: https://lkml.kernel.org/r/151023090769.23050.1801643667223880753.stgit@warthog.procyon.org.uk --- include/linux/timer.h | 1 + kernel/time/timer.c | 45 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 9f8895decb82..37b5e2f74d21 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -203,6 +203,7 @@ extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); +extern int timer_reduce(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer diff --git a/kernel/time/timer.c b/kernel/time/timer.c index fbb1f85327bf..af0b8bae4502 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -929,8 +929,11 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, } } +#define MOD_TIMER_PENDING_ONLY 0x01 +#define MOD_TIMER_REDUCE 0x02 + static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) +__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; @@ -950,7 +953,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * larger granularity than you would get from adding a new * timer with this expiry. 
*/ - if (timer->expires == expires) + long diff = timer->expires - expires; + + if (!diff) + return 1; + if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* @@ -962,6 +969,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) base = lock_timer_base(timer, &flags); forward_timer_base(base); + if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && + time_before_eq(timer->expires, expires)) { + ret = 1; + goto out_unlock; + } + clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -971,7 +984,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { - timer->expires = expires; + if (!(options & MOD_TIMER_REDUCE)) + timer->expires = expires; + else if (time_after(timer->expires, expires)) + timer->expires = expires; ret = 1; goto out_unlock; } @@ -981,7 +997,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } ret = detach_if_pending(timer, base, false); - if (!ret && pending_only) + if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; debug_activate(timer, expires); @@ -1042,7 +1058,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true); + return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); @@ -1068,10 +1084,25 @@ EXPORT_SYMBOL(mod_timer_pending); */ int mod_timer(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, false); + return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); +/** + * timer_reduce - Modify a timer's timeout if it would reduce the timeout + * @timer: The timer to be modified + * @expires: New timeout in jiffies + * + * timer_reduce() is very similar to mod_timer(), except that it will only + * modify a running timer if that would reduce the expiration time (it will + * start a timer that isn't running). + */ +int timer_reduce(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, MOD_TIMER_REDUCE); +} +EXPORT_SYMBOL(timer_reduce); + /** * add_timer - start a timer * @timer: the timer to be added @@ -1754,7 +1785,7 @@ signed long __sched schedule_timeout(signed long timeout) timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, false); + __mod_timer(&timer.timer, expire, 0); schedule(); del_singleshot_timer_sync(&timer.timer); From 6714796edcce27f7a1845e2f79783cd51bb4799b Mon Sep 17 00:00:00 2001 From: Wen Yaxng Date: Wed, 8 Nov 2017 09:55:03 +0800 Subject: [PATCH 1078/1085] genirq/proc: Return proper error code when irq_set_affinity() fails write_irq_affinity() returns the number of written bytes, which means success, unconditionally whether the actual irq_set_affinity() call succeeded or not. Add proper error handling and pass the error code returned from irq_set_affinity() back to user space in case of failure. 
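For example, a rejected affinity write is now reported instead of silently ignored (hypothetical IRQ number; the exact errno depends on what irq_set_affinity() returns):

	# echo 8 > /proc/irq/17/smp_affinity
	bash: echo: write error: Invalid argument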
[ tglx: Fixed coding style and massaged changelog ] Signed-off-by: Wen Yang Signed-off-by: Thomas Gleixner Reviewed-by: Jiang Biao Cc: zhong.weidong@zte.com.cn Link: https://lkml.kernel.org/r/1510106103-184761-1-git-send-email-wen.yang99@zte.com.cn --- kernel/irq/proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6376b4a598d3..29d6f520a9fb 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -154,8 +154,9 @@ static ssize_t write_irq_affinity(int type, struct file *file, */ err = irq_select_affinity_usr(irq) ? -EINVAL : count; } else { - irq_set_affinity(irq, new_value); - err = count; + err = irq_set_affinity(irq, new_value); + if (!err) + err = count; } free_cpumask: From 306eb5a38dfc1a4c8a5c910c9844da6a97f2b9a4 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Sun, 12 Nov 2017 22:29:03 +0100 Subject: [PATCH 1079/1085] irqdomain: Drop pointless NULL check in virq_debug_show_one data has already been dereferenced unconditionally, so it's pointless to do a NULL pointer check on it afterwards. Drop it. [ tglx: Depersonify changelog. ] Signed-off-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20171112212904.28574-1-linux@rasmusvillemoes.dk --- kernel/irq/irqdomain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index fbbf34293b17..4f4f60015e8a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -921,8 +921,7 @@ static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) chip = irq_data_get_irq_chip(data); seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - seq_printf(m, data ? "0x%p " : " %p ", - irq_data_get_irq_chip_data(data)); + seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data)); seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); From ffc661c99f621152d5fdcf53f9df0d48c326318b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 30 Oct 2017 22:35:47 +0100 Subject: [PATCH 1080/1085] genirq: Fix type of shifting literal 1 in __setup_irq() If ffz() ever returns a value >= 31 then the following shift is undefined behaviour because the literal 1 which gets shifted is treated as a signed integer. In practice, the bug is probably harmless, since the first undefined shift count is 31 which results - ignoring UB - in (int)(0x80000000). This gets sign-extended so bits 32-63 will be set as well and all subsequent __setup_irq() calls would just end up hitting the -EBUSY branch. However, a sufficiently aggressive optimizer may use the UB of 1<<31 to decide that doesn't happen, and hence elide the sign-extension code, so that subsequent calls can indeed get ffz > 31. In any case, the right thing to do is to make the literal 1UL. [ tglx: For this to happen a single interrupt would have to be shared by 32 devices. Hardware like that does not exist and would have way more problems than that.
] Signed-off-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171030213548.16831-1-linux@rasmusvillemoes.dk --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c65f282cbc9a..6a1bf9dc7a6a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1289,7 +1289,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ - new->thread_mask = 1 << ffz(thread_mask); + new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { From fcdfafcb73be8fa45909327bbddca46fb362a675 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 13 Nov 2017 07:59:10 +0100 Subject: [PATCH 1081/1085] kprobes: Don't spam the build log with deprecation warnings The jprobes APIs are deprecated - but are still in occasional use for code that few people seem to care about, so stop generating deprecation warnings. Cc: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 56b2e698dbad..9440a2fc8893 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -468,18 +468,18 @@ static inline int enable_kprobe(struct kprobe *kp) return -ENOSYS; } #endif /* CONFIG_KPROBES */ -static inline int __deprecated register_jprobe(struct jprobe *p) +static inline int register_jprobe(struct jprobe *p) { return -ENOSYS; } -static inline int __deprecated register_jprobes(struct jprobe **jps, int num) +static inline int register_jprobes(struct jprobe **jps, int num) { return -ENOSYS; } -static inline void __deprecated unregister_jprobe(struct jprobe *p) +static inline void unregister_jprobe(struct jprobe *p) { } -static inline void __deprecated unregister_jprobes(struct jprobe **jps, int num) +static inline void unregister_jprobes(struct jprobe **jps, int num) { } static inline int disable_kretprobe(struct kretprobe *rp) @@ -490,11 +490,11 @@ static inline int enable_kretprobe(struct kretprobe *rp) { return enable_kprobe(&rp->kp); } -static inline int __deprecated disable_jprobe(struct jprobe *jp) +static inline int disable_jprobe(struct jprobe *jp) { return -ENOSYS; } -static inline int __deprecated enable_jprobe(struct jprobe *jp) +static inline int enable_jprobe(struct jprobe *jp) { return -ENOSYS; } From f4c09f87adfe31587aa4b2aea2cb2dbde2150f54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Nov 2017 09:39:01 +0100 Subject: [PATCH 1082/1085] cpu/hotplug: Get rid of CPU hotplug notifier leftovers The CPU hotplug notifiers are history. Remove the last reminders. 
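For anyone converting leftover notifier users, the replacement is the cpuhp state machine; a minimal sketch (the myfeature names are made up):

	#include <linux/cpuhotplug.h>

	static int myfeature_online(unsigned int cpu)
	{
		/* work formerly done on CPU_ONLINE */
		return 0;
	}

	static int myfeature_dead(unsigned int cpu)
	{
		/* work formerly done on CPU_DEAD */
		return 0;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "myfeature:online",
				myfeature_online, myfeature_dead);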
Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- .../fault-injection/notifier-error-inject.txt | 30 ------------------- .../power/suspend-and-cpuhotplug.txt | 9 +++--- include/linux/cpu.h | 25 ++++++---------- 3 files changed, 13 insertions(+), 51 deletions(-) diff --git a/Documentation/fault-injection/notifier-error-inject.txt b/Documentation/fault-injection/notifier-error-inject.txt index 83d3f4e43e91..e861d761de24 100644 --- a/Documentation/fault-injection/notifier-error-inject.txt +++ b/Documentation/fault-injection/notifier-error-inject.txt @@ -6,41 +6,11 @@ specified notifier chain callbacks. It is useful to test the error handling of notifier call chain failures which is rarely executed. There are kernel modules that can be used to test the following notifiers. - * CPU notifier * PM notifier * Memory hotplug notifier * powerpc pSeries reconfig notifier * Netdevice notifier -CPU notifier error injection module ------------------------------------ -This feature can be used to test the error handling of the CPU notifiers by -injecting artificial errors to CPU notifier chain callbacks. - -If the notifier call chain should be failed with some events notified, write -the error code to debugfs interface -/sys/kernel/debug/notifier-error-inject/cpu/actions//error - -Possible CPU notifier events to be failed are: - - * CPU_UP_PREPARE - * CPU_UP_PREPARE_FROZEN - * CPU_DOWN_PREPARE - * CPU_DOWN_PREPARE_FROZEN - -Example1: Inject CPU offline error (-1 == -EPERM) - - # cd /sys/kernel/debug/notifier-error-inject/cpu - # echo -1 > actions/CPU_DOWN_PREPARE/error - # echo 0 > /sys/devices/system/cpu/cpu1/online - bash: echo: write error: Operation not permitted - -Example2: inject CPU online error (-2 == -ENOENT) - - # echo -2 > actions/CPU_UP_PREPARE/error - # echo 1 > /sys/devices/system/cpu/cpu1/online - bash: echo: write error: No such file or directory - PM notifier error injection module ---------------------------------- This feature is controlled through debugfs interface diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt index 2fc909502db5..31abd04b9572 100644 --- a/Documentation/power/suspend-and-cpuhotplug.txt +++ b/Documentation/power/suspend-and-cpuhotplug.txt @@ -232,7 +232,7 @@ d. Handling microcode update during suspend/hibernate: hibernate/restore cycle.] In the current design of the kernel however, during a CPU offline operation - as part of the suspend/hibernate cycle (the CPU_DEAD_FROZEN notification), + as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set), the existing copy of microcode image in the kernel is not freed up. And during the CPU online operations (during resume/restore), since the kernel finds that it already has copies of the microcode images for all the @@ -252,10 +252,9 @@ Yes, they are listed below: the _cpu_down() and _cpu_up() functions is *always* 0. This might not reflect the true current state of the system, since the tasks could have been frozen by an out-of-band event such as a suspend - operation in progress. Hence, it will lead to wrong notifications being - sent during the cpu online/offline events (eg, CPU_ONLINE notification - instead of CPU_ONLINE_FROZEN) which in turn will lead to execution of - inappropriate code by the callbacks registered for such CPU hotplug events. + operation in progress. Hence, the cpuhp_tasks_frozen variable will not + reflect the frozen state and the CPU hotplug callbacks which evaluate + that variable might execute the wrong code path. 2. 
If a regular CPU hotplug stress test happens to race with the freezer due to a suspend operation in progress at the same time, then we could hit the diff --git a/include/linux/cpu.h b/include/linux/cpu.h index cd4771b772c0..b6e4a598b2cd 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -55,24 +55,17 @@ extern void unregister_cpu(struct cpu *cpu); extern ssize_t arch_cpu_probe(const char *, size_t); extern ssize_t arch_cpu_release(const char *, size_t); #endif -struct notifier_block; -#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ -#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ -#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ -#define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug - * lock is dropped */ -#define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly, - * perhaps due to preemption. */ - -/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend - * operation in progress +/* + * These states are not related to the core CPU hotplug mechanism. They are + * used by various (sub)architectures to track internal state */ -#define CPU_TASKS_FROZEN 0x0010 - -#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN) -#define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN) -#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) +#define CPU_ONLINE 0x0002 /* CPU is up */ +#define CPU_UP_PREPARE 0x0003 /* CPU coming up */ +#define CPU_DEAD 0x0007 /* CPU dead */ +#define CPU_DEAD_FROZEN 0x0008 /* CPU timed out on unplug */ +#define CPU_POST_DEAD 0x0009 /* CPU successfully unplugged */ +#define CPU_BROKEN 0x000B /* CPU did not die properly */ #ifdef CONFIG_SMP extern bool cpuhp_tasks_frozen; From 01c313dded34a16ef69e3972ceca687ba8a7cdf2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Nov 2017 17:50:59 +0100 Subject: [PATCH 1083/1085] kallsyms: fix building without printk Building kallsyms fails without CONFIG_PRINTK due to a missing declaration: kernel/kallsyms.c: In function 'kallsyms_show_value': kernel/kallsyms.c:670:10: error: 'kptr_restrict' undeclared (first use in this function); did you mean 'keyring_restrict'? This moves the declaration outside of the #ifdef guard; the definition is already available without CONFIG_PRINTK.
Fixes: c0f3ea158939 ("stop using '%pK' for /proc/kallsyms pointer values") Signed-off-by: Arnd Bergmann [ I clearly need to start doing "allnoconfig" builds too, or just have a test branch for the 0day robot - Linus ] Signed-off-by: Linus Torvalds --- include/linux/printk.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index 335926039adc..905bba92f015 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -189,7 +189,6 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, extern int printk_delay_msec; extern int dmesg_restrict; -extern int kptr_restrict; extern int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, @@ -280,6 +279,8 @@ static inline void printk_safe_flush_on_panic(void) } #endif +extern int kptr_restrict; + extern asmlinkage void dump_stack(void) __cold; #ifndef pr_fmt From e4a8ca3baa5557fa54557d42b5910ed0d3316922 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Nov 2017 17:51:00 +0100 Subject: [PATCH 1084/1085] /proc/module: fix building without kallsyms As reported by kernelci and other build bots, we now get a link failure without CONFIG_KALLSYMS: module.c:(.text+0xf2c): undefined reference to `kallsyms_show_value' This adds a dummy helper with the same name that can be used for compilation. It's not entirely clear to me what this should return for !CONFIG_KALLSYMS, I picked an unconditional 'false', which leads to the module address being unavailable to user space. Link: https://kernelci.org/build/mainline/branch/master/kernel/v4.14-5-g516fb7f2e73d/ Fixes: 516fb7f2e73d ("/proc/module: use the same logic as /proc/kallsyms for address exposure") Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 0a777c5216b1..708f337d780b 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -14,8 +14,6 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) -/* How and when do we show kallsyms values? */ -extern int kallsyms_show_value(void); #ifndef CONFIG_64BIT # define KALLSYM_FMT "%08lx" #else @@ -54,6 +52,9 @@ extern void __print_symbol(const char *fmt, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); +/* How and when do we show kallsyms values? */ +extern int kallsyms_show_value(void); + #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) @@ -112,6 +113,11 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u return -ERANGE; } +static inline int kallsyms_show_value(void) +{ + return false; +} + /* Stupid that this does nothing, but I didn't create this mess. */ #define __print_symbol(fmt, addr) #endif /*CONFIG_KALLSYMS*/ From b29c6ef7bb1257853c1e31616d84f55e561cf631 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 13 Nov 2017 02:15:39 +0100 Subject: [PATCH 1085/1085] x86 / CPU: Avoid unnecessary IPIs in arch_freq_get_on_cpu() Even though aperfmperf_snapshot_khz() caches the samples.khz value to return if called again in a sufficiently short time, its caller, arch_freq_get_on_cpu(), still uses smp_call_function_single() to run it which may allow user space to trigger an IPI storm by reading from the scaling_cur_freq cpufreq sysfs file in a tight loop. To avoid that, move the decision on whether or not to return the cached samples.khz value to arch_freq_get_on_cpu(). This change was part of commit 941f5f0f6ef5 ("x86: CPU: Fix up "cpu MHz" in /proc/cpuinfo"), but it was not the reason for the revert and it remains applicable. Fixes: 4815d3c56d1e (cpufreq: x86: Make scaling_cur_freq behave more as expected) Signed-off-by: Rafael J. Wysocki Reviewed-by: WANG Chao Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/aperfmperf.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 0ee83321a313..957813e0180d 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -42,10 +42,6 @@ static void aperfmperf_snapshot_khz(void *dummy) s64 time_delta = ktime_ms_delta(now, s->time); unsigned long flags; - /* Don't bother re-computing within the cache threshold time. */ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return; - local_irq_save(flags); rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); @@ -74,6 +70,7 @@ static void aperfmperf_snapshot_khz(void *dummy) unsigned int arch_freq_get_on_cpu(int cpu) { + s64 time_delta; unsigned int khz; if (!cpu_khz) @@ -82,6 +79,12 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!static_cpu_has(X86_FEATURE_APERFMPERF)) return 0; + /* Don't bother re-computing within the cache threshold time. */ + time_delta = ktime_ms_delta(ktime_get(), per_cpu(samples.time, cpu)); + khz = per_cpu(samples.khz, cpu); + if (khz && time_delta < APERFMPERF_CACHE_THRESHOLD_MS) + return khz; + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); khz = per_cpu(samples.khz, cpu); if (khz)